34
34
35
35
/*
36
36
* longest - longest-preferred matching engine
37
+ *
38
+ * On success, returns match endpoint address. Returns NULL on no match.
39
+ * Internal errors also return NULL, with v->err set.
37
40
*/
38
- static chr * /* endpoint, or NULL */
39
- longest (struct vars * v , /* used only for debug and exec flags */
41
+ static chr *
42
+ longest (struct vars * v ,
40
43
struct dfa * d ,
41
44
chr * start , /* where the match should start */
42
45
chr * stop , /* match must end at or before here */
@@ -51,11 +54,15 @@ longest(struct vars * v, /* used only for debug and exec flags */
51
54
int i ;
52
55
struct colormap * cm = d -> cm ;
53
56
57
+ /* prevent "uninitialized variable" warnings */
58
+ if (hitstopp != NULL )
59
+ * hitstopp = 0 ;
60
+
54
61
/* initialize */
55
62
css = initialize (v , d , start );
63
+ if (css == NULL )
64
+ return NULL ;
56
65
cp = start ;
57
- if (hitstopp != NULL )
58
- * hitstopp = 0 ;
59
66
60
67
/* startup */
61
68
FDEBUG (("+++ startup +++\n" ));
@@ -74,8 +81,14 @@ longest(struct vars * v, /* used only for debug and exec flags */
74
81
return NULL ;
75
82
css -> lastseen = cp ;
76
83
77
- /* main loop */
84
+ /*
85
+ * This is the main text-scanning loop. It seems worth having two copies
86
+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
87
+ * builds, when you're not actively tracing.
88
+ */
89
+ #ifdef REG_DEBUG
78
90
if (v -> eflags & REG_FTRACE )
91
+ {
79
92
while (cp < realstop )
80
93
{
81
94
FDEBUG (("+++ at c%d +++\n" , (int ) (css - d -> ssets )));
@@ -92,7 +105,10 @@ longest(struct vars * v, /* used only for debug and exec flags */
92
105
ss -> lastseen = cp ;
93
106
css = ss ;
94
107
}
108
+ }
95
109
else
110
+ #endif
111
+ {
96
112
while (cp < realstop )
97
113
{
98
114
co = GETCOLOR (cm , * cp );
@@ -107,6 +123,10 @@ longest(struct vars * v, /* used only for debug and exec flags */
107
123
ss -> lastseen = cp ;
108
124
css = ss ;
109
125
}
126
+ }
127
+
128
+ if (ISERR ())
129
+ return NULL ;
110
130
111
131
/* shutdown */
112
132
FDEBUG (("+++ shutdown at c%d +++\n" , (int ) (css - d -> ssets )));
@@ -117,6 +137,8 @@ longest(struct vars * v, /* used only for debug and exec flags */
117
137
co = d -> cnfa -> eos [(v -> eflags & REG_NOTEOL ) ? 0 : 1 ];
118
138
FDEBUG (("color %ld\n" , (long ) co ));
119
139
ss = miss (v , d , css , co , cp , start );
140
+ if (ISERR ())
141
+ return NULL ;
120
142
/* special case: match ended at eol? */
121
143
if (ss != NULL && (ss -> flags & POSTSTATE ))
122
144
return cp ;
@@ -138,14 +160,17 @@ longest(struct vars * v, /* used only for debug and exec flags */
138
160
139
161
/*
140
162
* shortest - shortest-preferred matching engine
163
+ *
164
+ * On success, returns match endpoint address. Returns NULL on no match.
165
+ * Internal errors also return NULL, with v->err set.
141
166
*/
142
- static chr * /* endpoint, or NULL */
167
+ static chr *
143
168
shortest (struct vars * v ,
144
169
struct dfa * d ,
145
170
chr * start , /* where the match should start */
146
171
chr * min , /* match must end at or after here */
147
172
chr * max , /* match must end at or before here */
148
- chr * * coldp , /* store coldstart pointer here, if nonNULL */
173
+ chr * * coldp , /* store coldstart pointer here, if non-NULL */
149
174
int * hitstopp ) /* record whether hit v->stop, if non-NULL */
150
175
{
151
176
chr * cp ;
@@ -156,11 +181,17 @@ shortest(struct vars * v,
156
181
struct sset * ss ;
157
182
struct colormap * cm = d -> cm ;
158
183
184
+ /* prevent "uninitialized variable" warnings */
185
+ if (coldp != NULL )
186
+ * coldp = NULL ;
187
+ if (hitstopp != NULL )
188
+ * hitstopp = 0 ;
189
+
159
190
/* initialize */
160
191
css = initialize (v , d , start );
192
+ if (css == NULL )
193
+ return NULL ;
161
194
cp = start ;
162
- if (hitstopp != NULL )
163
- * hitstopp = 0 ;
164
195
165
196
/* startup */
166
197
FDEBUG (("--- startup ---\n" ));
@@ -180,8 +211,14 @@ shortest(struct vars * v,
180
211
css -> lastseen = cp ;
181
212
ss = css ;
182
213
183
- /* main loop */
214
+ /*
215
+ * This is the main text-scanning loop. It seems worth having two copies
216
+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
217
+ * builds, when you're not actively tracing.
218
+ */
219
+ #ifdef REG_DEBUG
184
220
if (v -> eflags & REG_FTRACE )
221
+ {
185
222
while (cp < realmax )
186
223
{
187
224
FDEBUG (("--- at c%d ---\n" , (int ) (css - d -> ssets )));
@@ -200,7 +237,10 @@ shortest(struct vars * v,
200
237
if ((ss -> flags & POSTSTATE ) && cp >= realmin )
201
238
break ; /* NOTE BREAK OUT */
202
239
}
240
+ }
203
241
else
242
+ #endif
243
+ {
204
244
while (cp < realmax )
205
245
{
206
246
co = GETCOLOR (cm , * cp );
@@ -217,6 +257,7 @@ shortest(struct vars * v,
217
257
if ((ss -> flags & POSTSTATE ) && cp >= realmin )
218
258
break ; /* NOTE BREAK OUT */
219
259
}
260
+ }
220
261
221
262
if (ss == NULL )
222
263
return NULL ;
@@ -389,7 +430,7 @@ hash(unsigned *uv,
389
430
* initialize - hand-craft a cache entry for startup, otherwise get ready
390
431
*/
391
432
static struct sset *
392
- initialize (struct vars * v , /* used only for debug flags */
433
+ initialize (struct vars * v ,
393
434
struct dfa * d ,
394
435
chr * start )
395
436
{
@@ -402,6 +443,8 @@ initialize(struct vars * v, /* used only for debug flags */
402
443
else
403
444
{ /* no, must (re)build it */
404
445
ss = getvacant (v , d , start , start );
446
+ if (ss == NULL )
447
+ return NULL ;
405
448
for (i = 0 ; i < d -> wordsper ; i ++ )
406
449
ss -> states [i ] = 0 ;
407
450
BSET (ss -> states , d -> cnfa -> pre );
@@ -420,10 +463,20 @@ initialize(struct vars * v, /* used only for debug flags */
420
463
}
421
464
422
465
/*
423
- * miss - handle a cache miss
466
+ * miss - handle a stateset cache miss
467
+ *
468
+ * css is the current stateset, co is the color of the current input character,
469
+ * cp points to the character after that (which is where we may need to test
470
+ * LACONs). start does not affect matching behavior but is needed for pickss'
471
+ * heuristics about which stateset cache entry to replace.
472
+ *
473
+ * Ordinarily, returns the address of the next stateset (the one that is
474
+ * valid after consuming the input character). Returns NULL if no valid
475
+ * NFA states remain, i.e., we have a certain match failure.
476
+ * Internal errors also return NULL, with v->err set.
424
477
*/
425
- static struct sset * /* NULL if goes to empty set */
426
- miss (struct vars * v , /* used only for debug flags */
478
+ static struct sset *
479
+ miss (struct vars * v ,
427
480
struct dfa * d ,
428
481
struct sset * css ,
429
482
pcolor co ,
@@ -449,9 +502,23 @@ miss(struct vars * v, /* used only for debug flags */
449
502
}
450
503
FDEBUG (("miss\n" ));
451
504
452
- /* first, what set of states would we end up in? */
505
+ /*
506
+ * Checking for operation cancel in the inner text search loop seems
507
+ * unduly expensive. As a compromise, check during cache misses.
508
+ */
509
+ if (CANCEL_REQUESTED (v -> re ))
510
+ {
511
+ ERR (REG_CANCEL );
512
+ return NULL ;
513
+ }
514
+
515
+ /*
516
+ * What set of states would we end up in after consuming the co character?
517
+ * We first consider PLAIN arcs that consume the character, and then look
518
+ * to see what LACON arcs could be traversed after consuming it.
519
+ */
453
520
for (i = 0 ; i < d -> wordsper ; i ++ )
454
- d -> work [i ] = 0 ;
521
+ d -> work [i ] = 0 ; /* build new stateset bitmap in d->work */
455
522
ispost = 0 ;
456
523
noprogress = 1 ;
457
524
gotstate = 0 ;
@@ -468,22 +535,31 @@ miss(struct vars * v, /* used only for debug flags */
468
535
noprogress = 0 ;
469
536
FDEBUG (("%d -> %d\n" , i , ca -> to ));
470
537
}
471
- dolacons = (gotstate ) ? (cnfa -> flags & HASLACONS ) : 0 ;
538
+ if (!gotstate )
539
+ return NULL ; /* character cannot reach any new state */
540
+ dolacons = (cnfa -> flags & HASLACONS );
472
541
sawlacons = 0 ;
542
+ /* outer loop handles transitive closure of reachable-by-LACON states */
473
543
while (dolacons )
474
- { /* transitive closure */
544
+ {
475
545
dolacons = 0 ;
476
546
for (i = 0 ; i < d -> nstates ; i ++ )
477
547
if (ISBSET (d -> work , i ))
478
548
for (ca = cnfa -> states [i ]; ca -> co != COLORLESS ; ca ++ )
479
549
{
480
550
if (ca -> co < cnfa -> ncolors )
481
- continue ; /* NOTE CONTINUE */
482
- sawlacons = 1 ;
551
+ continue ; /* not a LACON arc */
483
552
if (ISBSET (d -> work , ca -> to ))
484
- continue ; /* NOTE CONTINUE */
553
+ continue ; /* arc would be a no-op anyway */
554
+ sawlacons = 1 ; /* this LACON affects our result */
485
555
if (!lacon (v , cnfa , cp , ca -> co ))
486
- continue ; /* NOTE CONTINUE */
556
+ {
557
+ if (ISERR ())
558
+ return NULL ;
559
+ continue ; /* LACON arc cannot be traversed */
560
+ }
561
+ if (ISERR ())
562
+ return NULL ;
487
563
BSET (d -> work , ca -> to );
488
564
dolacons = 1 ;
489
565
if (ca -> to == cnfa -> post )
@@ -493,11 +569,9 @@ miss(struct vars * v, /* used only for debug flags */
493
569
FDEBUG (("%d :> %d\n" , i , ca -> to ));
494
570
}
495
571
}
496
- if (!gotstate )
497
- return NULL ;
498
572
h = HASH (d -> work , d -> wordsper );
499
573
500
- /* next, is that in the cache? */
574
+ /* Is this stateset already in the cache? */
501
575
for (p = d -> ssets , i = d -> nssused ; i > 0 ; p ++ , i -- )
502
576
if (HIT (h , d -> work , p , d -> wordsper ))
503
577
{
@@ -507,6 +581,8 @@ miss(struct vars * v, /* used only for debug flags */
507
581
if (i == 0 )
508
582
{ /* nope, need a new cache entry */
509
583
p = getvacant (v , d , cp , start );
584
+ if (p == NULL )
585
+ return NULL ;
510
586
assert (p != css );
511
587
for (i = 0 ; i < d -> wordsper ; i ++ )
512
588
p -> states [i ] = d -> work [i ];
@@ -517,8 +593,15 @@ miss(struct vars * v, /* used only for debug flags */
517
593
/* lastseen to be dealt with by caller */
518
594
}
519
595
596
+ /*
597
+ * Link new stateset to old, unless a LACON affected the result, in which
598
+ * case we don't create the link. That forces future transitions across
599
+ * this same arc (same prior stateset and character color) to come through
600
+ * miss() again, so that we can recheck the LACON(s), which might or might
601
+ * not pass since context will be different.
602
+ */
520
603
if (!sawlacons )
521
- { /* lookahead conds. always cache miss */
604
+ {
522
605
FDEBUG (("c%d[%d]->c%d\n" ,
523
606
(int ) (css - d -> ssets ), co , (int ) (p - d -> ssets )));
524
607
css -> outs [co ] = p ;
@@ -562,11 +645,12 @@ lacon(struct vars * v,
562
645
563
646
/*
564
647
* getvacant - get a vacant state set
648
+ *
565
649
* This routine clears out the inarcs and outarcs, but does not otherwise
566
650
* clear the innards of the state set -- that's up to the caller.
567
651
*/
568
652
static struct sset *
569
- getvacant (struct vars * v , /* used only for debug flags */
653
+ getvacant (struct vars * v ,
570
654
struct dfa * d ,
571
655
chr * cp ,
572
656
chr * start )
@@ -578,6 +662,8 @@ getvacant(struct vars * v, /* used only for debug flags */
578
662
color co ;
579
663
580
664
ss = pickss (v , d , cp , start );
665
+ if (ss == NULL )
666
+ return NULL ;
581
667
assert (!(ss -> flags & LOCKED ));
582
668
583
669
/* clear out its inarcs, including self-referential ones */
@@ -635,7 +721,7 @@ getvacant(struct vars * v, /* used only for debug flags */
635
721
* pickss - pick the next stateset to be used
636
722
*/
637
723
static struct sset *
638
- pickss (struct vars * v , /* used only for debug flags */
724
+ pickss (struct vars * v ,
639
725
struct dfa * d ,
640
726
chr * cp ,
641
727
chr * start )
@@ -691,7 +777,6 @@ pickss(struct vars * v, /* used only for debug flags */
691
777
692
778
/* nobody's old enough?!? -- something's really wrong */
693
779
FDEBUG (("cannot find victim to replace!\n" ));
694
- assert (NOTREACHED );
695
780
ERR (REG_ASSERT );
696
- return d -> ssets ;
781
+ return NULL ;
697
782
}
0 commit comments