Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 9fe8fe9

Browse files
committed
Add some more query-cancel checks to regular expression matching.
Commit 9662143 added infrastructure to allow regular-expression operations to be terminated early in the event of SIGINT etc. However, fuzz testing by Greg Stark disclosed that there are still cases where regex compilation could run for a long time without noticing a cancel request. Specifically, the fixempties() phase never adds new states, only new arcs, so it doesn't hit the cancel check I'd put in newstate(). Add one to newarc() as well to cover that. Some experimentation of my own found that regex execution could also run for a long time despite a pending cancel. We'd put a high-level cancel check into cdissect(), but there was none inside the core text-matching routines longest() and shortest(). Ordinarily those inner loops are very very fast ... but in the presence of lookahead constraints, not so much. As a compromise, stick a cancel check into the stateset cache-miss function, which is enough to guarantee a cancel check at least once per lookahead constraint test. Making this work required more attention to error handling throughout the regex executor. Henry Spencer had apparently originally intended longest() and shortest() to be incapable of incurring errors while running, so neither they nor their subroutines had well-defined error reporting behaviors. However, that was already broken by the lookahead constraint feature, since lacon() can surely suffer an out-of-memory failure --- which, in the code as it stood, might never be reported to the user at all, but just silently be treated as a non-match of the lookahead constraint. Normalize all that by inserting explicit error tests as needed. I took the opportunity to add some more comments to the code, too. Back-patch to all supported branches, like the previous patch.
1 parent 558d4ad commit 9fe8fe9

File tree

3 files changed

+154
-31
lines changed

3 files changed

+154
-31
lines changed

src/backend/regex/regc_nfa.c

+12-1
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ newstate(struct nfa * nfa)
180180
/*
181181
* This is a handy place to check for operation cancel during regex
182182
* compilation, since no code path will go very long without making a new
183-
* state.
183+
* state or arc.
184184
*/
185185
if (CANCEL_REQUESTED(nfa->v->re))
186186
{
@@ -333,6 +333,17 @@ newarc(struct nfa * nfa,
333333

334334
assert(from != NULL && to != NULL);
335335

336+
/*
337+
* This is a handy place to check for operation cancel during regex
338+
* compilation, since no code path will go very long without making a new
339+
* state or arc.
340+
*/
341+
if (CANCEL_REQUESTED(nfa->v->re))
342+
{
343+
NERR(REG_CANCEL);
344+
return;
345+
}
346+
336347
/* check for duplicates */
337348
for (a = from->outs; a != NULL; a = a->outchain)
338349
if (a->to == to && a->co == co && a->type == t)

src/backend/regex/rege_dfa.c

+115-30
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,12 @@
3434

3535
/*
3636
* longest - longest-preferred matching engine
37+
*
38+
* On success, returns match endpoint address. Returns NULL on no match.
39+
* Internal errors also return NULL, with v->err set.
3740
*/
38-
static chr * /* endpoint, or NULL */
39-
longest(struct vars * v, /* used only for debug and exec flags */
41+
static chr *
42+
longest(struct vars * v,
4043
struct dfa * d,
4144
chr *start, /* where the match should start */
4245
chr *stop, /* match must end at or before here */
@@ -51,11 +54,15 @@ longest(struct vars * v, /* used only for debug and exec flags */
5154
int i;
5255
struct colormap *cm = d->cm;
5356

57+
/* prevent "uninitialized variable" warnings */
58+
if (hitstopp != NULL)
59+
*hitstopp = 0;
60+
5461
/* initialize */
5562
css = initialize(v, d, start);
63+
if (css == NULL)
64+
return NULL;
5665
cp = start;
57-
if (hitstopp != NULL)
58-
*hitstopp = 0;
5966

6067
/* startup */
6168
FDEBUG(("+++ startup +++\n"));
@@ -74,8 +81,14 @@ longest(struct vars * v, /* used only for debug and exec flags */
7481
return NULL;
7582
css->lastseen = cp;
7683

77-
/* main loop */
84+
/*
85+
* This is the main text-scanning loop. It seems worth having two copies
86+
* to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
87+
* builds, when you're not actively tracing.
88+
*/
89+
#ifdef REG_DEBUG
7890
if (v->eflags & REG_FTRACE)
91+
{
7992
while (cp < realstop)
8093
{
8194
FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
@@ -92,7 +105,10 @@ longest(struct vars * v, /* used only for debug and exec flags */
92105
ss->lastseen = cp;
93106
css = ss;
94107
}
108+
}
95109
else
110+
#endif
111+
{
96112
while (cp < realstop)
97113
{
98114
co = GETCOLOR(cm, *cp);
@@ -107,6 +123,10 @@ longest(struct vars * v, /* used only for debug and exec flags */
107123
ss->lastseen = cp;
108124
css = ss;
109125
}
126+
}
127+
128+
if (ISERR())
129+
return NULL;
110130

111131
/* shutdown */
112132
FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
@@ -117,6 +137,8 @@ longest(struct vars * v, /* used only for debug and exec flags */
117137
co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
118138
FDEBUG(("color %ld\n", (long) co));
119139
ss = miss(v, d, css, co, cp, start);
140+
if (ISERR())
141+
return NULL;
120142
/* special case: match ended at eol? */
121143
if (ss != NULL && (ss->flags & POSTSTATE))
122144
return cp;
@@ -138,14 +160,17 @@ longest(struct vars * v, /* used only for debug and exec flags */
138160

139161
/*
140162
* shortest - shortest-preferred matching engine
163+
*
164+
* On success, returns match endpoint address. Returns NULL on no match.
165+
* Internal errors also return NULL, with v->err set.
141166
*/
142-
static chr * /* endpoint, or NULL */
167+
static chr *
143168
shortest(struct vars * v,
144169
struct dfa * d,
145170
chr *start, /* where the match should start */
146171
chr *min, /* match must end at or after here */
147172
chr *max, /* match must end at or before here */
148-
chr **coldp, /* store coldstart pointer here, if nonNULL */
173+
chr **coldp, /* store coldstart pointer here, if non-NULL */
149174
int *hitstopp) /* record whether hit v->stop, if non-NULL */
150175
{
151176
chr *cp;
@@ -156,11 +181,17 @@ shortest(struct vars * v,
156181
struct sset *ss;
157182
struct colormap *cm = d->cm;
158183

184+
/* prevent "uninitialized variable" warnings */
185+
if (coldp != NULL)
186+
*coldp = NULL;
187+
if (hitstopp != NULL)
188+
*hitstopp = 0;
189+
159190
/* initialize */
160191
css = initialize(v, d, start);
192+
if (css == NULL)
193+
return NULL;
161194
cp = start;
162-
if (hitstopp != NULL)
163-
*hitstopp = 0;
164195

165196
/* startup */
166197
FDEBUG(("--- startup ---\n"));
@@ -180,8 +211,14 @@ shortest(struct vars * v,
180211
css->lastseen = cp;
181212
ss = css;
182213

183-
/* main loop */
214+
/*
215+
* This is the main text-scanning loop. It seems worth having two copies
216+
* to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
217+
* builds, when you're not actively tracing.
218+
*/
219+
#ifdef REG_DEBUG
184220
if (v->eflags & REG_FTRACE)
221+
{
185222
while (cp < realmax)
186223
{
187224
FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
@@ -200,7 +237,10 @@ shortest(struct vars * v,
200237
if ((ss->flags & POSTSTATE) && cp >= realmin)
201238
break; /* NOTE BREAK OUT */
202239
}
240+
}
203241
else
242+
#endif
243+
{
204244
while (cp < realmax)
205245
{
206246
co = GETCOLOR(cm, *cp);
@@ -217,6 +257,7 @@ shortest(struct vars * v,
217257
if ((ss->flags & POSTSTATE) && cp >= realmin)
218258
break; /* NOTE BREAK OUT */
219259
}
260+
}
220261

221262
if (ss == NULL)
222263
return NULL;
@@ -389,7 +430,7 @@ hash(unsigned *uv,
389430
* initialize - hand-craft a cache entry for startup, otherwise get ready
390431
*/
391432
static struct sset *
392-
initialize(struct vars * v, /* used only for debug flags */
433+
initialize(struct vars * v,
393434
struct dfa * d,
394435
chr *start)
395436
{
@@ -402,6 +443,8 @@ initialize(struct vars * v, /* used only for debug flags */
402443
else
403444
{ /* no, must (re)build it */
404445
ss = getvacant(v, d, start, start);
446+
if (ss == NULL)
447+
return NULL;
405448
for (i = 0; i < d->wordsper; i++)
406449
ss->states[i] = 0;
407450
BSET(ss->states, d->cnfa->pre);
@@ -420,10 +463,20 @@ initialize(struct vars * v, /* used only for debug flags */
420463
}
421464

422465
/*
423-
* miss - handle a cache miss
466+
* miss - handle a stateset cache miss
467+
*
468+
* css is the current stateset, co is the color of the current input character,
469+
* cp points to the character after that (which is where we may need to test
470+
* LACONs). start does not affect matching behavior but is needed for pickss'
471+
* heuristics about which stateset cache entry to replace.
472+
*
473+
* Ordinarily, returns the address of the next stateset (the one that is
474+
* valid after consuming the input character). Returns NULL if no valid
475+
* NFA states remain, ie we have a certain match failure.
476+
* Internal errors also return NULL, with v->err set.
424477
*/
425-
static struct sset * /* NULL if goes to empty set */
426-
miss(struct vars * v, /* used only for debug flags */
478+
static struct sset *
479+
miss(struct vars * v,
427480
struct dfa * d,
428481
struct sset * css,
429482
pcolor co,
@@ -449,9 +502,23 @@ miss(struct vars * v, /* used only for debug flags */
449502
}
450503
FDEBUG(("miss\n"));
451504

452-
/* first, what set of states would we end up in? */
505+
/*
506+
* Checking for operation cancel in the inner text search loop seems
507+
* unduly expensive. As a compromise, check during cache misses.
508+
*/
509+
if (CANCEL_REQUESTED(v->re))
510+
{
511+
ERR(REG_CANCEL);
512+
return NULL;
513+
}
514+
515+
/*
516+
* What set of states would we end up in after consuming the co character?
517+
* We first consider PLAIN arcs that consume the character, and then look
518+
* to see what LACON arcs could be traversed after consuming it.
519+
*/
453520
for (i = 0; i < d->wordsper; i++)
454-
d->work[i] = 0;
521+
d->work[i] = 0; /* build new stateset bitmap in d->work */
455522
ispost = 0;
456523
noprogress = 1;
457524
gotstate = 0;
@@ -468,22 +535,31 @@ miss(struct vars * v, /* used only for debug flags */
468535
noprogress = 0;
469536
FDEBUG(("%d -> %d\n", i, ca->to));
470537
}
471-
dolacons = (gotstate) ? (cnfa->flags & HASLACONS) : 0;
538+
if (!gotstate)
539+
return NULL; /* character cannot reach any new state */
540+
dolacons = (cnfa->flags & HASLACONS);
472541
sawlacons = 0;
542+
/* outer loop handles transitive closure of reachable-by-LACON states */
473543
while (dolacons)
474-
{ /* transitive closure */
544+
{
475545
dolacons = 0;
476546
for (i = 0; i < d->nstates; i++)
477547
if (ISBSET(d->work, i))
478548
for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
479549
{
480550
if (ca->co < cnfa->ncolors)
481-
continue; /* NOTE CONTINUE */
482-
sawlacons = 1;
551+
continue; /* not a LACON arc */
483552
if (ISBSET(d->work, ca->to))
484-
continue; /* NOTE CONTINUE */
553+
continue; /* arc would be a no-op anyway */
554+
sawlacons = 1; /* this LACON affects our result */
485555
if (!lacon(v, cnfa, cp, ca->co))
486-
continue; /* NOTE CONTINUE */
556+
{
557+
if (ISERR())
558+
return NULL;
559+
continue; /* LACON arc cannot be traversed */
560+
}
561+
if (ISERR())
562+
return NULL;
487563
BSET(d->work, ca->to);
488564
dolacons = 1;
489565
if (ca->to == cnfa->post)
@@ -493,11 +569,9 @@ miss(struct vars * v, /* used only for debug flags */
493569
FDEBUG(("%d :> %d\n", i, ca->to));
494570
}
495571
}
496-
if (!gotstate)
497-
return NULL;
498572
h = HASH(d->work, d->wordsper);
499573

500-
/* next, is that in the cache? */
574+
/* Is this stateset already in the cache? */
501575
for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
502576
if (HIT(h, d->work, p, d->wordsper))
503577
{
@@ -507,6 +581,8 @@ miss(struct vars * v, /* used only for debug flags */
507581
if (i == 0)
508582
{ /* nope, need a new cache entry */
509583
p = getvacant(v, d, cp, start);
584+
if (p == NULL)
585+
return NULL;
510586
assert(p != css);
511587
for (i = 0; i < d->wordsper; i++)
512588
p->states[i] = d->work[i];
@@ -517,8 +593,15 @@ miss(struct vars * v, /* used only for debug flags */
517593
/* lastseen to be dealt with by caller */
518594
}
519595

596+
/*
597+
* Link new stateset to old, unless a LACON affected the result, in which
598+
* case we don't create the link. That forces future transitions across
599+
* this same arc (same prior stateset and character color) to come through
600+
* miss() again, so that we can recheck the LACON(s), which might or might
601+
* not pass since context will be different.
602+
*/
520603
if (!sawlacons)
521-
{ /* lookahead conds. always cache miss */
604+
{
522605
FDEBUG(("c%d[%d]->c%d\n",
523606
(int) (css - d->ssets), co, (int) (p - d->ssets)));
524607
css->outs[co] = p;
@@ -562,11 +645,12 @@ lacon(struct vars * v,
562645

563646
/*
564647
* getvacant - get a vacant state set
648+
*
565649
* This routine clears out the inarcs and outarcs, but does not otherwise
566650
* clear the innards of the state set -- that's up to the caller.
567651
*/
568652
static struct sset *
569-
getvacant(struct vars * v, /* used only for debug flags */
653+
getvacant(struct vars * v,
570654
struct dfa * d,
571655
chr *cp,
572656
chr *start)
@@ -578,6 +662,8 @@ getvacant(struct vars * v, /* used only for debug flags */
578662
color co;
579663

580664
ss = pickss(v, d, cp, start);
665+
if (ss == NULL)
666+
return NULL;
581667
assert(!(ss->flags & LOCKED));
582668

583669
/* clear out its inarcs, including self-referential ones */
@@ -635,7 +721,7 @@ getvacant(struct vars * v, /* used only for debug flags */
635721
* pickss - pick the next stateset to be used
636722
*/
637723
static struct sset *
638-
pickss(struct vars * v, /* used only for debug flags */
724+
pickss(struct vars * v,
639725
struct dfa * d,
640726
chr *cp,
641727
chr *start)
@@ -691,7 +777,6 @@ pickss(struct vars * v, /* used only for debug flags */
691777

692778
/* nobody's old enough?!? -- something's really wrong */
693779
FDEBUG(("cannot find victim to replace!\n"));
694-
assert(NOTREACHED);
695780
ERR(REG_ASSERT);
696-
return d->ssets;
781+
return NULL;
697782
}

0 commit comments

Comments
 (0)