Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit cb76fbd

Browse files
committed
Make regexp engine's backref-related compilation state more bulletproof.
Up to now, we remembered the definition of a capturing parenthesis subexpression by storing a pointer to the associated subRE node. That was okay before, because that subRE didn't get modified anymore while parsing the rest of the regexp. However, in the wake of commit ea1268f, that's no longer true: the outer invocation of parseqatom() feels free to scribble on that subRE. This seems to work anyway, because the states we jam into the child atom in the "prepare a general-purpose state skeleton" stanza aren't really semantically different from the original endpoints of the child atom. But that would be mighty easy to break, and it's definitely not how things worked before. Between this and the issue fixed in the prior commit, it seems best to get rid of this dependence on subRE nodes entirely. We don't need the whole child subRE for future backrefs, only its starting and ending NFA states; so let's just store pointers to those. Also, in the corner case where we make an extra subRE to handle immediately-nested capturing parentheses, it seems like it'd be smart to have the extra subRE have the same begin/end states as the original child subRE does (s/s2 not lp/rp). I think that linking it from lp to rp might actually be semantically wrong, though since Spencer's original code did it that way, I'm not totally certain. Using s/s2 is certainly not wrong, in any case. Per report from Mark Dilger. Back-patch to v14 where the problematic patches came in. Discussion: https://postgr.es/m/0203588E-E609-43AF-9F4F-902854231EE7@enterprisedb.com
1 parent cc18687 commit cb76fbd

File tree

1 file changed

+35
-22
lines changed

1 file changed

+35
-22
lines changed

src/backend/regex/regcomp.c

+35-22
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,13 @@ static int cmp(const chr *, const chr *, size_t);
233233
static int casecmp(const chr *, const chr *, size_t);
234234

235235

236+
/* info we need during compilation about a known capturing subexpression */
237+
struct subinfo
238+
{
239+
struct state *left; /* left end of its sub-NFA */
240+
struct state *right; /* right end of its sub-NFA */
241+
};
242+
236243
/* internal variables, bundled for easy passing around */
237244
struct vars
238245
{
@@ -245,10 +252,10 @@ struct vars
245252
int nexttype; /* type of next token */
246253
chr nextvalue; /* value (if any) of next token */
247254
int lexcon; /* lexical context type (see regc_lex.c) */
248-
int nsubexp; /* subexpression count */
249-
struct subre **subs; /* subRE pointer vector */
250-
size_t nsubs; /* length of vector */
251-
struct subre *sub10[10]; /* initial vector, enough for most */
255+
int nsubexp; /* number of known capturing subexpressions */
256+
struct subinfo *subs; /* info about known capturing subexpressions */
257+
size_t nsubs; /* allocated length of subs[] vector */
258+
struct subinfo sub10[10]; /* initial vector, enough for most */
252259
struct nfa *nfa; /* the NFA */
253260
struct colormap *cm; /* character color map */
254261
color nlcolor; /* color of newline */
@@ -368,7 +375,7 @@ pg_regcomp(regex_t *re,
368375
v->subs = v->sub10;
369376
v->nsubs = 10;
370377
for (j = 0; j < v->nsubs; j++)
371-
v->subs[j] = NULL;
378+
v->subs[j].left = v->subs[j].right = NULL;
372379
v->nfa = NULL;
373380
v->cm = NULL;
374381
v->nlcolor = COLORLESS;
@@ -504,35 +511,35 @@ pg_regcomp(regex_t *re,
504511
}
505512

506513
/*
507-
* moresubs - enlarge subRE vector
514+
* moresubs - enlarge capturing-subexpressions vector
508515
*/
509516
static void
510517
moresubs(struct vars *v,
511518
int wanted) /* want enough room for this one */
512519
{
513-
struct subre **p;
520+
struct subinfo *p;
514521
size_t n;
515522

516523
assert(wanted > 0 && (size_t) wanted >= v->nsubs);
517524
n = (size_t) wanted * 3 / 2 + 1;
518525

519526
if (v->subs == v->sub10)
520527
{
521-
p = (struct subre **) MALLOC(n * sizeof(struct subre *));
528+
p = (struct subinfo *) MALLOC(n * sizeof(struct subinfo));
522529
if (p != NULL)
523530
memcpy(VS(p), VS(v->subs),
524-
v->nsubs * sizeof(struct subre *));
531+
v->nsubs * sizeof(struct subinfo));
525532
}
526533
else
527-
p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
534+
p = (struct subinfo *) REALLOC(v->subs, n * sizeof(struct subinfo));
528535
if (p == NULL)
529536
{
530537
ERR(REG_ESPACE);
531538
return;
532539
}
533540
v->subs = p;
534541
for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
535-
*p = NULL;
542+
p->left = p->right = NULL;
536543
assert(v->nsubs == n);
537544
assert((size_t) wanted < v->nsubs);
538545
}
@@ -969,10 +976,14 @@ parseqatom(struct vars *v,
969976
NEXT();
970977

971978
/*
972-
* Make separate endpoints to ensure we keep this sub-NFA cleanly
973-
* separate from what surrounds it. We need to be sure that when
974-
* we duplicate the sub-NFA for a backref, we get the right states
975-
* and no others.
979+
* Make separate endpoint states to keep this sub-NFA distinct
980+
* from what surrounds it. We need to be sure that when we
981+
* duplicate the sub-NFA for a backref, we get the right
982+
* states/arcs and no others. In particular, letting a backref
983+
* duplicate the sub-NFA from lp to rp would be quite wrong,
984+
* because we may add quantification superstructure around this
985+
* atom below. (Perhaps we could skip the extra states for
986+
* non-capturing parens, but it seems not worth the trouble.)
976987
*/
977988
s = newstate(v->nfa);
978989
s2 = newstate(v->nfa);
@@ -986,8 +997,10 @@ parseqatom(struct vars *v,
986997
NOERRN();
987998
if (cap)
988999
{
989-
assert(v->subs[subno] == NULL);
990-
v->subs[subno] = atom;
1000+
/* save the sub-NFA's endpoints for future backrefs to use */
1001+
assert(v->subs[subno].left == NULL);
1002+
v->subs[subno].left = s;
1003+
v->subs[subno].right = s2;
9911004
if (atom->capno == 0)
9921005
{
9931006
/* normal case: just mark the atom as capturing */
@@ -997,7 +1010,7 @@ parseqatom(struct vars *v,
9971010
else
9981011
{
9991012
/* generate no-op wrapper node to handle "((x))" */
1000-
t = subre(v, '(', atom->flags | CAP, lp, rp);
1013+
t = subre(v, '(', atom->flags | CAP, s, s2);
10011014
NOERRN();
10021015
t->capno = subno;
10031016
t->child = atom;
@@ -1009,7 +1022,7 @@ parseqatom(struct vars *v,
10091022
case BACKREF: /* the Feature From The Black Lagoon */
10101023
INSIST(type != LACON, REG_ESUBREG);
10111024
INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
1012-
INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
1025+
INSIST(v->subs[v->nextvalue].left != NULL, REG_ESUBREG);
10131026
NOERRN();
10141027
assert(v->nextvalue > 0);
10151028
atom = subre(v, 'b', BACKR, lp, rp);
@@ -1084,7 +1097,7 @@ parseqatom(struct vars *v,
10841097
if (atom != NULL)
10851098
freesubre(v, atom);
10861099
if (atomtype == '(')
1087-
v->subs[subno] = NULL;
1100+
v->subs[subno].left = v->subs[subno].right = NULL;
10881101
delsub(v->nfa, lp, rp);
10891102
EMPTYARC(lp, rp);
10901103
return top;
@@ -1177,14 +1190,14 @@ parseqatom(struct vars *v,
11771190
{
11781191
assert(atom->begin->nouts == 1); /* just the EMPTY */
11791192
delsub(v->nfa, atom->begin, atom->end);
1180-
assert(v->subs[subno] != NULL);
1193+
assert(v->subs[subno].left != NULL);
11811194

11821195
/*
11831196
* And here's why the recursion got postponed: it must wait until the
11841197
* skeleton is filled in, because it may hit a backref that wants to
11851198
* copy the filled-in skeleton.
11861199
*/
1187-
dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1200+
dupnfa(v->nfa, v->subs[subno].left, v->subs[subno].right,
11881201
atom->begin, atom->end);
11891202
NOERRN();
11901203

0 commit comments

Comments
 (0)