Make regexp engine's backref-related compilation state more bulletproof.

tglsfdc · tglsfdc · commit cb76fbd7ec87 · 2021-08-07T22:27:13.000-04:00
Up to now, we remembered the definition of a capturing parenthesis subexpression by storing a pointer to the associated subRE node. That was okay before, because that subRE didn't get modified anymore while parsing the rest of the regexp. However, in the wake of commit ea1268f, that's no longer true: the outer invocation of parseqatom() feels free to scribble on that subRE. This seems to work anyway, because the states we jam into the child atom in the "prepare a general-purpose state skeleton" stanza aren't really semantically different from the original endpoints of the child atom. But that would be mighty easy to break, and it's definitely not how things worked before. Between this and the issue fixed in the prior commit, it seems best to get rid of this dependence on subRE nodes entirely. We don't need the whole child subRE for future backrefs, only its starting and ending NFA states; so let's just store pointers to those. Also, in the corner case where we make an extra subRE to handle immediately-nested capturing parentheses, it seems like it'd be smart to have the extra subRE have the same begin/end states as the original child subRE does (s/s2 not lp/rp). I think that linking it from lp to rp might actually be semantically wrong, though since Spencer's original code did it that way, I'm not totally certain. Using s/s2 is certainly not wrong, in any case. Per report from Mark Dilger. Back-patch to v14 where the problematic patches came in. Discussion: https://postgr.es/m/0203588E-E609-43AF-9F4F-902854231EE7@enterprisedb.com
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
@@ -233,6 +233,13 @@ static int	cmp(const chr *, const chr *, size_t);
 static int	casecmp(const chr *, const chr *, size_t);
 
 
+/* info we need during compilation about a known capturing subexpression */
+struct subinfo
+{
+	struct state *left;			/* left end of its sub-NFA */
+	struct state *right;		/* right end of its sub-NFA */
+};
+
 /* internal variables, bundled for easy passing around */
 struct vars
 {
@@ -245,10 +252,10 @@ struct vars
 	int			nexttype;		/* type of next token */
 	chr			nextvalue;		/* value (if any) of next token */
 	int			lexcon;			/* lexical context type (see regc_lex.c) */
-	int			nsubexp;		/* subexpression count */
-	struct subre **subs;		/* subRE pointer vector */
-	size_t		nsubs;			/* length of vector */
-	struct subre *sub10[10];	/* initial vector, enough for most */
+	int			nsubexp;		/* number of known capturing subexpressions */
+	struct subinfo *subs;		/* info about known capturing subexpressions */
+	size_t		nsubs;			/* allocated length of subs[] vector */
+	struct subinfo sub10[10];	/* initial vector, enough for most */
 	struct nfa *nfa;			/* the NFA */
 	struct colormap *cm;		/* character color map */
 	color		nlcolor;		/* color of newline */
@@ -368,7 +375,7 @@ pg_regcomp(regex_t *re,
 	v->subs = v->sub10;
 	v->nsubs = 10;
 	for (j = 0; j < v->nsubs; j++)
-		v->subs[j] = NULL;
+		v->subs[j].left = v->subs[j].right = NULL;
 	v->nfa = NULL;
 	v->cm = NULL;
 	v->nlcolor = COLORLESS;
@@ -504,35 +511,35 @@ pg_regcomp(regex_t *re,
 }
 
 /*
- * moresubs - enlarge subRE vector
+ * moresubs - enlarge capturing-subexpressions vector
  */
 static void
 moresubs(struct vars *v,
 		 int wanted)			/* want enough room for this one */
 {
-	struct subre **p;
+	struct subinfo *p;
 	size_t		n;
 
 	assert(wanted > 0 && (size_t) wanted >= v->nsubs);
 	n = (size_t) wanted * 3 / 2 + 1;
 
 	if (v->subs == v->sub10)
 	{
-		p = (struct subre **) MALLOC(n * sizeof(struct subre *));
+		p = (struct subinfo *) MALLOC(n * sizeof(struct subinfo));
 		if (p != NULL)
 			memcpy(VS(p), VS(v->subs),
-				   v->nsubs * sizeof(struct subre *));
+				   v->nsubs * sizeof(struct subinfo));
 	}
 	else
-		p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
+		p = (struct subinfo *) REALLOC(v->subs, n * sizeof(struct subinfo));
 	if (p == NULL)
 	{
 		ERR(REG_ESPACE);
 		return;
 	}
 	v->subs = p;
 	for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
-		*p = NULL;
+		p->left = p->right = NULL;
 	assert(v->nsubs == n);
 	assert((size_t) wanted < v->nsubs);
 }
@@ -969,10 +976,14 @@ parseqatom(struct vars *v,
 			NEXT();
 
 			/*
-			 * Make separate endpoints to ensure we keep this sub-NFA cleanly
-			 * separate from what surrounds it.  We need to be sure that when
-			 * we duplicate the sub-NFA for a backref, we get the right states
-			 * and no others.
+			 * Make separate endpoint states to keep this sub-NFA distinct
+			 * from what surrounds it.  We need to be sure that when we
+			 * duplicate the sub-NFA for a backref, we get the right
+			 * states/arcs and no others.  In particular, letting a backref
+			 * duplicate the sub-NFA from lp to rp would be quite wrong,
+			 * because we may add quantification superstructure around this
+			 * atom below.  (Perhaps we could skip the extra states for
+			 * non-capturing parens, but it seems not worth the trouble.)
 			 */
 			s = newstate(v->nfa);
 			s2 = newstate(v->nfa);
@@ -986,8 +997,10 @@ parseqatom(struct vars *v,
 			NOERRN();
 			if (cap)
 			{
-				assert(v->subs[subno] == NULL);
-				v->subs[subno] = atom;
+				/* save the sub-NFA's endpoints for future backrefs to use */
+				assert(v->subs[subno].left == NULL);
+				v->subs[subno].left = s;
+				v->subs[subno].right = s2;
 				if (atom->capno == 0)
 				{
 					/* normal case: just mark the atom as capturing */
@@ -997,7 +1010,7 @@ parseqatom(struct vars *v,
 				else
 				{
 					/* generate no-op wrapper node to handle "((x))" */
-					t = subre(v, '(', atom->flags | CAP, lp, rp);
+					t = subre(v, '(', atom->flags | CAP, s, s2);
 					NOERRN();
 					t->capno = subno;
 					t->child = atom;
@@ -1009,7 +1022,7 @@ parseqatom(struct vars *v,
 		case BACKREF:			/* the Feature From The Black Lagoon */
 			INSIST(type != LACON, REG_ESUBREG);
 			INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
-			INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
+			INSIST(v->subs[v->nextvalue].left != NULL, REG_ESUBREG);
 			NOERRN();
 			assert(v->nextvalue > 0);
 			atom = subre(v, 'b', BACKR, lp, rp);
@@ -1084,7 +1097,7 @@ parseqatom(struct vars *v,
 		if (atom != NULL)
 			freesubre(v, atom);
 		if (atomtype == '(')
-			v->subs[subno] = NULL;
+			v->subs[subno].left = v->subs[subno].right = NULL;
 		delsub(v->nfa, lp, rp);
 		EMPTYARC(lp, rp);
 		return top;
@@ -1177,14 +1190,14 @@ parseqatom(struct vars *v,
 	{
 		assert(atom->begin->nouts == 1);	/* just the EMPTY */
 		delsub(v->nfa, atom->begin, atom->end);
-		assert(v->subs[subno] != NULL);
+		assert(v->subs[subno].left != NULL);
 
 		/*
 		 * And here's why the recursion got postponed: it must wait until the
 		 * skeleton is filled in, because it may hit a backref that wants to
 		 * copy the filled-in skeleton.
 		 */
-		dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
+		dupnfa(v->nfa, v->subs[subno].left, v->subs[subno].right,
 			   atom->begin, atom->end);
 		NOERRN();