4 files changed, 67 insertions, 30 deletions
diff --git a/src/backend/regex/README b/src/backend/regex/README
index cafeb3dffb0..e4b083664f2 100644
--- a/src/backend/regex/README
+++ b/src/backend/regex/README
@@ -130,8 +130,8 @@ possibilities.
 
 As an example, consider the regex "(a[bc]+)\1".  The compiled
 representation will have a top-level concatenation subre node.  Its first
-child is a capture node, and the child of that is a plain DFA node for
-"a[bc]+".  The concatenation's second child is a backref node for \1.
+child is a plain DFA node for "a[bc]+" (which is marked as being a capture
+node).  The concatenation's second child is a backref node for \1.
 The DFA associated with the concatenation node will be "a[bc]+a[bc]+",
 where the backref has been replaced by a copy of the DFA for its referent
 expression.  When executed, the concatenation node will have to search for
@@ -147,6 +147,17 @@ run much faster than a pure NFA engine could do.  It is this behavior that
 justifies using the phrase "hybrid DFA/NFA engine" to describe Spencer's
 library.
 
+It's perhaps worth noting that separate capture subre nodes are a rarity:
+normally, we just mark a subre as capturing and that's it.  However, it's
+legal to write a regex like "((x))" in which the same substring has to be
+captured by multiple sets of parentheses.  Since a subre has room for only
+one "capno" field, a single subre can't handle that.  We handle such cases
+by wrapping the base subre (which captures the innermost parens) in a
+no-op capture node, or even more than one for "(((x)))" etc.  This is a
+little bit inefficient because we end up with multiple identical NFAs,
+but since the case is pointless and infrequent, it's not worth working
+harder.
+
 
 Colors and colormapping
 -----------------------
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index c0680b280bc..0cd4b4c4c29 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -452,7 +452,7 @@ pg_regcomp(regex_t *re,
 #endif
 
 		/* Prepend .* to pattern if it's a lookbehind LACON */
-		nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug);
+		nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->latype), debug);
 	}
 	CNOERR();
 	if (v->tree->flags & SHORTER)
@@ -944,7 +944,13 @@ parseqatom(struct vars *v,
 			else
 				atomtype = PLAIN;	/* something that's not '(' */
 			NEXT();
-			/* need new endpoints because tree will contain pointers */
+
+			/*
+			 * Make separate endpoints to ensure we keep this sub-NFA cleanly
+			 * separate from what surrounds it.  We need to be sure that when
+			 * we duplicate the sub-NFA for a backref, we get the right states
+			 * and no others.
+			 */
 			s = newstate(v->nfa);
 			s2 = newstate(v->nfa);
 			NOERR();
@@ -959,11 +965,21 @@ parseqatom(struct vars *v,
 			{
 				assert(v->subs[subno] == NULL);
 				v->subs[subno] = atom;
-				t = subre(v, '(', atom->flags | CAP, lp, rp);
-				NOERR();
-				t->subno = subno;
-				t->child = atom;
-				atom = t;
+				if (atom->capno == 0)
+				{
+					/* normal case: just mark the atom as capturing */
+					atom->flags |= CAP;
+					atom->capno = subno;
+				}
+				else
+				{
+					/* generate no-op wrapper node to handle "((x))" */
+					t = subre(v, '(', atom->flags | CAP, lp, rp);
+					NOERR();
+					t->capno = subno;
+					t->child = atom;
+					atom = t;
+				}
 			}
 			/* postpone everything else pending possible {0} */
 			break;
@@ -976,7 +992,7 @@ parseqatom(struct vars *v,
 			atom = subre(v, 'b', BACKR, lp, rp);
 			NOERR();
 			subno = v->nextvalue;
-			atom->subno = subno;
+			atom->backno = subno;
 			EMPTYARC(lp, rp);	/* temporarily, so there's something */
 			NEXT();
 			break;
@@ -1276,8 +1292,10 @@ parseqatom(struct vars *v,
 			freesubre(v, top->child);
 			top->op = t->op;
 			top->flags = t->flags;
+			top->latype = t->latype;
 			top->id = t->id;
-			top->subno = t->subno;
+			top->capno = t->capno;
+			top->backno = t->backno;
 			top->min = t->min;
 			top->max = t->max;
 			top->child = t->child;
@@ -1790,8 +1808,10 @@ subre(struct vars *v,
 
 	ret->op = op;
 	ret->flags = flags;
+	ret->latype = (char) -1;
 	ret->id = 0;				/* will be assigned later */
-	ret->subno = 0;
+	ret->capno = 0;
+	ret->backno = 0;
 	ret->min = ret->max = 1;
 	ret->child = NULL;
 	ret->sibling = NULL;
@@ -1893,7 +1913,7 @@ numst(struct subre *t,
 	assert(t != NULL);
 
 	i = start;
-	t->id = (short) i++;
+	t->id = i++;
 	for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
 		i = numst(t2, i);
 	return i;
@@ -2040,7 +2060,7 @@ newlacon(struct vars *v,
 	sub = &v->lacons[n];
 	sub->begin = begin;
 	sub->end = end;
-	sub->subno = latype;
+	sub->latype = latype;
 	ZAPCNFA(sub->cnfa);
 	return n;
 }
@@ -2163,7 +2183,7 @@ dump(regex_t *re,
 		struct subre *lasub = &g->lacons[i];
 		const char *latype;
 
-		switch (lasub->subno)
+		switch (lasub->latype)
 		{
 			case LATYPE_AHEAD_POS:
 				latype = "positive lookahead";
@@ -2227,8 +2247,12 @@ stdump(struct subre *t,
 		fprintf(f, " hasbackref");
 	if (!(t->flags & INUSE))
 		fprintf(f, " UNUSED");
-	if (t->subno != 0)
-		fprintf(f, " (#%d)", t->subno);
+	if (t->latype != (char) -1)
+		fprintf(f, " latype(%d)", t->latype);
+	if (t->capno != 0)
+		fprintf(f, " capture(%d)", t->capno);
+	if (t->backno != 0)
+		fprintf(f, " backref(%d)", t->backno);
 	if (t->min != 1 || t->max != 1)
 	{
 		fprintf(f, " {%d,", t->min);
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c
index 89d162ed6af..957ceb8137d 100644
--- a/src/backend/regex/rege_dfa.c
+++ b/src/backend/regex/rege_dfa.c
@@ -825,12 +825,12 @@ lacon(struct vars *v,
 	d = getladfa(v, n);
 	if (d == NULL)
 		return 0;
-	if (LATYPE_IS_AHEAD(sub->subno))
+	if (LATYPE_IS_AHEAD(sub->latype))
 	{
 		/* used to use longest() here, but shortest() could be much cheaper */
 		end = shortest(v, d, cp, cp, v->stop,
 					   (chr **) NULL, (int *) NULL);
-		satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL);
+		satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL);
 	}
 	else
 	{
@@ -843,7 +843,7 @@ lacon(struct vars *v,
 		 * nominal match.
 		 */
 		satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
-		if (!LATYPE_IS_POS(sub->subno))
+		if (!LATYPE_IS_POS(sub->latype))
 			satisfied = !satisfied;
 	}
 	FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index 4541cf9a7e0..2a1ef0176a5 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -640,13 +640,11 @@ static void
 zaptreesubs(struct vars *v,
 			struct subre *t)
 {
+	int			n = t->capno;
 	struct subre *t2;
 
-	if (t->op == '(')
+	if (n > 0)
 	{
-		int			n = t->subno;
-
-		assert(n > 0);
 		if ((size_t) n < v->nmatch)
 		{
 			v->pmatch[n].rm_so = -1;
@@ -667,7 +665,7 @@ subset(struct vars *v,
 	   chr *begin,
 	   chr *end)
 {
-	int			n = sub->subno;
+	int			n = sub->capno;
 
 	assert(n > 0);
 	if ((size_t) n >= v->nmatch)
@@ -739,12 +737,10 @@ cdissect(struct vars *v,
 			else
 				er = citerdissect(v, t, begin, end);
 			break;
-		case '(':				/* capturing */
+		case '(':				/* no-op capture node */
 			assert(t->child != NULL);
-			assert(t->subno > 0);
+			assert(t->capno > 0);
 			er = cdissect(v, t->child, begin, end);
-			if (er == REG_OKAY)
-				subset(v, t, begin, end);
 			break;
 		default:
 			er = REG_ASSERT;
@@ -758,6 +754,12 @@ cdissect(struct vars *v,
 	 */
 	assert(er != REG_NOMATCH || (t->flags & BACKR));
 
+	/*
+	 * If this node is marked as capturing, save successful match's location.
+	 */
+	if (t->capno > 0 && er == REG_OKAY)
+		subset(v, t, begin, end);
+
 	return er;
 }
 
@@ -932,7 +934,7 @@ cbrdissect(struct vars *v,
 		   chr *begin,			/* beginning of relevant substring */
 		   chr *end)			/* end of same */
 {
-	int			n = t->subno;
+	int			n = t->backno;
 	size_t		numreps;
 	size_t		tlen;
 	size_t		brlen;