Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'src/backend/regex')
-rw-r--r--src/backend/regex/README15
-rw-r--r--src/backend/regex/regcomp.c54
-rw-r--r--src/backend/regex/rege_dfa.c6
-rw-r--r--src/backend/regex/regexec.c22
4 files changed, 67 insertions, 30 deletions
diff --git a/src/backend/regex/README b/src/backend/regex/README
index cafeb3dffb0..e4b083664f2 100644
--- a/src/backend/regex/README
+++ b/src/backend/regex/README
@@ -130,8 +130,8 @@ possibilities.
As an example, consider the regex "(a[bc]+)\1". The compiled
representation will have a top-level concatenation subre node. Its first
-child is a capture node, and the child of that is a plain DFA node for
-"a[bc]+". The concatenation's second child is a backref node for \1.
+child is a plain DFA node for "a[bc]+" (which is marked as being a capture
+node). The concatenation's second child is a backref node for \1.
The DFA associated with the concatenation node will be "a[bc]+a[bc]+",
where the backref has been replaced by a copy of the DFA for its referent
expression. When executed, the concatenation node will have to search for
@@ -147,6 +147,17 @@ run much faster than a pure NFA engine could do. It is this behavior that
justifies using the phrase "hybrid DFA/NFA engine" to describe Spencer's
library.
+It's perhaps worth noting that separate capture subre nodes are a rarity:
+normally, we just mark a subre as capturing and that's it. However, it's
+legal to write a regex like "((x))" in which the same substring has to be
+captured by multiple sets of parentheses. Since a subre has room for only
+one "capno" field, a single subre can't handle that. We handle such cases
+by wrapping the base subre (which captures the innermost parens) in a
+no-op capture node, or even more than one for "(((x)))" etc. This is a
+little bit inefficient because we end up with multiple identical NFAs,
+but since the case is pointless and infrequent, it's not worth working
+harder.
+
Colors and colormapping
-----------------------
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index c0680b280bc..0cd4b4c4c29 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -452,7 +452,7 @@ pg_regcomp(regex_t *re,
#endif
/* Prepend .* to pattern if it's a lookbehind LACON */
- nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug);
+ nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->latype), debug);
}
CNOERR();
if (v->tree->flags & SHORTER)
@@ -944,7 +944,13 @@ parseqatom(struct vars *v,
else
atomtype = PLAIN; /* something that's not '(' */
NEXT();
- /* need new endpoints because tree will contain pointers */
+
+ /*
+ * Make separate endpoints to ensure we keep this sub-NFA cleanly
+ * separate from what surrounds it. We need to be sure that when
+ * we duplicate the sub-NFA for a backref, we get the right states
+ * and no others.
+ */
s = newstate(v->nfa);
s2 = newstate(v->nfa);
NOERR();
@@ -959,11 +965,21 @@ parseqatom(struct vars *v,
{
assert(v->subs[subno] == NULL);
v->subs[subno] = atom;
- t = subre(v, '(', atom->flags | CAP, lp, rp);
- NOERR();
- t->subno = subno;
- t->child = atom;
- atom = t;
+ if (atom->capno == 0)
+ {
+ /* normal case: just mark the atom as capturing */
+ atom->flags |= CAP;
+ atom->capno = subno;
+ }
+ else
+ {
+ /* generate no-op wrapper node to handle "((x))" */
+ t = subre(v, '(', atom->flags | CAP, lp, rp);
+ NOERR();
+ t->capno = subno;
+ t->child = atom;
+ atom = t;
+ }
}
/* postpone everything else pending possible {0} */
break;
@@ -976,7 +992,7 @@ parseqatom(struct vars *v,
atom = subre(v, 'b', BACKR, lp, rp);
NOERR();
subno = v->nextvalue;
- atom->subno = subno;
+ atom->backno = subno;
EMPTYARC(lp, rp); /* temporarily, so there's something */
NEXT();
break;
@@ -1276,8 +1292,10 @@ parseqatom(struct vars *v,
freesubre(v, top->child);
top->op = t->op;
top->flags = t->flags;
+ top->latype = t->latype;
top->id = t->id;
- top->subno = t->subno;
+ top->capno = t->capno;
+ top->backno = t->backno;
top->min = t->min;
top->max = t->max;
top->child = t->child;
@@ -1790,8 +1808,10 @@ subre(struct vars *v,
ret->op = op;
ret->flags = flags;
+ ret->latype = (char) -1;
ret->id = 0; /* will be assigned later */
- ret->subno = 0;
+ ret->capno = 0;
+ ret->backno = 0;
ret->min = ret->max = 1;
ret->child = NULL;
ret->sibling = NULL;
@@ -1893,7 +1913,7 @@ numst(struct subre *t,
assert(t != NULL);
i = start;
- t->id = (short) i++;
+ t->id = i++;
for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
i = numst(t2, i);
return i;
@@ -2040,7 +2060,7 @@ newlacon(struct vars *v,
sub = &v->lacons[n];
sub->begin = begin;
sub->end = end;
- sub->subno = latype;
+ sub->latype = latype;
ZAPCNFA(sub->cnfa);
return n;
}
@@ -2163,7 +2183,7 @@ dump(regex_t *re,
struct subre *lasub = &g->lacons[i];
const char *latype;
- switch (lasub->subno)
+ switch (lasub->latype)
{
case LATYPE_AHEAD_POS:
latype = "positive lookahead";
@@ -2227,8 +2247,12 @@ stdump(struct subre *t,
fprintf(f, " hasbackref");
if (!(t->flags & INUSE))
fprintf(f, " UNUSED");
- if (t->subno != 0)
- fprintf(f, " (#%d)", t->subno);
+ if (t->latype != (char) -1)
+ fprintf(f, " latype(%d)", t->latype);
+ if (t->capno != 0)
+ fprintf(f, " capture(%d)", t->capno);
+ if (t->backno != 0)
+ fprintf(f, " backref(%d)", t->backno);
if (t->min != 1 || t->max != 1)
{
fprintf(f, " {%d,", t->min);
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c
index 89d162ed6af..957ceb8137d 100644
--- a/src/backend/regex/rege_dfa.c
+++ b/src/backend/regex/rege_dfa.c
@@ -825,12 +825,12 @@ lacon(struct vars *v,
d = getladfa(v, n);
if (d == NULL)
return 0;
- if (LATYPE_IS_AHEAD(sub->subno))
+ if (LATYPE_IS_AHEAD(sub->latype))
{
/* used to use longest() here, but shortest() could be much cheaper */
end = shortest(v, d, cp, cp, v->stop,
(chr **) NULL, (int *) NULL);
- satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL);
+ satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL);
}
else
{
@@ -843,7 +843,7 @@ lacon(struct vars *v,
* nominal match.
*/
satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
- if (!LATYPE_IS_POS(sub->subno))
+ if (!LATYPE_IS_POS(sub->latype))
satisfied = !satisfied;
}
FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index 4541cf9a7e0..2a1ef0176a5 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -640,13 +640,11 @@ static void
zaptreesubs(struct vars *v,
struct subre *t)
{
+ int n = t->capno;
struct subre *t2;
- if (t->op == '(')
+ if (n > 0)
{
- int n = t->subno;
-
- assert(n > 0);
if ((size_t) n < v->nmatch)
{
v->pmatch[n].rm_so = -1;
@@ -667,7 +665,7 @@ subset(struct vars *v,
chr *begin,
chr *end)
{
- int n = sub->subno;
+ int n = sub->capno;
assert(n > 0);
if ((size_t) n >= v->nmatch)
@@ -739,12 +737,10 @@ cdissect(struct vars *v,
else
er = citerdissect(v, t, begin, end);
break;
- case '(': /* capturing */
+ case '(': /* no-op capture node */
assert(t->child != NULL);
- assert(t->subno > 0);
+ assert(t->capno > 0);
er = cdissect(v, t->child, begin, end);
- if (er == REG_OKAY)
- subset(v, t, begin, end);
break;
default:
er = REG_ASSERT;
@@ -758,6 +754,12 @@ cdissect(struct vars *v,
*/
assert(er != REG_NOMATCH || (t->flags & BACKR));
+ /*
+ * If this node is marked as capturing, save successful match's location.
+ */
+ if (t->capno > 0 && er == REG_OKAY)
+ subset(v, t, begin, end);
+
return er;
}
@@ -932,7 +934,7 @@ cbrdissect(struct vars *v,
chr *begin, /* beginning of relevant substring */
chr *end) /* end of same */
{
- int n = t->subno;
+ int n = t->backno;
size_t numreps;
size_t tlen;
size_t brlen;