From 12c9a04008870c283931d6b3b648ee21bbc2cfda Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 30 Oct 2015 19:14:19 -0400 Subject: Implement lookbehind constraints in our regular-expression engine. A lookbehind constraint is like a lookahead constraint in that it consumes no text; but it checks for existence (or nonexistence) of a match *ending* at the current point in the string, rather than one *starting* at the current point. This is a long-requested feature since it exists in many other regex libraries, but Henry Spencer had never got around to implementing it in the code we use. Just making it work is actually pretty trivial; but naive copying of the logic for lookahead constraints leads to code that often spends O(N^2) time to scan an N-character string, because we have to run the match engine from string start to the current probe point each time the constraint is checked. In typical use-cases a lookbehind constraint will be written at the start of the regex and hence will need to be checked at every character --- so O(N^2) work overall. To fix that, I introduced a third copy of the core DFA matching loop, paralleling the existing longest() and shortest() loops. This version, matchuntil(), can suspend and resume matching given a couple of pointers' worth of storage space. So we need only run it across the string once, stopping at each interesting probe point and then resuming to advance to the next one. I also put in an optimization that simplifies one-character lookahead and lookbehind constraints, such as "(?=x)" or "(?nlacons == 0 || v->lacons != NULL); for (i = 1; i < v->nlacons; i++) { + struct subre *lasub = &v->lacons[i]; + #ifdef REG_DEBUG if (debug != NULL) fprintf(debug, "\n\n\n========= LA%d ==========\n", i); #endif - nfanode(v, &v->lacons[i], debug); + + /* Prepend .* to pattern if it's a lookbehind LACON */ + nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug); } CNOERR(); if (v->tree->flags & SHORTER) @@ -640,7 +648,7 @@ makesearch(struct vars * v, static struct subre * parse(struct vars * v, int stopper, /* EOS or ')' */ - int type, /* LACON (lookahead subRE) or PLAIN */ + int type, /* LACON (lookaround subRE) or PLAIN */ struct state * init, /* initial state */ struct state * final) /* final state */ { @@ -719,7 +727,7 @@ parse(struct vars * v, static struct subre * parsebranch(struct vars * v, int stopper, /* EOS or ')' */ - int type, /* LACON (lookahead subRE) or PLAIN */ + int type, /* LACON (lookaround subRE) or PLAIN */ struct state * left, /* leftmost state */ struct state * right, /* rightmost state */ int partial) /* is this only part of a branch? */ @@ -768,7 +776,7 @@ parsebranch(struct vars * v, static void parseqatom(struct vars * v, int stopper, /* EOS or ')' */ - int type, /* LACON (lookahead subRE) or PLAIN */ + int type, /* LACON (lookaround subRE) or PLAIN */ struct state * lp, /* left state to hang it on */ struct state * rp, /* right state to hang it on */ struct subre * top) /* subtree top */ @@ -782,7 +790,7 @@ parseqatom(struct vars * v, struct subre *atom; /* atom's subtree */ struct subre *t; int cap; /* capturing parens? */ - int pos; /* positive lookahead? */ + int latype; /* lookaround constraint type */ int subno; /* capturing-parens or backref number */ int atomtype; int qprefer; /* quantifier short/long preference */ @@ -866,19 +874,18 @@ parseqatom(struct vars * v, nonword(v, AHEAD, s, rp); return; break; - case LACON: /* lookahead constraint */ - pos = v->nextvalue; + case LACON: /* lookaround constraint */ + latype = v->nextvalue; NEXT(); s = newstate(v->nfa); s2 = newstate(v->nfa); NOERR(); t = parse(v, ')', LACON, s, s2); freesubre(v, t); /* internal structure irrelevant */ - assert(SEE(')') || ISERR()); - NEXT(); - n = newlacon(v, s, s2, pos); NOERR(); - ARCV(LACON, n); + assert(SEE(')')); + NEXT(); + processlacon(v, s, s2, latype, lp, rp); return; break; /* then errors, to get them out of the way */ @@ -1633,6 +1640,75 @@ wordchrs(struct vars * v) v->wordchrs = left; } +/* + * processlacon - generate the NFA representation of a LACON + * + * In the general case this is just newlacon() + newarc(), but some cases + * can be optimized. + */ +static void +processlacon(struct vars * v, + struct state * begin, /* start of parsed LACON sub-re */ + struct state * end, /* end of parsed LACON sub-re */ + int latype, + struct state * lp, /* left state to hang it on */ + struct state * rp) /* right state to hang it on */ +{ + struct state *s1; + int n; + + /* + * Check for lookaround RE consisting of a single plain color arc (or set + * of arcs); this would typically be a simple chr or a bracket expression. + */ + s1 = single_color_transition(begin, end); + switch (latype) + { + case LATYPE_AHEAD_POS: + /* If lookahead RE is just colorset C, convert to AHEAD(C) */ + if (s1 != NULL) + { + cloneouts(v->nfa, s1, lp, rp, AHEAD); + return; + } + break; + case LATYPE_AHEAD_NEG: + /* If lookahead RE is just colorset C, convert to AHEAD(^C)|$ */ + if (s1 != NULL) + { + colorcomplement(v->nfa, v->cm, AHEAD, s1, lp, rp); + newarc(v->nfa, '$', 1, lp, rp); + newarc(v->nfa, '$', 0, lp, rp); + return; + } + break; + case LATYPE_BEHIND_POS: + /* If lookbehind RE is just colorset C, convert to BEHIND(C) */ + if (s1 != NULL) + { + cloneouts(v->nfa, s1, lp, rp, BEHIND); + return; + } + break; + case LATYPE_BEHIND_NEG: + /* If lookbehind RE is just colorset C, convert to BEHIND(^C)|^ */ + if (s1 != NULL) + { + colorcomplement(v->nfa, v->cm, BEHIND, s1, lp, rp); + newarc(v->nfa, '^', 1, lp, rp); + newarc(v->nfa, '^', 0, lp, rp); + return; + } + break; + default: + assert(NOTREACHED); + } + + /* General case: we need a LACON subre and arc */ + n = newlacon(v, begin, end, latype); + newarc(v->nfa, LACON, n, lp, rp); +} + /* * subre - allocate a subre */ @@ -1826,15 +1902,18 @@ nfatree(struct vars * v, if (t->right != NULL) (DISCARD) nfatree(v, t->right, f); - return nfanode(v, t, f); + return nfanode(v, t, 0, f); } /* - * nfanode - do one NFA for nfatree + * nfanode - do one NFA for nfatree or lacons + * + * If converttosearch is true, apply makesearch() to the NFA. */ static long /* optimize results */ nfanode(struct vars * v, struct subre * t, + int converttosearch, FILE *f) /* for debug output */ { struct nfa *nfa; @@ -1855,10 +1934,11 @@ nfanode(struct vars * v, NOERRZ(); dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final); if (!ISERR()) - { specialcolors(nfa); + if (!ISERR()) ret = optimize(nfa, f); - } + if (converttosearch && !ISERR()) + makesearch(v, nfa); if (!ISERR()) compact(nfa, &t->cnfa); @@ -1867,13 +1947,13 @@ nfanode(struct vars * v, } /* - * newlacon - allocate a lookahead-constraint subRE + * newlacon - allocate a lookaround-constraint subRE */ static int /* lacon number */ newlacon(struct vars * v, struct state * begin, struct state * end, - int pos) + int latype) { int n; struct subre *newlacons; @@ -1900,13 +1980,13 @@ newlacon(struct vars * v, sub = &v->lacons[n]; sub->begin = begin; sub->end = end; - sub->subno = pos; + sub->subno = latype; ZAPCNFA(sub->cnfa); return n; } /* - * freelacons - free lookahead-constraint subRE vector + * freelacons - free lookaround-constraint subRE vector */ static void freelacons(struct subre * subs, @@ -2020,9 +2100,29 @@ dump(regex_t *re, } for (i = 1; i < g->nlacons; i++) { - fprintf(f, "\nla%d (%s):\n", i, - (g->lacons[i].subno) ? "positive" : "negative"); - dumpcnfa(&g->lacons[i].cnfa, f); + struct subre *lasub = &g->lacons[i]; + const char *latype; + + switch (lasub->subno) + { + case LATYPE_AHEAD_POS: + latype = "positive lookahead"; + break; + case LATYPE_AHEAD_NEG: + latype = "negative lookahead"; + break; + case LATYPE_BEHIND_POS: + latype = "positive lookbehind"; + break; + case LATYPE_BEHIND_NEG: + latype = "negative lookbehind"; + break; + default: + latype = "???"; + break; + } + fprintf(f, "\nla%d (%s):\n", i, latype); + dumpcnfa(&lasub->cnfa, f); } fprintf(f, "\n"); dumpst(g->tree, f, 0); -- cgit v1.2.3