Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 0c3405c

Browse files
committed
Improve performance of regular expression back-references.
In some cases, at the time that we're doing an NFA-based precheck of whether a backref subexpression can match at a particular place in the string, we already know which substring the referenced subexpression matched. If so, we might as well forget about the NFA and just compare the substring; this is faster and it gives an exact rather than approximate answer. In general, this optimization can help while we are prechecking within the second child expression of a concat node, while the capture was within the first child expression; then the substring was saved during cdissect() of the first child and will be available to NFA checks done while cdissect() recurses into the second child. It can help quite a lot if the tree looks like concat / \ capture concat / \ expensive stuff backref as we will be able to avoid recursively dissecting the "expensive stuff" before discovering that the backref isn't satisfied with a particular midpoint that the lower concat node is testing. This doesn't help if the concat tree is left-deep, as the capture node won't get set soon enough (and it's hard to fix that without changing the engine's match behavior). Fortunately, right-deep concat trees are the common case. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/661609.1614560029@sss.pgh.pa.us
1 parent 4aea704 commit 0c3405c

File tree

2 files changed

+138
-3
lines changed

2 files changed

+138
-3
lines changed

src/backend/regex/rege_dfa.c

+117
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,19 @@ longest(struct vars *v,
5858
if (hitstopp != NULL)
5959
*hitstopp = 0;
6060

61+
/* if this is a backref to a known string, just match against that */
62+
if (d->backno >= 0)
63+
{
64+
assert((size_t) d->backno < v->nmatch);
65+
if (v->pmatch[d->backno].rm_so >= 0)
66+
{
67+
cp = dfa_backref(v, d, start, start, stop, false);
68+
if (cp == v->stop && stop == v->stop && hitstopp != NULL)
69+
*hitstopp = 1;
70+
return cp;
71+
}
72+
}
73+
6174
/* fast path for matchall NFAs */
6275
if (d->cnfa->flags & MATCHALL)
6376
{
@@ -210,6 +223,20 @@ shortest(struct vars *v,
210223
if (hitstopp != NULL)
211224
*hitstopp = 0;
212225

226+
/* if this is a backref to a known string, just match against that */
227+
if (d->backno >= 0)
228+
{
229+
assert((size_t) d->backno < v->nmatch);
230+
if (v->pmatch[d->backno].rm_so >= 0)
231+
{
232+
cp = dfa_backref(v, d, start, min, max, true);
233+
if (cp != NULL && coldp != NULL)
234+
*coldp = start;
235+
/* there is no case where we should set *hitstopp */
236+
return cp;
237+
}
238+
}
239+
213240
/* fast path for matchall NFAs */
214241
if (d->cnfa->flags & MATCHALL)
215242
{
@@ -467,6 +494,94 @@ matchuntil(struct vars *v,
467494
return 1;
468495
}
469496

497+
/*
498+
* dfa_backref - find best match length for a known backref string
499+
*
500+
* When the backref's referent is already available, we can deliver an exact
501+
* answer with considerably less work than running the backref node's NFA.
502+
*
503+
* Return match endpoint for longest or shortest valid repeated match,
504+
* or NULL if there is no valid match.
505+
*
506+
* Should be in sync with cbrdissect(), although that has the different task
507+
* of checking a match to a predetermined section of the string.
508+
*/
509+
static chr *
510+
dfa_backref(struct vars *v,
511+
struct dfa *d,
512+
chr *start, /* where the match should start */
513+
chr *min, /* match must end at or after here */
514+
chr *max, /* match must end at or before here */
515+
bool shortest)
516+
{
517+
int n = d->backno;
518+
int backmin = d->backmin;
519+
int backmax = d->backmax;
520+
size_t numreps;
521+
size_t minreps;
522+
size_t maxreps;
523+
size_t brlen;
524+
chr *brstring;
525+
chr *p;
526+
527+
/* get the backreferenced string (caller should have checked this) */
528+
if (v->pmatch[n].rm_so == -1)
529+
return NULL;
530+
brstring = v->start + v->pmatch[n].rm_so;
531+
brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
532+
533+
/* special-case zero-length backreference to avoid divide by zero */
534+
if (brlen == 0)
535+
{
536+
/*
537+
* matches only a zero-length string, but any number of repetitions
538+
* can be considered to be present
539+
*/
540+
if (min == start && backmin <= backmax)
541+
return start;
542+
return NULL;
543+
}
544+
545+
/*
546+
* convert min and max into numbers of possible repetitions of the backref
547+
* string, rounding appropriately
548+
*/
549+
if (min <= start)
550+
minreps = 0;
551+
else
552+
minreps = (min - start - 1) / brlen + 1;
553+
maxreps = (max - start) / brlen;
554+
555+
/* apply bounds, then see if there is any allowed match length */
556+
if (minreps < backmin)
557+
minreps = backmin;
558+
if (backmax != DUPINF && maxreps > backmax)
559+
maxreps = backmax;
560+
if (maxreps < minreps)
561+
return NULL;
562+
563+
/* quick exit if zero-repetitions match is valid and preferred */
564+
if (shortest && minreps == 0)
565+
return start;
566+
567+
/* okay, compare the actual string contents */
568+
p = start;
569+
numreps = 0;
570+
while (numreps < maxreps)
571+
{
572+
if ((*v->g->compare) (brstring, p, brlen) != 0)
573+
break;
574+
p += brlen;
575+
numreps++;
576+
if (shortest && numreps >= minreps)
577+
break;
578+
}
579+
580+
if (numreps >= minreps)
581+
return p;
582+
return NULL;
583+
}
584+
470585
/*
471586
* lastcold - determine last point at which no progress had been made
472587
*/
@@ -563,6 +678,8 @@ newdfa(struct vars *v,
563678
d->lastpost = NULL;
564679
d->lastnopr = NULL;
565680
d->search = d->ssets;
681+
d->backno = -1; /* may be set by caller */
682+
d->backmin = d->backmax = 0;
566683

567684
/* initialization of sset fields is done as needed */
568685

src/backend/regex/regexec.c

+21-3
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@ struct dfa
7777
chr *lastpost; /* location of last cache-flushed success */
7878
chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
7979
struct sset *search; /* replacement-search-pointer memory */
80+
int backno; /* if DFA for a backref, subno it refers to */
81+
short backmin; /* min repetitions for backref */
82+
short backmax; /* max repetitions for backref */
8083
bool ismalloced; /* should this struct dfa be freed? */
8184
bool arraysmalloced; /* should its subsidiary arrays be freed? */
8285
};
@@ -154,6 +157,7 @@ static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
154157
static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
155158
static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
156159
static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
160+
static chr *dfa_backref(struct vars *, struct dfa *, chr *, chr *, chr *, bool);
157161
static chr *lastcold(struct vars *, struct dfa *);
158162
static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
159163
static void freedfa(struct dfa *);
@@ -342,13 +346,23 @@ static struct dfa *
342346
getsubdfa(struct vars *v,
343347
struct subre *t)
344348
{
345-
if (v->subdfas[t->id] == NULL)
349+
struct dfa *d = v->subdfas[t->id];
350+
351+
if (d == NULL)
346352
{
347-
v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
353+
d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
348354
if (ISERR())
349355
return NULL;
356+
/* set up additional info if this is a backref node */
357+
if (t->op == 'b')
358+
{
359+
d->backno = t->backno;
360+
d->backmin = t->min;
361+
d->backmax = t->max;
362+
}
363+
v->subdfas[t->id] = d;
350364
}
351-
return v->subdfas[t->id];
365+
return d;
352366
}
353367

354368
/*
@@ -369,6 +383,7 @@ getladfa(struct vars *v,
369383
v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
370384
if (ISERR())
371385
return NULL;
386+
/* a LACON can't contain a backref, so nothing else to do */
372387
}
373388
return v->ladfas[n];
374389
}
@@ -927,6 +942,9 @@ crevcondissect(struct vars *v,
927942

928943
/*
929944
* cbrdissect - dissect match for backref node
945+
*
946+
* The backref match might already have been verified by dfa_backref(),
947+
* but we don't know that for sure so must check it here.
930948
*/
931949
static int /* regexec return code */
932950
cbrdissect(struct vars *v,

0 commit comments

Comments
 (0)