Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 08c0d6a

Browse files
committed
Invent "rainbow" arcs within the regex engine.
Some regular expression constructs, most notably the "." match-anything metacharacter, produce a sheaf of parallel NFA arcs covering all possible colors (that is, character equivalence classes). We can make a noticeable improvement in the space and time needed to process large regexes by replacing such cases with a single arc bearing the special color code "RAINBOW". This requires only minor additional complication in places such as pull() and push(). Callers of pg_reg_getoutarcs() must now be prepared for the possibility of seeing a RAINBOW arc. For the one known user, contrib/pg_trgm, that's a net benefit since it cuts the number of arcs to be dealt with, and the handling isn't any different than for other colors that contain too many characters to be dealt with individually. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
1 parent 1766118 commit 08c0d6a

File tree

10 files changed

+177
-37
lines changed

10 files changed

+177
-37
lines changed

contrib/pg_trgm/trgm_regexp.c

+18-9
Original file line numberDiff line numberDiff line change
@@ -282,8 +282,8 @@ typedef struct
282282
typedef int TrgmColor;
283283

284284
/* We assume that colors returned by the regexp engine cannot be these: */
285-
#define COLOR_UNKNOWN (-1)
286-
#define COLOR_BLANK (-2)
285+
#define COLOR_UNKNOWN (-3)
286+
#define COLOR_BLANK (-4)
287287

288288
typedef struct
289289
{
@@ -780,7 +780,8 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
780780
palloc0(colorsCount * sizeof(TrgmColorInfo));
781781

782782
/*
783-
* Loop over colors, filling TrgmColorInfo about each.
783+
* Loop over colors, filling TrgmColorInfo about each. Note we include
784+
* WHITE (0) even though we know it'll be reported as non-expandable.
784785
*/
785786
for (i = 0; i < colorsCount; i++)
786787
{
@@ -1098,9 +1099,9 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
10981099
/* Add enter key to this state */
10991100
addKeyToQueue(trgmNFA, &destKey);
11001101
}
1101-
else
1102+
else if (arc->co >= 0)
11021103
{
1103-
/* Regular color */
1104+
/* Regular color (including WHITE) */
11041105
TrgmColorInfo *colorInfo = &trgmNFA->colorInfo[arc->co];
11051106

11061107
if (colorInfo->expandable)
@@ -1156,6 +1157,14 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
11561157
addKeyToQueue(trgmNFA, &destKey);
11571158
}
11581159
}
1160+
else
1161+
{
1162+
/* RAINBOW: treat as unexpandable color */
1163+
destKey.prefix.colors[0] = COLOR_UNKNOWN;
1164+
destKey.prefix.colors[1] = COLOR_UNKNOWN;
1165+
destKey.nstate = arc->to;
1166+
addKeyToQueue(trgmNFA, &destKey);
1167+
}
11591168
}
11601169

11611170
pfree(arcs);
@@ -1216,10 +1225,10 @@ addArcs(TrgmNFA *trgmNFA, TrgmState *state)
12161225
/*
12171226
* Ignore non-expandable colors; addKey already handled the case.
12181227
*
1219-
* We need no special check for begin/end pseudocolors here. We
1220-
* don't need to do any processing for them, and they will be
1221-
* marked non-expandable since the regex engine will have reported
1222-
* them that way.
1228+
* We need no special check for WHITE or begin/end pseudocolors
1229+
* here. We don't need to do any processing for them, and they
1230+
* will be marked non-expandable since the regex engine will have
1231+
* reported them that way.
12231232
*/
12241233
if (!colorInfo->expandable)
12251234
continue;

src/backend/regex/README

+28-8
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,18 @@ and the NFA has these arcs:
261261
states 4 -> 5 on color 2 ("x" only)
262262
which can be seen to be a correct representation of the regex.
263263

264+
There is one more complexity, which is how to handle ".", that is a
265+
match-anything atom. We used to do that by generating a "rainbow"
266+
of arcs of all live colors between the two NFA states before and after
267+
the dot. That's expensive in itself when there are lots of colors,
268+
and it also typically adds lots of follow-on arc-splitting work for the
269+
color splitting logic. Now we handle this case by generating a single arc
270+
labeled with the special color RAINBOW, meaning all colors. Such arcs
271+
never need to be split, so they help keep NFAs small in this common case.
272+
(Note: this optimization doesn't help in REG_NLSTOP mode, where "." is
273+
not supposed to match newline. In that case we still handle "." by
274+
generating an almost-rainbow of all colors except newline's color.)
275+
264276
Given this summary, we can see we need the following operations for
265277
colors:
266278

@@ -349,18 +361,20 @@ The possible arc types are:
349361

350362
PLAIN arcs, which specify matching of any character of a given "color"
351363
(see above). These are dumped as "[color_number]->to_state".
364+
In addition there can be "rainbow" PLAIN arcs, which are dumped as
365+
"[*]->to_state".
352366

353367
EMPTY arcs, which specify a no-op transition to another state. These
354368
are dumped as "->to_state".
355369

356370
AHEAD constraints, which represent a "next character must be of this
357371
color" constraint. AHEAD differs from a PLAIN arc in that the input
358372
character is not consumed when crossing the arc. These are dumped as
359-
">color_number>->to_state".
373+
">color_number>->to_state", or possibly ">*>->to_state".
360374

361375
BEHIND constraints, which represent a "previous character must be of
362376
this color" constraint, which likewise consumes no input. These are
363-
dumped as "<color_number<->to_state".
377+
dumped as "<color_number<->to_state", or possibly "<*<->to_state".
364378

365379
'^' arcs, which specify a beginning-of-input constraint. These are
366380
dumped as "^0->to_state" or "^1->to_state" for beginning-of-string and
@@ -396,14 +410,20 @@ substring, or an imaginary following EOS character if the substring is at
396410
the end of the input.
397411
3. If the NFA is (or can be) in the goal state at this point, it matches.
398412

413+
This definition is necessary to support regexes that begin or end with
414+
constraints such as \m and \M, which imply requirements on the adjacent
415+
character if any. The executor implements that by checking if the
416+
adjacent character (or BOS/BOL/EOS/EOL pseudo-character) is of the
417+
right color, and it does that in the same loop that checks characters
418+
within the match.
419+
399420
So one can mentally execute an untransformed NFA by taking ^ and $ as
400421
ordinary constraints that match at start and end of input; but plain
401422
arcs out of the start state should be taken as matches for the character
402423
before the target substring, and similarly, plain arcs leading to the
403424
post state are matches for the character after the target substring.
404-
This definition is necessary to support regexes that begin or end with
405-
constraints such as \m and \M, which imply requirements on the adjacent
406-
character if any. NFAs for simple unanchored patterns will usually have
407-
pre-state outarcs for all possible character colors as well as BOS and
408-
BOL, and post-state inarcs for all possible character colors as well as
409-
EOS and EOL, so that the executor's behavior will work.
425+
After the optimize() transformation, there are explicit arcs mentioning
426+
BOS/BOL/EOS/EOL adjacent to the pre-state and post-state. So a finished
427+
NFA for a pattern without anchors or adjacent-character constraints will
428+
have pre-state outarcs for RAINBOW (all possible character colors) as well
429+
as BOS and BOL, and likewise post-state inarcs for RAINBOW, EOS, and EOL.

src/backend/regex/regc_color.c

+21-1
Original file line numberDiff line numberDiff line change
@@ -977,6 +977,7 @@ colorchain(struct colormap *cm,
977977
{
978978
struct colordesc *cd = &cm->cd[a->co];
979979

980+
assert(a->co >= 0);
980981
if (cd->arcs != NULL)
981982
cd->arcs->colorchainRev = a;
982983
a->colorchain = cd->arcs;
@@ -994,6 +995,7 @@ uncolorchain(struct colormap *cm,
994995
struct colordesc *cd = &cm->cd[a->co];
995996
struct arc *aa = a->colorchainRev;
996997

998+
assert(a->co >= 0);
997999
if (aa == NULL)
9981000
{
9991001
assert(cd->arcs == a);
@@ -1012,6 +1014,9 @@ uncolorchain(struct colormap *cm,
10121014

10131015
/*
10141016
* rainbow - add arcs of all full colors (but one) between specified states
1017+
*
1018+
* If there isn't an exception color, we now generate just a single arc
1019+
* labeled RAINBOW, saving lots of arc-munging later on.
10151020
*/
10161021
static void
10171022
rainbow(struct nfa *nfa,
@@ -1025,6 +1030,13 @@ rainbow(struct nfa *nfa,
10251030
struct colordesc *end = CDEND(cm);
10261031
color co;
10271032

1033+
if (but == COLORLESS)
1034+
{
1035+
newarc(nfa, type, RAINBOW, from, to);
1036+
return;
1037+
}
1038+
1039+
/* Gotta do it the hard way. Skip subcolors, pseudocolors, and "but" */
10281040
for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
10291041
if (!UNUSEDCOLOR(cd) && cd->sub != co && co != but &&
10301042
!(cd->flags & PSEUDO))
@@ -1034,13 +1046,16 @@ rainbow(struct nfa *nfa,
10341046
/*
10351047
* colorcomplement - add arcs of complementary colors
10361048
*
1049+
* We add arcs of all colors that are not pseudocolors and do not match
1050+
* any of the "of" state's PLAIN outarcs.
1051+
*
10371052
* The calling sequence ought to be reconciled with cloneouts().
10381053
*/
10391054
static void
10401055
colorcomplement(struct nfa *nfa,
10411056
struct colormap *cm,
10421057
int type,
1043-
struct state *of, /* complements of this guy's PLAIN outarcs */
1058+
struct state *of,
10441059
struct state *from,
10451060
struct state *to)
10461061
{
@@ -1049,6 +1064,11 @@ colorcomplement(struct nfa *nfa,
10491064
color co;
10501065

10511066
assert(of != from);
1067+
1068+
/* A RAINBOW arc matches all colors, making the complement empty */
1069+
if (findarc(of, PLAIN, RAINBOW) != NULL)
1070+
return;
1071+
10521072
for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)
10531073
if (!UNUSEDCOLOR(cd) && !(cd->flags & PSEUDO))
10541074
if (findarc(of, PLAIN, co) == NULL)

src/backend/regex/regc_nfa.c

+74-8
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,11 @@ destroystate(struct nfa *nfa,
271271
*
272272
* This function checks to make sure that no duplicate arcs are created.
273273
* In general we never want duplicates.
274+
*
275+
* However: in principle, a RAINBOW arc is redundant with any plain arc
276+
* (unless that arc is for a pseudocolor). But we don't try to recognize
277+
* that redundancy, either here or in allied operations such as moveins().
278+
* The pseudocolor consideration makes that more costly than it seems worth.
274279
*/
275280
static void
276281
newarc(struct nfa *nfa,
@@ -1170,6 +1175,9 @@ copyouts(struct nfa *nfa,
11701175

11711176
/*
11721177
* cloneouts - copy out arcs of a state to another state pair, modifying type
1178+
*
1179+
* This is only used to convert PLAIN arcs to AHEAD/BEHIND arcs, which share
1180+
* the same interpretation of "co". It wouldn't be sensible with LACONs.
11731181
*/
11741182
static void
11751183
cloneouts(struct nfa *nfa,
@@ -1181,9 +1189,13 @@ cloneouts(struct nfa *nfa,
11811189
struct arc *a;
11821190

11831191
assert(old != from);
1192+
assert(type == AHEAD || type == BEHIND);
11841193

11851194
for (a = old->outs; a != NULL; a = a->outchain)
1195+
{
1196+
assert(a->type == PLAIN);
11861197
newarc(nfa, type, a->co, from, to);
1198+
}
11871199
}
11881200

11891201
/*
@@ -1597,7 +1609,7 @@ pull(struct nfa *nfa,
15971609
for (a = from->ins; a != NULL && !NISERR(); a = nexta)
15981610
{
15991611
nexta = a->inchain;
1600-
switch (combine(con, a))
1612+
switch (combine(nfa, con, a))
16011613
{
16021614
case INCOMPATIBLE: /* destroy the arc */
16031615
freearc(nfa, a);
@@ -1624,6 +1636,10 @@ pull(struct nfa *nfa,
16241636
cparc(nfa, a, s, to);
16251637
freearc(nfa, a);
16261638
break;
1639+
case REPLACEARC: /* replace arc's color */
1640+
newarc(nfa, a->type, con->co, a->from, to);
1641+
freearc(nfa, a);
1642+
break;
16271643
default:
16281644
assert(NOTREACHED);
16291645
break;
@@ -1764,7 +1780,7 @@ push(struct nfa *nfa,
17641780
for (a = to->outs; a != NULL && !NISERR(); a = nexta)
17651781
{
17661782
nexta = a->outchain;
1767-
switch (combine(con, a))
1783+
switch (combine(nfa, con, a))
17681784
{
17691785
case INCOMPATIBLE: /* destroy the arc */
17701786
freearc(nfa, a);
@@ -1791,6 +1807,10 @@ push(struct nfa *nfa,
17911807
cparc(nfa, a, from, s);
17921808
freearc(nfa, a);
17931809
break;
1810+
case REPLACEARC: /* replace arc's color */
1811+
newarc(nfa, a->type, con->co, from, a->to);
1812+
freearc(nfa, a);
1813+
break;
17941814
default:
17951815
assert(NOTREACHED);
17961816
break;
@@ -1810,9 +1830,11 @@ push(struct nfa *nfa,
18101830
* #def INCOMPATIBLE 1 // destroys arc
18111831
* #def SATISFIED 2 // constraint satisfied
18121832
* #def COMPATIBLE 3 // compatible but not satisfied yet
1833+
* #def REPLACEARC 4 // replace arc's color with constraint color
18131834
*/
18141835
static int
1815-
combine(struct arc *con,
1836+
combine(struct nfa *nfa,
1837+
struct arc *con,
18161838
struct arc *a)
18171839
{
18181840
#define CA(ct,at) (((ct)<<CHAR_BIT) | (at))
@@ -1827,14 +1849,46 @@ combine(struct arc *con,
18271849
case CA(BEHIND, PLAIN):
18281850
if (con->co == a->co)
18291851
return SATISFIED;
1852+
if (con->co == RAINBOW)
1853+
{
1854+
/* con is satisfied unless arc's color is a pseudocolor */
1855+
if (!(nfa->cm->cd[a->co].flags & PSEUDO))
1856+
return SATISFIED;
1857+
}
1858+
else if (a->co == RAINBOW)
1859+
{
1860+
/* con is incompatible if it's for a pseudocolor */
1861+
if (nfa->cm->cd[con->co].flags & PSEUDO)
1862+
return INCOMPATIBLE;
1863+
/* otherwise, constraint constrains arc to be only its color */
1864+
return REPLACEARC;
1865+
}
18301866
return INCOMPATIBLE;
18311867
break;
18321868
case CA('^', '^'): /* collision, similar constraints */
18331869
case CA('$', '$'):
1834-
case CA(AHEAD, AHEAD):
1870+
if (con->co == a->co) /* true duplication */
1871+
return SATISFIED;
1872+
return INCOMPATIBLE;
1873+
break;
1874+
case CA(AHEAD, AHEAD): /* collision, similar constraints */
18351875
case CA(BEHIND, BEHIND):
18361876
if (con->co == a->co) /* true duplication */
18371877
return SATISFIED;
1878+
if (con->co == RAINBOW)
1879+
{
1880+
/* con is satisfied unless arc's color is a pseudocolor */
1881+
if (!(nfa->cm->cd[a->co].flags & PSEUDO))
1882+
return SATISFIED;
1883+
}
1884+
else if (a->co == RAINBOW)
1885+
{
1886+
/* con is incompatible if it's for a pseudocolor */
1887+
if (nfa->cm->cd[con->co].flags & PSEUDO)
1888+
return INCOMPATIBLE;
1889+
/* otherwise, constraint constrains arc to be only its color */
1890+
return REPLACEARC;
1891+
}
18381892
return INCOMPATIBLE;
18391893
break;
18401894
case CA('^', BEHIND): /* collision, dissimilar constraints */
@@ -2895,6 +2949,7 @@ compact(struct nfa *nfa,
28952949
break;
28962950
case LACON:
28972951
assert(s->no != cnfa->pre);
2952+
assert(a->co >= 0);
28982953
ca->co = (color) (cnfa->ncolors + a->co);
28992954
ca->to = a->to->no;
29002955
ca++;
@@ -3068,13 +3123,22 @@ dumparc(struct arc *a,
30683123
switch (a->type)
30693124
{
30703125
case PLAIN:
3071-
fprintf(f, "[%ld]", (long) a->co);
3126+
if (a->co == RAINBOW)
3127+
fprintf(f, "[*]");
3128+
else
3129+
fprintf(f, "[%ld]", (long) a->co);
30723130
break;
30733131
case AHEAD:
3074-
fprintf(f, ">%ld>", (long) a->co);
3132+
if (a->co == RAINBOW)
3133+
fprintf(f, ">*>");
3134+
else
3135+
fprintf(f, ">%ld>", (long) a->co);
30753136
break;
30763137
case BEHIND:
3077-
fprintf(f, "<%ld<", (long) a->co);
3138+
if (a->co == RAINBOW)
3139+
fprintf(f, "<*<");
3140+
else
3141+
fprintf(f, "<%ld<", (long) a->co);
30783142
break;
30793143
case LACON:
30803144
fprintf(f, ":%ld:", (long) a->co);
@@ -3161,7 +3225,9 @@ dumpcstate(int st,
31613225
pos = 1;
31623226
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
31633227
{
3164-
if (ca->co < cnfa->ncolors)
3228+
if (ca->co == RAINBOW)
3229+
fprintf(f, "\t[*]->%d", ca->to);
3230+
else if (ca->co < cnfa->ncolors)
31653231
fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
31663232
else
31673233
fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);

0 commit comments

Comments
 (0)