Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 78a843f

Browse files
committed
Improve regex compiler's arc moving/copying logic.
The functions moveins(), copyins(), moveouts(), copyouts() are required to preserve the invariant that there are no duplicate arcs in the regex's NFA. Spencer's original implementation of them was O(N^2) since it checked separately for a match to each source arc. In commit 579840c I improved that by adding sort/merge logic to be used if more than a few arcs are to be moved/copied. However, I now realize that that missed a bet. At many call sites, the target state is newly made and cannot have any existing in-arcs (respectively out-arcs) that could be duplicates. So spending any cycles at all on checking for duplicates is wasted effort; in these cases we can just blindly move/copy all the source arcs. Add code paths to do that. It turns out that for copyins()/copyouts(), *all* the call sites have this property, making all the "improved" logic in them flat out unreachable. Perhaps we'll need the full capability again someday, so I just #ifdef'd those paths out rather than removing them entirely. In passing, add a few test cases to improve code coverage in this area as well as in regc_locale.c/regc_pg_locale.c. Discussion: https://postgr.es/m/810272.1629064063@sss.pgh.pa.us
1 parent f576de1 commit 78a843f

File tree

5 files changed

+338
-4
lines changed

5 files changed

+338
-4
lines changed

src/backend/regex/regc_nfa.c

+62-4
Original file line numberDiff line numberDiff line change
@@ -777,6 +777,10 @@ sortouts_cmp(const void *a, const void *b)
777777
* However, if we have a whole lot of arcs to deal with, retail duplicate
778778
* checks become too slow. In that case we proceed by sorting and merging
779779
* the arc lists, and then we can indeed just update the arcs in-place.
780+
*
781+
* On the other hand, it's also true that this is frequently called with
782+
* a brand-new newState that has no existing in-arcs. In that case,
783+
* de-duplication is unnecessary, so we can just blindly move all the arcs.
780784
*/
781785
static void
782786
moveins(struct nfa *nfa,
@@ -785,7 +789,18 @@ moveins(struct nfa *nfa,
785789
{
786790
assert(oldState != newState);
787791

788-
if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
792+
if (newState->nins == 0)
793+
{
794+
/* No need for de-duplication */
795+
struct arc *a;
796+
797+
while ((a = oldState->ins) != NULL)
798+
{
799+
createarc(nfa, a->type, a->co, a->from, newState);
800+
freearc(nfa, a);
801+
}
802+
}
803+
else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
789804
{
790805
/* With not too many arcs, just do them one at a time */
791806
struct arc *a;
@@ -869,15 +884,30 @@ moveins(struct nfa *nfa,
869884

870885
/*
871886
* copyins - copy in arcs of a state to another state
887+
*
888+
* The comments for moveins() apply here as well. However, in current
889+
* usage, this is *only* called with brand-new target states, so that
890+
* only the "no need for de-duplication" code path is ever reached.
891+
* We keep the rest #ifdef'd out in case it's needed in the future.
872892
*/
873893
static void
874894
copyins(struct nfa *nfa,
875895
struct state *oldState,
876896
struct state *newState)
877897
{
878898
assert(oldState != newState);
899+
assert(newState->nins == 0); /* see comment above */
900+
901+
if (newState->nins == 0)
902+
{
903+
/* No need for de-duplication */
904+
struct arc *a;
879905

880-
if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
906+
for (a = oldState->ins; a != NULL; a = a->inchain)
907+
createarc(nfa, a->type, a->co, a->from, newState);
908+
}
909+
#ifdef NOT_USED /* see comment above */
910+
else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
881911
{
882912
/* With not too many arcs, just do them one at a time */
883913
struct arc *a;
@@ -944,6 +974,7 @@ copyins(struct nfa *nfa,
944974
createarc(nfa, a->type, a->co, a->from, newState);
945975
}
946976
}
977+
#endif /* NOT_USED */
947978
}
948979

949980
/*
@@ -1058,7 +1089,18 @@ moveouts(struct nfa *nfa,
10581089
{
10591090
assert(oldState != newState);
10601091

1061-
if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1092+
if (newState->nouts == 0)
1093+
{
1094+
/* No need for de-duplication */
1095+
struct arc *a;
1096+
1097+
while ((a = oldState->outs) != NULL)
1098+
{
1099+
createarc(nfa, a->type, a->co, newState, a->to);
1100+
freearc(nfa, a);
1101+
}
1102+
}
1103+
else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
10621104
{
10631105
/* With not too many arcs, just do them one at a time */
10641106
struct arc *a;
@@ -1142,15 +1184,27 @@ moveouts(struct nfa *nfa,
11421184

11431185
/*
11441186
* copyouts - copy out arcs of a state to another state
1187+
*
1188+
* See comments for copyins()
11451189
*/
11461190
static void
11471191
copyouts(struct nfa *nfa,
11481192
struct state *oldState,
11491193
struct state *newState)
11501194
{
11511195
assert(oldState != newState);
1196+
assert(newState->nouts == 0); /* see comment above */
1197+
1198+
if (newState->nouts == 0)
1199+
{
1200+
/* No need for de-duplication */
1201+
struct arc *a;
11521202

1153-
if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1203+
for (a = oldState->outs; a != NULL; a = a->outchain)
1204+
createarc(nfa, a->type, a->co, newState, a->to);
1205+
}
1206+
#ifdef NOT_USED /* see comment above */
1207+
else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
11541208
{
11551209
/* With not too many arcs, just do them one at a time */
11561210
struct arc *a;
@@ -1217,6 +1271,7 @@ copyouts(struct nfa *nfa,
12171271
createarc(nfa, a->type, a->co, newState, a->to);
12181272
}
12191273
}
1274+
#endif /* NOT_USED */
12201275
}
12211276

12221277
/*
@@ -1975,6 +2030,7 @@ combine(struct nfa *nfa,
19752030
else if (a->co == RAINBOW)
19762031
{
19772032
/* con is incompatible if it's for a pseudocolor */
2033+
/* (this is hypothetical; we make no such constraints today) */
19782034
if (nfa->cm->cd[con->co].flags & PSEUDO)
19792035
return INCOMPATIBLE;
19802036
/* otherwise, constraint constrains arc to be only its color */
@@ -2001,6 +2057,7 @@ combine(struct nfa *nfa,
20012057
else if (a->co == RAINBOW)
20022058
{
20032059
/* con is incompatible if it's for a pseudocolor */
2060+
/* (this is hypothetical; we make no such constraints today) */
20042061
if (nfa->cm->cd[con->co].flags & PSEUDO)
20052062
return INCOMPATIBLE;
20062063
/* otherwise, constraint constrains arc to be only its color */
@@ -3562,6 +3619,7 @@ carc_cmp(const void *a, const void *b)
35623619
return -1;
35633620
if (aa->to > bb->to)
35643621
return +1;
3622+
/* This is unreached, since there should be no duplicate arcs now: */
35653623
return 0;
35663624
}
35673625

src/test/modules/test_regex/expected/test_regex.out

+120
Original file line numberDiff line numberDiff line change
@@ -937,6 +937,34 @@ select * from test_regex('a[[=x=]]', 'az', '+Lb');
937937
{0,REG_ULOCALE}
938938
(1 row)
939939

940+
-- expectMatch 9.9b &iL {a[[=Y=]]} ay ay
941+
select * from test_regex('a[[=Y=]]', 'ay', 'iL');
942+
test_regex
943+
-----------------
944+
{0,REG_ULOCALE}
945+
{ay}
946+
(2 rows)
947+
948+
select * from test_regex('a[[=Y=]]', 'ay', 'iLb');
949+
test_regex
950+
-----------------
951+
{0,REG_ULOCALE}
952+
{ay}
953+
(2 rows)
954+
955+
-- expectNomatch 9.9c &L {a[[=Y=]]} ay
956+
select * from test_regex('a[[=Y=]]', 'ay', 'L');
957+
test_regex
958+
-----------------
959+
{0,REG_ULOCALE}
960+
(1 row)
961+
962+
select * from test_regex('a[[=Y=]]', 'ay', 'Lb');
963+
test_regex
964+
-----------------
965+
{0,REG_ULOCALE}
966+
(1 row)
967+
940968
-- expectError 9.10 & {a[0-[=x=]]} ERANGE
941969
select * from test_regex('a[0-[=x=]]', '', '');
942970
ERROR: invalid regular expression: invalid character range
@@ -2932,6 +2960,34 @@ select * from test_regex('a[^b-d]', 'aC', 'iMb');
29322960
{0,REG_UUNPORT}
29332961
(1 row)
29342962

2963+
-- expectMatch 19.6 &iM {a[B-Z]} aC aC
2964+
select * from test_regex('a[B-Z]', 'aC', 'iM');
2965+
test_regex
2966+
-----------------
2967+
{0,REG_UUNPORT}
2968+
{aC}
2969+
(2 rows)
2970+
2971+
select * from test_regex('a[B-Z]', 'aC', 'iMb');
2972+
test_regex
2973+
-----------------
2974+
{0,REG_UUNPORT}
2975+
{aC}
2976+
(2 rows)
2977+
2978+
-- expectNomatch 19.7 &iM {a[^B-Z]} aC
2979+
select * from test_regex('a[^B-Z]', 'aC', 'iM');
2980+
test_regex
2981+
-----------------
2982+
{0,REG_UUNPORT}
2983+
(1 row)
2984+
2985+
select * from test_regex('a[^B-Z]', 'aC', 'iMb');
2986+
test_regex
2987+
-----------------
2988+
{0,REG_UUNPORT}
2989+
(1 row)
2990+
29352991
-- doing 20 "directors and embedded options"
29362992
-- expectError 20.1 & ***? BADPAT
29372993
select * from test_regex('***?', '', '');
@@ -3850,6 +3906,14 @@ select * from test_regex('^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', 'foo/bar/baz',
38503906
{foo/bar/baz,foo,bar,baz}
38513907
(2 rows)
38523908

3909+
-- expectMatch 24.14 PRT {^(.+?)(?:/(.+?))(?:/(.+?)\3)?$} {foo/bar/baz/quux} {foo/bar/baz/quux} {foo} {bar/baz/quux} {}
3910+
select * from test_regex('^(.+?)(?:/(.+?))(?:/(.+?)\3)?$', 'foo/bar/baz/quux', 'PRT');
3911+
test_regex
3912+
----------------------------------------------
3913+
{3,REG_UBACKREF,REG_UNONPOSIX,REG_USHORTEST}
3914+
{foo/bar/baz/quux,foo,bar/baz/quux,NULL}
3915+
(2 rows)
3916+
38533917
-- doing 25 "mixed quantifiers"
38543918
-- # this is very incomplete as yet
38553919
-- # should include |
@@ -4926,3 +4990,59 @@ select * from test_regex('(\Y)+', 'foo', 'LNP');
49264990
{"",""}
49274991
(2 rows)
49284992

4993+
-- and now, tests not from either Spencer or the Tcl project
4994+
-- These cases exercise additional code paths in pushfwd()/push()/combine()
4995+
select * from test_regex('a\Y(?=45)', 'a45', 'HLP');
4996+
test_regex
4997+
-----------------------------------------------
4998+
{0,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE}
4999+
{a}
5000+
(2 rows)
5001+
5002+
select * from test_regex('a(?=.)c', 'ac', 'HP');
5003+
test_regex
5004+
-----------------------------------
5005+
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
5006+
{ac}
5007+
(2 rows)
5008+
5009+
select * from test_regex('a(?=.).*(?=3)3*', 'azz33', 'HP');
5010+
test_regex
5011+
-----------------------------------
5012+
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
5013+
{azz33}
5014+
(2 rows)
5015+
5016+
select * from test_regex('a(?=\w)\w*(?=.).*', 'az3%', 'HLP');
5017+
test_regex
5018+
-----------------------------------------------
5019+
{0,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE}
5020+
{az3%}
5021+
(2 rows)
5022+
5023+
-- These exercise the bulk-arc-movement paths in moveins() and moveouts();
5024+
-- you may need to make them longer if you change BULK_ARC_OP_USE_SORT()
5025+
select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ(?:\w|a|b|c|d|e|f|0|1|2|3|4|5|6|Q)',
5026+
'ABCDEFGHIJKLMNOPQRSTUVWXYZ3', 'LP');
5027+
test_regex
5028+
-------------------------------
5029+
{0,REG_UNONPOSIX,REG_ULOCALE}
5030+
{ABCDEFGHIJKLMNOPQRSTUVWXYZ3}
5031+
(2 rows)
5032+
5033+
select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(\Y\Y)+',
5034+
'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789Z', 'LP');
5035+
test_regex
5036+
-------------------------------------------
5037+
{1,REG_UNONPOSIX,REG_ULOCALE}
5038+
{ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,""}
5039+
(2 rows)
5040+
5041+
select * from test_regex('((x|xabcdefghijklmnopqrstuvwxyz0123456789)x*|[^y]z)$',
5042+
'az', '');
5043+
test_regex
5044+
--------------
5045+
{2}
5046+
{az,az,NULL}
5047+
(2 rows)
5048+

0 commit comments

Comments
 (0)