Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit a70b18b

Browse files
committed
Fix regexport.c to behave sanely with lookaround constraints.
regexport.c thought it could just ignore LACON arcs, but the correct behavior is to treat them as satisfiable while consuming zero input (rather reminiscently of commit 9f1e642). Otherwise, the emitted simplified-NFA representation may contain no paths leading from initial to final state, which unsurprisingly confuses pg_trgm, as seen in bug #14623 from Jeff Janes. Since regexport's output representation has no concept of an arc that consumes zero input, recurse internally to find the next normal arc(s) after any LACON transitions. We'd be forced into changing that representation if a LACON could be the last arc reaching the final state, but fortunately the regex library never builds NFAs with such a configuration, so there always is a next normal arc. Back-patch to 9.3 where this logic was introduced. Discussion: https://postgr.es/m/20170413180503.25948.94871@wrigleys.postgresql.org
1 parent 5c63dab commit a70b18b

File tree

3 files changed

+82
-25
lines changed

3 files changed

+82
-25
lines changed

contrib/pg_trgm/expected/pg_trgm.out

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3676,6 +3676,12 @@ select * from test2 where t ~ ' z foo';
36763676
z foo bar
36773677
(1 row)
36783678

3679+
select * from test2 where t ~ 'qua(?!foo)';
3680+
t
3681+
-------
3682+
quark
3683+
(1 row)
3684+
36793685
drop index test2_idx_gin;
36803686
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
36813687
set enable_seqscan=off;
@@ -3856,6 +3862,12 @@ select * from test2 where t ~ ' z foo';
38563862
z foo bar
38573863
(1 row)
38583864

3865+
select * from test2 where t ~ 'qua(?!foo)';
3866+
t
3867+
-------
3868+
quark
3869+
(1 row)
3870+
38593871
-- Check similarity threshold (bug #14202)
38603872
CREATE TEMP TABLE restaurants (city text);
38613873
INSERT INTO restaurants SELECT 'Warsaw' FROM generate_series(1, 10000);

contrib/pg_trgm/sql/pg_trgm.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,9 @@ select * from test2 where t ~ 'z foo bar';
8181
select * from test2 where t ~ ' z foo bar';
8282
select * from test2 where t ~ ' z foo bar';
8383
select * from test2 where t ~ ' z foo';
84+
select * from test2 where t ~ 'qua(?!foo)';
8485
drop index test2_idx_gin;
86+
8587
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
8688
set enable_seqscan=off;
8789
explain (costs off)
@@ -116,6 +118,7 @@ select * from test2 where t ~ 'z foo bar';
116118
select * from test2 where t ~ ' z foo bar';
117119
select * from test2 where t ~ ' z foo bar';
118120
select * from test2 where t ~ ' z foo';
121+
select * from test2 where t ~ 'qua(?!foo)';
119122

120123
-- Check similarity threshold (bug #14202)
121124

src/backend/regex/regexport.c

Lines changed: 67 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
* In this implementation, the NFA defines a necessary but not sufficient
77
* condition for a string to match the regex: that is, there can be strings
88
* that match the NFA but don't match the full regex, but not vice versa.
9-
* Thus, for example, it is okay for the functions below to ignore lookaround
10-
* constraints, which merely constrain the string some more.
9+
* Thus, for example, it is okay for the functions below to treat lookaround
10+
* constraints as no-ops, since they merely constrain the string some more.
1111
*
1212
* Notice that these functions return info into caller-provided arrays
1313
* rather than doing their own malloc's. This simplifies the APIs by
@@ -28,6 +28,8 @@
2828

2929
#include "regex/regexport.h"
3030

31+
#include "miscadmin.h"
32+
3133
static void scancolormap(struct colormap * cm, int co,
3234
union tree * t, int level, chr partial,
3335
pg_wchar **chars, int *chars_len);
@@ -76,29 +78,78 @@ pg_reg_getfinalstate(const regex_t *regex)
7678
}
7779

7880
/*
79-
* Get number of outgoing NFA arcs of state number "st".
81+
* pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON
82+
* arcs from the caller, treating any LACON as being automatically satisfied.
83+
* Since the output representation does not support arcs that consume no
84+
* character when traversed, we have to recursively traverse LACON arcs here,
85+
* and report whatever normal arcs are reachable by traversing LACON arcs.
86+
* Note that this wouldn't work if it were possible to reach the final state
87+
* via LACON traversal, but the regex library never builds NFAs that have
88+
* LACON arcs leading directly to the final state. (This is because the
89+
* regex executor is designed to consume one character beyond the nominal
90+
* match end --- possibly an EOS indicator --- so there is always a set of
91+
* ordinary arcs leading to the final state.)
8092
*
81-
* Note: LACON arcs are ignored, both here and in pg_reg_getoutarcs().
93+
* traverse_lacons is a recursive subroutine used by both exported functions
94+
* to count and then emit the reachable regular arcs. *arcs_count is
95+
* incremented by the number of reachable arcs, and as many as will fit in
96+
* arcs_len (possibly 0) are emitted into arcs[].
97+
*/
98+
static void
99+
traverse_lacons(struct cnfa * cnfa, int st,
100+
int *arcs_count,
101+
regex_arc_t *arcs, int arcs_len)
102+
{
103+
struct carc *ca;
104+
105+
/*
106+
* Since this function recurses, it could theoretically be driven to stack
107+
* overflow. In practice, this is mostly useful to backstop against a
108+
* failure of the regex compiler to remove a loop of LACON arcs.
109+
*/
110+
check_stack_depth();
111+
112+
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
113+
{
114+
if (ca->co < cnfa->ncolors)
115+
{
116+
/* Ordinary arc, so count and possibly emit it */
117+
int ndx = (*arcs_count)++;
118+
119+
if (ndx < arcs_len)
120+
{
121+
arcs[ndx].co = ca->co;
122+
arcs[ndx].to = ca->to;
123+
}
124+
}
125+
else
126+
{
127+
/* LACON arc --- assume it's satisfied and recurse... */
128+
/* ... but first, assert it doesn't lead directly to post state */
129+
Assert(ca->to != cnfa->post);
130+
131+
traverse_lacons(cnfa, ca->to, arcs_count, arcs, arcs_len);
132+
}
133+
}
134+
}
135+
136+
/*
137+
* Get number of outgoing NFA arcs of state number "st".
82138
*/
83139
int
84140
pg_reg_getnumoutarcs(const regex_t *regex, int st)
85141
{
86142
struct cnfa *cnfa;
87-
struct carc *ca;
88-
int count;
143+
int arcs_count;
89144

90145
assert(regex != NULL && regex->re_magic == REMAGIC);
91146
cnfa = &((struct guts *) regex->re_guts)->search;
92147

93148
if (st < 0 || st >= cnfa->nstates)
94149
return 0;
95-
count = 0;
96-
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
97-
{
98-
if (ca->co < cnfa->ncolors)
99-
count++;
100-
}
101-
return count;
150+
arcs_count = 0;
151+
traverse_lacons(cnfa, st, &arcs_count, NULL, 0);
152+
return arcs_count;
102153
}
103154

104155
/*
@@ -111,24 +162,15 @@ pg_reg_getoutarcs(const regex_t *regex, int st,
111162
regex_arc_t *arcs, int arcs_len)
112163
{
113164
struct cnfa *cnfa;
114-
struct carc *ca;
165+
int arcs_count;
115166

116167
assert(regex != NULL && regex->re_magic == REMAGIC);
117168
cnfa = &((struct guts *) regex->re_guts)->search;
118169

119170
if (st < 0 || st >= cnfa->nstates || arcs_len <= 0)
120171
return;
121-
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
122-
{
123-
if (ca->co < cnfa->ncolors)
124-
{
125-
arcs->co = ca->co;
126-
arcs->to = ca->to;
127-
arcs++;
128-
if (--arcs_len == 0)
129-
break;
130-
}
131-
}
172+
arcs_count = 0;
173+
traverse_lacons(cnfa, st, &arcs_count, arcs, arcs_len);
132174
}
133175

134176
/*

0 commit comments

Comments
 (0)