Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 9e43e87

Browse files
committed
Fix contrib/pg_trgm's extraction of trigrams from regular expressions.
The logic for removing excess trigrams from the result was faulty. It intends to avoid merging the initial and final states of the NFA, which is necessary, but in testing whether removal of a specific trigram would cause that, it failed to consider the combined effects of all the state merges that that trigram's removal would cause. This could result in a broken final graph that would never match anything, leading to GIN or GiST indexscans not finding anything. To fix, add a "tentParent" field that is used only within this loop, and set it to show state merges that we are tentatively going to do. While examining a particular arc, we must chase up through tentParent links as well as regular parent links (the former can only appear atop the latter), and we must account for state init/fin flag merges that haven't actually been done yet. To simplify the latter, combine the separate init and fin bool fields into a bitmap flags field. I also chose to get rid of the "children" state list, which seems entirely inessential. Per bug #14563 from Alexey Isayko, which the added test cases are based on. Back-patch to 9.3 where this code was added. Report: https://postgr.es/m/20170222111446.1256.67547@wrigleys.postgresql.org Discussion: https://postgr.es/m/8816.1487787594@sss.pgh.pa.us
1 parent 502a383 commit 9e43e87

File tree

3 files changed

+91
-37
lines changed

3 files changed

+91
-37
lines changed

contrib/pg_trgm/expected/pg_trgm.out

+18
Original file line numberDiff line numberDiff line change
@@ -1202,6 +1202,12 @@ select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988
12021202
0.5 | qwertyu0987
12031203
(2 rows)
12041204

1205+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
1206+
count
1207+
-------
1208+
1000
1209+
(1 row)
1210+
12051211
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
12061212
set enable_seqscan=off;
12071213
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
@@ -2346,6 +2352,12 @@ select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988
23462352
0.5 | qwertyu0987
23472353
(2 rows)
23482354

2355+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
2356+
count
2357+
-------
2358+
1000
2359+
(1 row)
2360+
23492361
drop index trgm_idx;
23502362
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
23512363
set enable_seqscan=off;
@@ -3475,6 +3487,12 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
34753487
qwertyu0988 | 0.333333
34763488
(1 row)
34773489

3490+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
3491+
count
3492+
-------
3493+
1000
3494+
(1 row)
3495+
34783496
create table test2(t text COLLATE "C");
34793497
insert into test2 values ('abcdef');
34803498
insert into test2 values ('quark');

contrib/pg_trgm/sql/pg_trgm.sql

+3
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu098
2626
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
2727
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
2828
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
29+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
2930

3031
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
3132
set enable_seqscan=off;
@@ -36,6 +37,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
3637
explain (costs off)
3738
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
3839
select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988' limit 2;
40+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
3941

4042
drop index trgm_idx;
4143
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
@@ -44,6 +46,7 @@ set enable_seqscan=off;
4446
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
4547
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
4648
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
49+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
4750

4851
create table test2(t text COLLATE "C");
4952
insert into test2 values ('abcdef');

contrib/pg_trgm/trgm_regexp.c

+70-37
Original file line numberDiff line numberDiff line change
@@ -318,21 +318,22 @@ typedef struct
318318
* arcs - outgoing arcs of this state (List of TrgmArc)
319319
* enterKeys - enter keys reachable from this state without reading any
320320
* predictable trigram (List of TrgmStateKey)
321-
* fin - flag indicating this state is final
322-
* init - flag indicating this state is initial
321+
* flags - flag bits
323322
* parent - parent state, if this state has been merged into another
324-
* children - child states (states that have been merged into this one)
323+
* tentParent - planned parent state, if considering a merge
325324
* number - number of this state (used at the packaging stage)
326325
*/
326+
#define TSTATE_INIT 0x01 /* flag indicating this state is initial */
327+
#define TSTATE_FIN 0x02 /* flag indicating this state is final */
328+
327329
typedef struct TrgmState
328330
{
329331
TrgmStateKey stateKey; /* hashtable key: must be first field */
330332
List *arcs;
331333
List *enterKeys;
332-
bool fin;
333-
bool init;
334+
int flags;
334335
struct TrgmState *parent;
335-
List *children;
336+
struct TrgmState *tentParent;
336337
int number;
337338
} TrgmState;
338339

@@ -599,7 +600,7 @@ createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
599600
* get from the initial state to the final state without reading any
600601
* predictable trigram.
601602
*/
602-
if (trgmNFA.initState->fin)
603+
if (trgmNFA.initState->flags & TSTATE_FIN)
603604
return NULL;
604605

605606
/*
@@ -925,7 +926,7 @@ transformGraph(TrgmNFA *trgmNFA)
925926
initkey.nstate = pg_reg_getinitialstate(trgmNFA->regex);
926927

927928
initstate = getState(trgmNFA, &initkey);
928-
initstate->init = true;
929+
initstate->flags |= TSTATE_INIT;
929930
trgmNFA->initState = initstate;
930931

931932
/*
@@ -943,7 +944,7 @@ transformGraph(TrgmNFA *trgmNFA)
943944
* actual processing.
944945
*/
945946
if (trgmNFA->overflowed)
946-
state->fin = true;
947+
state->flags |= TSTATE_FIN;
947948
else
948949
processState(trgmNFA, state);
949950

@@ -968,7 +969,7 @@ processState(TrgmNFA *trgmNFA, TrgmState *state)
968969
* queue is empty. But we can quit if the state gets marked final.
969970
*/
970971
addKey(trgmNFA, state, &state->stateKey);
971-
while (trgmNFA->keysQueue != NIL && !state->fin)
972+
while (trgmNFA->keysQueue != NIL && !(state->flags & TSTATE_FIN))
972973
{
973974
TrgmStateKey *key = (TrgmStateKey *) linitial(trgmNFA->keysQueue);
974975

@@ -980,7 +981,7 @@ processState(TrgmNFA *trgmNFA, TrgmState *state)
980981
* Add outgoing arcs only if state isn't final (we have no interest in
981982
* outgoing arcs if we already match)
982983
*/
983-
if (!state->fin)
984+
if (!(state->flags & TSTATE_FIN))
984985
addArcs(trgmNFA, state);
985986
}
986987

@@ -989,7 +990,7 @@ processState(TrgmNFA *trgmNFA, TrgmState *state)
989990
* whether this should result in any further enter keys being added.
990991
* If so, add those keys to keysQueue so that processState will handle them.
991992
*
992-
* If the enter key is for the NFA's final state, set state->fin = TRUE.
993+
* If the enter key is for the NFA's final state, mark state as TSTATE_FIN.
993994
* This situation means that we can reach the final state from this expanded
994995
* state without reading any predictable trigram, so we must consider this
995996
* state as an accepting one.
@@ -1059,7 +1060,7 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
10591060
/* If state is now known final, mark it and we're done */
10601061
if (key->nstate == pg_reg_getfinalstate(trgmNFA->regex))
10611062
{
1062-
state->fin = true;
1063+
state->flags |= TSTATE_FIN;
10631064
return;
10641065
}
10651066

@@ -1385,10 +1386,9 @@ getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
13851386
/* New state: initialize and queue it */
13861387
state->arcs = NIL;
13871388
state->enterKeys = NIL;
1388-
state->init = false;
1389-
state->fin = false;
1389+
state->flags = 0;
13901390
state->parent = NULL;
1391-
state->children = NIL;
1391+
state->tentParent = NULL;
13921392
state->number = -1;
13931393

13941394
trgmNFA->queue = lappend(trgmNFA->queue, state);
@@ -1582,20 +1582,60 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
15821582
TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
15831583
TrgmState *source = arcInfo->source,
15841584
*target = arcInfo->target;
1585+
int source_flags,
1586+
target_flags;
15851587

15861588
/* examine parent states, if any merging has already happened */
15871589
while (source->parent)
15881590
source = source->parent;
15891591
while (target->parent)
15901592
target = target->parent;
15911593

1592-
if ((source->init || target->init) &&
1593-
(source->fin || target->fin))
1594+
/* we must also consider merges we are planning right now */
1595+
source_flags = source->flags;
1596+
while (source->tentParent)
1597+
{
1598+
source = source->tentParent;
1599+
source_flags |= source->flags;
1600+
}
1601+
target_flags = target->flags;
1602+
while (target->tentParent)
1603+
{
1604+
target = target->tentParent;
1605+
target_flags |= target->flags;
1606+
}
1607+
1608+
/* would fully-merged state have both INIT and FIN set? */
1609+
if (((source_flags | target_flags) & (TSTATE_INIT | TSTATE_FIN)) ==
1610+
(TSTATE_INIT | TSTATE_FIN))
15941611
{
15951612
canRemove = false;
15961613
break;
15971614
}
1615+
1616+
/* ok so far, so remember planned merge */
1617+
if (source != target)
1618+
target->tentParent = source;
15981619
}
1620+
1621+
/* We must clear all the tentParent fields before continuing */
1622+
foreach(cell, trgmInfo->arcs)
1623+
{
1624+
TrgmArcInfo *arcInfo = (TrgmArcInfo *) lfirst(cell);
1625+
TrgmState *target = arcInfo->target;
1626+
TrgmState *ttarget;
1627+
1628+
while (target->parent)
1629+
target = target->parent;
1630+
1631+
while ((ttarget = target->tentParent) != NULL)
1632+
{
1633+
target->tentParent = NULL;
1634+
target = ttarget;
1635+
}
1636+
}
1637+
1638+
/* Now, move on if we can't drop this trigram */
15991639
if (!canRemove)
16001640
continue;
16011641

@@ -1611,7 +1651,12 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
16111651
while (target->parent)
16121652
target = target->parent;
16131653
if (source != target)
1654+
{
16141655
mergeStates(source, target);
1656+
/* Assert we didn't merge initial and final states */
1657+
Assert((source->flags & (TSTATE_INIT | TSTATE_FIN)) !=
1658+
(TSTATE_INIT | TSTATE_FIN));
1659+
}
16151660
}
16161661

16171662
/* Mark trigram unexpanded, and update totals */
@@ -1754,27 +1799,15 @@ fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
17541799
static void
17551800
mergeStates(TrgmState *state1, TrgmState *state2)
17561801
{
1757-
ListCell *cell;
1758-
17591802
Assert(state1 != state2);
17601803
Assert(!state1->parent);
17611804
Assert(!state2->parent);
17621805

1763-
/* state1 absorbs state2's init/fin flags */
1764-
state1->init |= state2->init;
1765-
state1->fin |= state2->fin;
1806+
/* state1 absorbs state2's flags */
1807+
state1->flags |= state2->flags;
17661808

1767-
/* state2, and all its children, become children of state1 */
1768-
foreach(cell, state2->children)
1769-
{
1770-
TrgmState *state = (TrgmState *) lfirst(cell);
1771-
1772-
state->parent = state1;
1773-
}
1809+
/* state2, and indirectly all its children, become children of state1 */
17741810
state2->parent = state1;
1775-
state1->children = list_concat(state1->children, state2->children);
1776-
state1->children = lappend(state1->children, state2);
1777-
state2->children = NIL;
17781811
}
17791812

17801813
/*
@@ -1843,9 +1876,9 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
18431876

18441877
if (state->number < 0)
18451878
{
1846-
if (state->init)
1879+
if (state->flags & TSTATE_INIT)
18471880
state->number = 0;
1848-
else if (state->fin)
1881+
else if (state->flags & TSTATE_FIN)
18491882
state->number = 1;
18501883
else
18511884
{
@@ -2109,9 +2142,9 @@ printTrgmNFA(TrgmNFA *trgmNFA)
21092142
ListCell *cell;
21102143

21112144
appendStringInfo(&buf, "s%p", (void *) state);
2112-
if (state->fin)
2145+
if (state->flags & TSTATE_FIN)
21132146
appendStringInfoString(&buf, " [shape = doublecircle]");
2114-
if (state->init)
2147+
if (state->flags & TSTATE_INIT)
21152148
initstate = state;
21162149
appendStringInfo(&buf, " [label = \"%d\"]", state->stateKey.nstate);
21172150
appendStringInfoString(&buf, ";\n");

0 commit comments

Comments
 (0)