diff options
author | Tom Lane | 2021-08-17 16:00:02 +0000 |
---|---|---|
committer | Tom Lane | 2021-08-17 16:00:02 +0000 |
commit | 78a843f119ca7d4a6eb173a7ee3bed45889d48d8 (patch) | |
tree | f87ef8ebaf0daa5064dd19df9e942bd22d8872b3 /src/backend/regex | |
parent | f576de1db1eeca63180b1ffa4b42b1e360f88577 (diff) |
Improve regex compiler's arc moving/copying logic.
The functions moveins(), copyins(), moveouts(), copyouts() are
required to preserve the invariant that there are no duplicate arcs in
the regex's NFA. Spencer's original implementation of them was O(N^2)
since it checked separately for a match to each source arc. In commit
579840ca0 I improved that by adding sort/merge logic to be used if
more than a few arcs are to be moved/copied. However, I now realize
that that missed a bet. At many call sites, the target state is newly
made and cannot have any existing in-arcs (respectively out-arcs)
that could be duplicates. So spending any cycles at all on checking
for duplicates is wasted effort; in these cases we can just blindly
move/copy all the source arcs. Add code paths to do that.
It turns out that for copyins()/copyouts(), *all* the call sites have
this property, making all the "improved" logic in them flat out
unreachable. Perhaps we'll need the full capability again someday,
so I just #ifdef'd those paths out rather than removing them entirely.
In passing, add a few test cases to improve code coverage in this
area as well as in regc_locale.c/regc_pg_locale.c.
Discussion: https://postgr.es/m/810272.1629064063@sss.pgh.pa.us
Diffstat (limited to 'src/backend/regex')
-rw-r--r-- | src/backend/regex/regc_nfa.c | 66 |
1 files changed, 62 insertions, 4 deletions
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index 6d77c59e121..059cf64df07 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -777,6 +777,10 @@ sortouts_cmp(const void *a, const void *b) * However, if we have a whole lot of arcs to deal with, retail duplicate * checks become too slow. In that case we proceed by sorting and merging * the arc lists, and then we can indeed just update the arcs in-place. + * + * On the other hand, it's also true that this is frequently called with + * a brand-new newState that has no existing in-arcs. In that case, + * de-duplication is unnecessary, so we can just blindly move all the arcs. */ static void moveins(struct nfa *nfa, @@ -785,7 +789,18 @@ moveins(struct nfa *nfa, { assert(oldState != newState); - if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) + if (newState->nins == 0) + { + /* No need for de-duplication */ + struct arc *a; + + while ((a = oldState->ins) != NULL) + { + createarc(nfa, a->type, a->co, a->from, newState); + freearc(nfa, a); + } + } + else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) { /* With not too many arcs, just do them one at a time */ struct arc *a; @@ -869,6 +884,11 @@ moveins(struct nfa *nfa, /* * copyins - copy in arcs of a state to another state + * + * The comments for moveins() apply here as well. However, in current + * usage, this is *only* called with brand-new target states, so that + * only the "no need for de-duplication" code path is ever reached. + * We keep the rest #ifdef'd out in case it's needed in the future. */ static void copyins(struct nfa *nfa, @@ -876,8 +896,18 @@ copyins(struct nfa *nfa, struct state *newState) { assert(oldState != newState); + assert(newState->nins == 0); /* see comment above */ + + if (newState->nins == 0) + { + /* No need for de-duplication */ + struct arc *a; - if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) + for (a = oldState->ins; a != NULL; a = a->inchain) + createarc(nfa, a->type, a->co, a->from, newState); + } +#ifdef NOT_USED /* see comment above */ + else if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins)) { /* With not too many arcs, just do them one at a time */ struct arc *a; @@ -944,6 +974,7 @@ copyins(struct nfa *nfa, createarc(nfa, a->type, a->co, a->from, newState); } } +#endif /* NOT_USED */ } /* @@ -1058,7 +1089,18 @@ moveouts(struct nfa *nfa, { assert(oldState != newState); - if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) + if (newState->nouts == 0) + { + /* No need for de-duplication */ + struct arc *a; + + while ((a = oldState->outs) != NULL) + { + createarc(nfa, a->type, a->co, newState, a->to); + freearc(nfa, a); + } + } + else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) { /* With not too many arcs, just do them one at a time */ struct arc *a; @@ -1142,6 +1184,8 @@ moveouts(struct nfa *nfa, /* * copyouts - copy out arcs of a state to another state + * + * See comments for copyins() */ static void copyouts(struct nfa *nfa, @@ -1149,8 +1193,18 @@ copyouts(struct nfa *nfa, struct state *newState) { assert(oldState != newState); + assert(newState->nouts == 0); /* see comment above */ + + if (newState->nouts == 0) + { + /* No need for de-duplication */ + struct arc *a; - if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) + for (a = oldState->outs; a != NULL; a = a->outchain) + createarc(nfa, a->type, a->co, newState, a->to); + } +#ifdef NOT_USED /* see comment above */ + else if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts)) { /* With not too many arcs, just do them one at a time */ struct arc *a; @@ -1217,6 +1271,7 @@ copyouts(struct nfa *nfa, createarc(nfa, a->type, a->co, newState, a->to); } } +#endif /* NOT_USED */ } /* @@ -1975,6 +2030,7 @@ combine(struct nfa *nfa, else if (a->co == RAINBOW) { /* con is incompatible if it's for a pseudocolor */ + /* (this is hypothetical; we make no such constraints today) */ if (nfa->cm->cd[con->co].flags & PSEUDO) return INCOMPATIBLE; /* otherwise, constraint constrains arc to be only its color */ @@ -2001,6 +2057,7 @@ combine(struct nfa *nfa, else if (a->co == RAINBOW) { /* con is incompatible if it's for a pseudocolor */ + /* (this is hypothetical; we make no such constraints today) */ if (nfa->cm->cd[con->co].flags & PSEUDO) return INCOMPATIBLE; /* otherwise, constraint constrains arc to be only its color */ @@ -3562,6 +3619,7 @@ carc_cmp(const void *a, const void *b) return -1; if (aa->to > bb->to) return +1; + /* This is unreached, since there should be no duplicate arcs now: */ return 0; } |