Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit c6aae30

Browse files
committed
Simplify and document regex library's compact-NFA representation.
The previous coding abused the first element of a cNFA state's arcs list to hold a per-state flag bit, which was confusing, undocumented, and not even particularly efficient. Get rid of that in favor of a separate "stflags" vector. Since there's only one bit in use, I chose to allocate a char per state; we could possibly replace this with a bitmap at some point, but that would make accesses a little slower. It's already about 8X smaller than before, so let's not get overly tense. Also document the representation better than it was before, which is to say not at all. This patch is a byproduct of investigations towards extracting a "fixed prefix" string from the compact-NFA representation of regex patterns. Might need to back-patch it if we decide to back-patch that fix, but for now it's just code cleanup so I'll just put it in HEAD.
1 parent a184e4d commit c6aae30

File tree

4 files changed

+44
-32
lines changed

4 files changed

+44
-32
lines changed

src/backend/regex/regc_nfa.c

+17-17
Original file line numberDiff line numberDiff line change
@@ -1330,14 +1330,16 @@ compact(struct nfa * nfa,
13301330
for (s = nfa->states; s != NULL; s = s->next)
13311331
{
13321332
nstates++;
1333-
narcs += 1 + s->nouts + 1;
1334-
/* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
1333+
narcs += s->nouts + 1; /* need one extra for endmarker */
13351334
}
13361335

1336+
cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
13371337
cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
13381338
cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
1339-
if (cnfa->states == NULL || cnfa->arcs == NULL)
1339+
if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
13401340
{
1341+
if (cnfa->stflags != NULL)
1342+
FREE(cnfa->stflags);
13411343
if (cnfa->states != NULL)
13421344
FREE(cnfa->states);
13431345
if (cnfa->arcs != NULL)
@@ -1359,9 +1361,8 @@ compact(struct nfa * nfa,
13591361
for (s = nfa->states; s != NULL; s = s->next)
13601362
{
13611363
assert((size_t) s->no < nstates);
1364+
cnfa->stflags[s->no] = 0;
13621365
cnfa->states[s->no] = ca;
1363-
ca->co = 0; /* clear and skip flags "arc" */
1364-
ca++;
13651366
first = ca;
13661367
for (a = s->outs; a != NULL; a = a->outchain)
13671368
switch (a->type)
@@ -1392,8 +1393,8 @@ compact(struct nfa * nfa,
13921393

13931394
/* mark no-progress states */
13941395
for (a = nfa->pre->outs; a != NULL; a = a->outchain)
1395-
cnfa->states[a->to->no]->co = 1;
1396-
cnfa->states[nfa->pre->no]->co = 1;
1396+
cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
1397+
cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
13971398
}
13981399

13991400
/*
@@ -1433,6 +1434,7 @@ freecnfa(struct cnfa * cnfa)
14331434
{
14341435
assert(cnfa->nstates != 0); /* not empty already */
14351436
cnfa->nstates = 0;
1437+
FREE(cnfa->stflags);
14361438
FREE(cnfa->states);
14371439
FREE(cnfa->arcs);
14381440
}
@@ -1617,7 +1619,7 @@ dumpcnfa(struct cnfa * cnfa,
16171619
fprintf(f, ", haslacons");
16181620
fprintf(f, "\n");
16191621
for (st = 0; st < cnfa->nstates; st++)
1620-
dumpcstate(st, cnfa->states[st], cnfa, f);
1622+
dumpcstate(st, cnfa, f);
16211623
fflush(f);
16221624
}
16231625
#endif
@@ -1629,22 +1631,20 @@ dumpcnfa(struct cnfa * cnfa,
16291631
*/
16301632
static void
16311633
dumpcstate(int st,
1632-
struct carc * ca,
16331634
struct cnfa * cnfa,
16341635
FILE *f)
16351636
{
1636-
int i;
1637+
struct carc * ca;
16371638
int pos;
16381639

1639-
fprintf(f, "%d%s", st, (ca[0].co) ? ":" : ".");
1640+
fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
16401641
pos = 1;
1641-
for (i = 1; ca[i].co != COLORLESS; i++)
1642+
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
16421643
{
1643-
if (ca[i].co < cnfa->ncolors)
1644-
fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to);
1644+
if (ca->co < cnfa->ncolors)
1645+
fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
16451646
else
1646-
fprintf(f, "\t:%ld:->%d", (long) ca[i].co - cnfa->ncolors,
1647-
ca[i].to);
1647+
fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
16481648
if (pos == 5)
16491649
{
16501650
fprintf(f, "\n");
@@ -1653,7 +1653,7 @@ dumpcstate(int st,
16531653
else
16541654
pos++;
16551655
}
1656-
if (i == 1 || pos != 1)
1656+
if (ca == cnfa->states[st] || pos != 1)
16571657
fprintf(f, "\n");
16581658
fflush(f);
16591659
}

src/backend/regex/regcomp.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ static void dumparcs(struct state *, FILE *);
162162
static int dumprarcs(struct arc *, struct state *, FILE *, int);
163163
static void dumparc(struct arc *, struct state *, FILE *);
164164
static void dumpcnfa(struct cnfa *, FILE *);
165-
static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
165+
static void dumpcstate(int, struct cnfa *, FILE *);
166166
#endif
167167
/* === regc_cvec.c === */
168168
static struct cvec *newcvec(int, int);

src/backend/regex/rege_dfa.c

+5-6
Original file line numberDiff line numberDiff line change
@@ -457,14 +457,14 @@ miss(struct vars * v, /* used only for debug flags */
457457
gotstate = 0;
458458
for (i = 0; i < d->nstates; i++)
459459
if (ISBSET(css->states, i))
460-
for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++)
460+
for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
461461
if (ca->co == co)
462462
{
463463
BSET(d->work, ca->to);
464464
gotstate = 1;
465465
if (ca->to == cnfa->post)
466466
ispost = 1;
467-
if (!cnfa->states[ca->to]->co)
467+
if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
468468
noprogress = 0;
469469
FDEBUG(("%d -> %d\n", i, ca->to));
470470
}
@@ -475,10 +475,9 @@ miss(struct vars * v, /* used only for debug flags */
475475
dolacons = 0;
476476
for (i = 0; i < d->nstates; i++)
477477
if (ISBSET(d->work, i))
478-
for (ca = cnfa->states[i] + 1; ca->co != COLORLESS;
479-
ca++)
478+
for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
480479
{
481-
if (ca->co <= cnfa->ncolors)
480+
if (ca->co < cnfa->ncolors)
482481
continue; /* NOTE CONTINUE */
483482
sawlacons = 1;
484483
if (ISBSET(d->work, ca->to))
@@ -489,7 +488,7 @@ miss(struct vars * v, /* used only for debug flags */
489488
dolacons = 1;
490489
if (ca->to == cnfa->post)
491490
ispost = 1;
492-
if (!cnfa->states[ca->to]->co)
491+
if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
493492
noprogress = 0;
494493
FDEBUG(("%d :> %d\n", i, ca->to));
495494
}

src/include/regex/regguts.h

+21-8
Original file line numberDiff line numberDiff line change
@@ -279,15 +279,14 @@ struct state;
279279

280280
struct arc
281281
{
282-
int type;
283-
#define ARCFREE '\0'
282+
int type; /* 0 if free, else an NFA arc type code */
284283
color co;
285284
struct state *from; /* where it's from (and contained within) */
286285
struct state *to; /* where it's to */
287-
struct arc *outchain; /* *from's outs chain or free chain */
286+
struct arc *outchain; /* link in *from's outs chain or free chain */
288287
#define freechain outchain
289-
struct arc *inchain; /* *to's ins chain */
290-
struct arc *colorchain; /* color's arc chain */
288+
struct arc *inchain; /* link in *to's ins chain */
289+
struct arc *colorchain; /* link in color's arc chain */
291290
struct arc *colorchainRev; /* back-link in color's arc chain */
292291
};
293292

@@ -339,24 +338,38 @@ struct nfa
339338

340339
/*
341340
* definitions for compacted NFA
341+
*
342+
* The main space savings in a compacted NFA is from making the arcs as small
343+
* as possible. We store only the transition color and next-state number for
344+
* each arc. The list of out arcs for each state is an array beginning at
345+
* cnfa.states[statenumber], and terminated by a dummy carc struct with
346+
* co == COLORLESS.
347+
*
348+
* The non-dummy carc structs are of two types: plain arcs and LACON arcs.
349+
* Plain arcs just store the transition color number as "co". LACON arcs
350+
* store the lookahead constraint number plus cnfa.ncolors as "co". LACON
351+
* arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
342352
*/
343353
struct carc
344354
{
345355
color co; /* COLORLESS is list terminator */
346-
int to; /* state number */
356+
int to; /* next-state number */
347357
};
348358

349359
struct cnfa
350360
{
351361
int nstates; /* number of states */
352-
int ncolors; /* number of colors */
362+
int ncolors; /* number of colors (max color in use + 1) */
353363
int flags;
354-
#define HASLACONS 01 /* uses lookahead constraints */
364+
#define HASLACONS 01 /* uses lookahead constraints */
355365
int pre; /* setup state number */
356366
int post; /* teardown state number */
357367
color bos[2]; /* colors, if any, assigned to BOS and BOL */
358368
color eos[2]; /* colors, if any, assigned to EOS and EOL */
369+
char *stflags; /* vector of per-state flags bytes */
370+
#define CNFA_NOPROGRESS 01 /* flag bit for a no-progress state */
359371
struct carc **states; /* vector of pointers to outarc lists */
372+
/* states[n] are pointers into a single malloc'd array of arcs */
360373
struct carc *arcs; /* the area for the lists */
361374
};
362375

0 commit comments

Comments
 (0)