@@ -593,40 +593,6 @@ hasnonemptyout(struct state * s)
593
593
return 0 ;
594
594
}
595
595
596
- /*
597
- * nonemptyouts - count non-EMPTY out arcs of a state
598
- */
599
- static int
600
- nonemptyouts (struct state * s )
601
- {
602
- int n = 0 ;
603
- struct arc * a ;
604
-
605
- for (a = s -> outs ; a != NULL ; a = a -> outchain )
606
- {
607
- if (a -> type != EMPTY )
608
- n ++ ;
609
- }
610
- return n ;
611
- }
612
-
613
- /*
614
- * nonemptyins - count non-EMPTY in arcs of a state
615
- */
616
- static int
617
- nonemptyins (struct state * s )
618
- {
619
- int n = 0 ;
620
- struct arc * a ;
621
-
622
- for (a = s -> ins ; a != NULL ; a = a -> inchain )
623
- {
624
- if (a -> type != EMPTY )
625
- n ++ ;
626
- }
627
- return n ;
628
- }
629
-
630
596
/*
631
597
* findarc - find arc, if any, from given source with given type and color
632
598
* If there is more than one such arc, which one is found is arbitrary.
@@ -1856,6 +1822,12 @@ fixempties(struct nfa * nfa,
1856
1822
struct state * nexts ;
1857
1823
struct arc * a ;
1858
1824
struct arc * nexta ;
1825
+ int totalinarcs ;
1826
+ struct arc * * inarcsorig ;
1827
+ struct arc * * arcarray ;
1828
+ int arccount ;
1829
+ int prevnins ;
1830
+ int nskip ;
1859
1831
1860
1832
/*
1861
1833
* First, get rid of any states whose sole out-arc is an EMPTY, since
@@ -1896,41 +1868,131 @@ fixempties(struct nfa * nfa,
1896
1868
dropstate (nfa , s );
1897
1869
}
1898
1870
1871
+ if (NISERR ())
1872
+ return ;
1873
+
1899
1874
/*
1900
- * For each remaining NFA state, find all other states that are reachable
1901
- * from it by a chain of one or more EMPTY arcs. Then generate new arcs
1875
+ * For each remaining NFA state, find all other states from which it is
1876
+ * reachable by a chain of one or more EMPTY arcs. Then generate new arcs
1902
1877
* that eliminate the need for each such chain.
1903
1878
*
1904
- * If we just do this straightforwardly, the algorithm gets slow in
1905
- * complex graphs, because the same arcs get copied to all intermediate
1906
- * states of an EMPTY chain, and then uselessly pushed repeatedly to the
1907
- * chain's final state; we waste a lot of time in newarc's duplicate
1908
- * checking. To improve matters, we decree that any state with only EMPTY
1909
- * out-arcs is "doomed" and will not be part of the final NFA. That can be
1910
- * ensured by not adding any new out-arcs to such a state. Having ensured
1911
- * that, we need not update the state's in-arcs list either; all arcs that
1912
- * might have gotten pushed forward to it will just get pushed directly to
1913
- * successor states. This eliminates most of the useless duplicate arcs.
1879
+ * We could replace a chain of EMPTY arcs that leads from a "from" state
1880
+ * to a "to" state either by pushing non-EMPTY arcs forward (linking
1881
+ * directly from "from"'s predecessors to "to") or by pulling them back
1882
+ * (linking directly from "from" to "to"'s successors). We choose to
1883
+ * always do the former; this choice is somewhat arbitrary, but the
1884
+ * approach below requires that we uniformly do one or the other.
1885
+ *
1886
+ * Suppose we have a chain of N successive EMPTY arcs (where N can easily
1887
+ * approach the size of the NFA). All of the intermediate states must
1888
+ * have additional inarcs and outarcs, else they'd have been removed by
1889
+ * the steps above. Assuming their inarcs are mostly not empties, we will
1890
+ * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
1891
+ * state in the chain must be duplicated to lead to all its successor
1892
+ * states as well. So there is no hope of doing less than O(N^2) work;
1893
+ * however, we should endeavor to keep the big-O cost from being even
1894
+ * worse than that, which it can easily become without care. In
1895
+ * particular, suppose we were to copy all S1's inarcs forward to S2, and
1896
+ * then also to S3, and then later we consider pushing S2's inarcs forward
1897
+ * to S3. If we include the arcs already copied from S1 in that, we'd be
1898
+ * doing O(N^3) work. (The duplicate-arc elimination built into newarc()
1899
+ * and its cohorts would get rid of the extra arcs, but not without cost.)
1900
+ *
1901
+ * We can avoid this cost by treating only arcs that existed at the start
1902
+ * of this phase as candidates to be pushed forward. To identify those,
1903
+ * we remember the first inarc each state had to start with. We rely on
1904
+ * the fact that newarc() and friends put new arcs on the front of their
1905
+ * to-states' inchains, and that this phase never deletes arcs, so that
1906
+ * the original arcs must be the last arcs in their to-states' inchains.
1907
+ *
1908
+ * So the process here is that, for each state in the NFA, we gather up
1909
+ * all non-EMPTY inarcs of states that can reach the target state via
1910
+ * EMPTY arcs. We then sort, de-duplicate, and merge these arcs into the
1911
+ * target state's inchain. (We can safely use sort-merge for this as long
1912
+ * as we update each state's original-arcs pointer after we add arcs to
1913
+ * it; the sort step of mergeins probably changed the order of the old
1914
+ * arcs.)
1915
+ *
1916
+ * Another refinement worth making is that, because we only add non-EMPTY
1917
+ * arcs during this phase, and all added arcs have the same from-state as
1918
+ * the non-EMPTY arc they were cloned from, we know ahead of time that any
1919
+ * states having only EMPTY outarcs will be useless for lack of outarcs
1920
+ * after we drop the EMPTY arcs. (They cannot gain non-EMPTY outarcs if
1921
+ * they had none to start with.) So we need not bother to update the
1922
+ * inchains of such states at all.
1923
+ */
1924
+
1925
+ /* Remember the states' first original inarcs */
1926
+ /* ... and while at it, count how many old inarcs there are altogether */
1927
+ inarcsorig = (struct arc * * ) MALLOC (nfa -> nstates * sizeof (struct arc * ));
1928
+ if (inarcsorig == NULL )
1929
+ {
1930
+ NERR (REG_ESPACE );
1931
+ return ;
1932
+ }
1933
+ totalinarcs = 0 ;
1934
+ for (s = nfa -> states ; s != NULL ; s = s -> next )
1935
+ {
1936
+ inarcsorig [s -> no ] = s -> ins ;
1937
+ totalinarcs += s -> nins ;
1938
+ }
1939
+
1940
+ /*
1941
+ * Create a workspace for accumulating the inarcs to be added to the
1942
+ * current target state. totalinarcs is probably a considerable
1943
+ * overestimate of the space needed, but the NFA is unlikely to be large
1944
+ * enough at this point to make it worth being smarter.
1914
1945
*/
1946
+ arcarray = (struct arc * * ) MALLOC (totalinarcs * sizeof (struct arc * ));
1947
+ if (arcarray == NULL )
1948
+ {
1949
+ NERR (REG_ESPACE );
1950
+ FREE (inarcsorig );
1951
+ return ;
1952
+ }
1953
+
1954
+ /* And iterate over the target states */
1915
1955
for (s = nfa -> states ; s != NULL && !NISERR (); s = s -> next )
1916
1956
{
1917
- for (s2 = emptyreachable (nfa , s , s ); s2 != s && !NISERR (); s2 = nexts )
1957
+ /* Ignore target states without non-EMPTY outarcs, per note above */
1958
+ if (!s -> flag && !hasnonemptyout (s ))
1959
+ continue ;
1960
+
1961
+ /* Find predecessor states and accumulate their original inarcs */
1962
+ arccount = 0 ;
1963
+ for (s2 = emptyreachable (nfa , s , s , inarcsorig ); s2 != s ; s2 = nexts )
1918
1964
{
1919
- /*
1920
- * If s2 is doomed, we decide that (1) we will always push arcs
1921
- * forward to it, not pull them back to s; and (2) we can optimize
1922
- * away the push-forward, per comment above. So do nothing.
1923
- */
1924
- if (s2 -> flag || hasnonemptyout (s2 ))
1925
- replaceempty (nfa , s , s2 );
1965
+ /* Add s2's original inarcs to arcarray[], but ignore empties */
1966
+ for (a = inarcsorig [s2 -> no ]; a != NULL ; a = a -> inchain )
1967
+ {
1968
+ if (a -> type != EMPTY )
1969
+ arcarray [arccount ++ ] = a ;
1970
+ }
1926
1971
1927
1972
/* Reset the tmp fields as we walk back */
1928
1973
nexts = s2 -> tmp ;
1929
1974
s2 -> tmp = NULL ;
1930
1975
}
1931
1976
s -> tmp = NULL ;
1977
+ assert (arccount <= totalinarcs );
1978
+
1979
+ /* Remember how many original inarcs this state has */
1980
+ prevnins = s -> nins ;
1981
+
1982
+ /* Add non-duplicate inarcs to target state */
1983
+ mergeins (nfa , s , arcarray , arccount );
1984
+
1985
+ /* Now we must update the state's inarcsorig pointer */
1986
+ nskip = s -> nins - prevnins ;
1987
+ a = s -> ins ;
1988
+ while (nskip -- > 0 )
1989
+ a = a -> inchain ;
1990
+ inarcsorig [s -> no ] = a ;
1932
1991
}
1933
1992
1993
+ FREE (arcarray );
1994
+ FREE (inarcsorig );
1995
+
1934
1996
if (NISERR ())
1935
1997
return ;
1936
1998
@@ -1964,20 +2026,25 @@ fixempties(struct nfa * nfa,
1964
2026
}
1965
2027
1966
2028
/*
1967
- * emptyreachable - recursively find all states reachable from s by EMPTY arcs
2029
+ * emptyreachable - recursively find all states that can reach s by EMPTY arcs
1968
2030
*
1969
2031
* The return value is the last such state found. Its tmp field links back
1970
2032
* to the next-to-last such state, and so on back to s, so that all these
1971
2033
* states can be located without searching the whole NFA.
1972
2034
*
2035
+ * Since this is only used in fixempties(), we pass in the inarcsorig[] array
2036
+ * maintained by that function. This lets us skip over all new inarcs, which
2037
+ * are certainly not EMPTY arcs.
2038
+ *
1973
2039
* The maximum recursion depth here is equal to the length of the longest
1974
2040
* loop-free chain of EMPTY arcs, which is surely no more than the size of
1975
2041
* the NFA ... but that could still be enough to cause trouble.
1976
2042
*/
1977
2043
static struct state *
1978
2044
emptyreachable (struct nfa * nfa ,
1979
2045
struct state * s ,
1980
- struct state * lastfound )
2046
+ struct state * lastfound ,
2047
+ struct arc * * inarcsorig )
1981
2048
{
1982
2049
struct arc * a ;
1983
2050
@@ -1990,78 +2057,14 @@ emptyreachable(struct nfa * nfa,
1990
2057
1991
2058
s -> tmp = lastfound ;
1992
2059
lastfound = s ;
1993
- for (a = s -> outs ; a != NULL ; a = a -> outchain )
2060
+ for (a = inarcsorig [ s -> no ] ; a != NULL ; a = a -> inchain )
1994
2061
{
1995
- if (a -> type == EMPTY && a -> to -> tmp == NULL )
1996
- lastfound = emptyreachable (nfa , a -> to , lastfound );
2062
+ if (a -> type == EMPTY && a -> from -> tmp == NULL )
2063
+ lastfound = emptyreachable (nfa , a -> from , lastfound , inarcsorig );
1997
2064
}
1998
2065
return lastfound ;
1999
2066
}
2000
2067
2001
- /*
2002
- * replaceempty - replace an EMPTY arc chain with some non-empty arcs
2003
- *
2004
- * The EMPTY arc(s) should be deleted later, but we can't do it here because
2005
- * they may still be needed to identify other arc chains during fixempties().
2006
- */
2007
- static void
2008
- replaceempty (struct nfa * nfa ,
2009
- struct state * from ,
2010
- struct state * to )
2011
- {
2012
- int fromouts ;
2013
- int toins ;
2014
-
2015
- assert (from != to );
2016
-
2017
- /*
2018
- * Create replacement arcs that bypass the need for the EMPTY chain. We
2019
- * can do this either by pushing arcs forward (linking directly from
2020
- * "from"'s predecessors to "to") or by pulling them back (linking
2021
- * directly from "from" to "to"'s successors). In general, we choose
2022
- * whichever way creates greater fan-out or fan-in, so as to improve the
2023
- * odds of reducing the other state to zero in-arcs or out-arcs and
2024
- * thereby being able to delete it. However, if "from" is doomed (has no
2025
- * non-EMPTY out-arcs), we must keep it so, so always push forward in that
2026
- * case.
2027
- *
2028
- * The fan-out/fan-in comparison should count only non-EMPTY arcs. If
2029
- * "from" is doomed, we can skip counting "to"'s arcs, since we want to
2030
- * force taking the copyins path in that case.
2031
- */
2032
- fromouts = nonemptyouts (from );
2033
- toins = (fromouts == 0 ) ? 1 : nonemptyins (to );
2034
-
2035
- if (fromouts > toins )
2036
- {
2037
- copyouts (nfa , to , from , 0 );
2038
- return ;
2039
- }
2040
- if (fromouts < toins )
2041
- {
2042
- copyins (nfa , from , to , 0 );
2043
- return ;
2044
- }
2045
-
2046
- /*
2047
- * fromouts == toins. Decide on secondary issue: copy fewest arcs.
2048
- *
2049
- * Doesn't seem to be worth the trouble to exclude empties from these
2050
- * comparisons; that takes extra time and doesn't seem to improve the
2051
- * resulting graph much.
2052
- */
2053
- if (from -> nins > to -> nouts )
2054
- {
2055
- copyouts (nfa , to , from , 0 );
2056
- return ;
2057
- }
2058
- else
2059
- {
2060
- copyins (nfa , from , to , 0 );
2061
- return ;
2062
- }
2063
- }
2064
-
2065
2068
/*
2066
2069
* isconstraintarc - detect whether an arc is of a constraint type
2067
2070
*/
0 commit comments