@@ -1750,11 +1750,226 @@ pg_utf8_verifychar(const unsigned char *s, int len)
1750
1750
return l ;
1751
1751
}
1752
1752
1753
+ /*
1754
+ * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1755
+ * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1756
+ * input byte and current state are used to compute an index into an array of
1757
+ * state transitions. Since the address of the next transition is dependent
1758
+ * on this computation, there is latency in executing the load instruction,
1759
+ * and the CPU is not kept busy.
1760
+ *
1761
+ * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1762
+ *
1763
+ * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1764
+ *
1765
+ * In a shift-based DFA, the input byte is an index into an array of integers
1766
+ * whose bit pattern encodes the state transitions. To compute the next
1767
+ * state, we simply right-shift the integer by the current state and apply a
1768
+ * mask. In this scheme, the address of the transition only depends on the
1769
+ * input byte, so there is better pipelining.
1770
+ *
1771
+ * The naming convention for states and transitions was adopted from a UTF-8
1772
+ * to UTF-16/32 transcoder, whose table is reproduced below:
1773
+ *
1774
+ * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1775
+ *
1776
+ * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1777
+ * ==========================================================================
1778
+ * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1779
+ * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1780
+ * |
1781
+ * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1782
+ * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1783
+ * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1784
+ * |
1785
+ * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1786
+ * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1787
+ * |
1788
+ * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1789
+ * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1790
+ *
1791
+ * In the most straightforward implementation, a shift-based DFA for UTF-8
1792
+ * requires 64-bit integers to encode the transitions, but with an SMT solver
1793
+ * it's possible to find state numbers such that the transitions fit within
1794
+ * 32-bit integers, as Dougall Johnson demonstrated:
1795
+ *
1796
+ * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1797
+ *
1798
+ * This packed representation is the reason for the seemingly odd choice of
1799
+ * state values below.
1800
+ */
1801
+
1802
/*
 * State numbers for the UTF-8 DFA.
 *
 * A state's value doubles as the bit offset at which each transition word
 * stores the successor of that state, and the successor is read back through
 * a 5-bit mask.  Every value must therefore be below 32.
 */

/* Error */
#define ERR 0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2 1
#define CS3 5
/*
 * Leading byte was E0 (P3A) or ED (P3B): the next continuation byte is
 * limited to a sub-range, after which 1 more continuation byte is expected
 */
#define P3A 6
#define P3B 20
/*
 * Leading byte was F0 (P4A) or F4 (P4B): the next continuation byte is
 * limited to a sub-range, after which 2 more continuation bytes are expected
 */
#define P4A 25
#define P4B 30
/* Begin and End are the same state */
#define END BGN

/*
 * The encoded state transitions for the lookup table.
 *
 * Each term below has the form (next << current): it stores state "next" at
 * the bit offset belonging to state "current".  A current state with no term
 * reads back as ERR.
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/*
 * Continuation byte.  The three classes differ only in which of the
 * restricted states (P3A, P3B, P4A, P4B) accept them.  The expansions are
 * fully parenthesized so that they stay a single operand when combined with
 * other operators such as '&' or '>>'.
 */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1839
+
1840
+ static const uint32 Utf8Transition [256 ] =
1841
+ {
1842
+ /* ASCII */
1843
+
1844
+ ILL , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1845
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1846
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1847
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1848
+
1849
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1850
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1851
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1852
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1853
+
1854
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1855
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1856
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1857
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1858
+
1859
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1860
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1861
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1862
+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1863
+
1864
+ /* continuation bytes */
1865
+
1866
+ /* 80..8F */
1867
+ CR1 , CR1 , CR1 , CR1 , CR1 , CR1 , CR1 , CR1 ,
1868
+ CR1 , CR1 , CR1 , CR1 , CR1 , CR1 , CR1 , CR1 ,
1869
+
1870
+ /* 90..9F */
1871
+ CR2 , CR2 , CR2 , CR2 , CR2 , CR2 , CR2 , CR2 ,
1872
+ CR2 , CR2 , CR2 , CR2 , CR2 , CR2 , CR2 , CR2 ,
1873
+
1874
+ /* A0..BF */
1875
+ CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 ,
1876
+ CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 ,
1877
+ CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 ,
1878
+ CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 ,
1879
+
1880
+ /* leading bytes */
1881
+
1882
+ /* C0..DF */
1883
+ ILL , ILL , L2A , L2A , L2A , L2A , L2A , L2A ,
1884
+ L2A , L2A , L2A , L2A , L2A , L2A , L2A , L2A ,
1885
+ L2A , L2A , L2A , L2A , L2A , L2A , L2A , L2A ,
1886
+ L2A , L2A , L2A , L2A , L2A , L2A , L2A , L2A ,
1887
+
1888
+ /* E0..EF */
1889
+ L3A , L3B , L3B , L3B , L3B , L3B , L3B , L3B ,
1890
+ L3B , L3B , L3B , L3B , L3B , L3C , L3B , L3B ,
1891
+
1892
+ /* F0..FF */
1893
+ L4A , L4B , L4B , L4B , L4C , ILL , ILL , ILL ,
1894
+ ILL , ILL , ILL , ILL , ILL , ILL , ILL , ILL
1895
+ };
1896
+
1897
+ static void
1898
+ utf8_advance (const unsigned char * s , uint32 * state , int len )
1899
+ {
1900
+ /* Note: We deliberately don't check the state's value here. */
1901
+ while (len > 0 )
1902
+ {
1903
+ /*
1904
+ * It's important that the mask value is 31: In most instruction sets,
1905
+ * a shift by a 32-bit operand is understood to be a shift by its mod
1906
+ * 32, so the compiler should elide the mask operation.
1907
+ */
1908
+ * state = Utf8Transition [* s ++ ] >> (* state & 31 );
1909
+ len -- ;
1910
+ }
1911
+
1912
+ * state &= 31 ;
1913
+ }
1914
+
1753
1915
static int
1754
1916
pg_utf8_verifystr (const unsigned char * s , int len )
1755
1917
{
1756
1918
const unsigned char * start = s ;
1919
+ const int orig_len = len ;
1920
+ uint32 state = BGN ;
1921
+
1922
+ /*
1923
+ * Sixteen seems to give the best balance of performance across different
1924
+ * byte distributions.
1925
+ */
1926
+ #define STRIDE_LENGTH 16
1927
+
1928
+ if (len >= STRIDE_LENGTH )
1929
+ {
1930
+ while (len >= STRIDE_LENGTH )
1931
+ {
1932
+ /*
1933
+ * If the chunk is all ASCII, we can skip the full UTF-8 check,
1934
+ * but we must first check for a non-END state, which means the
1935
+ * previous chunk ended in the middle of a multibyte sequence.
1936
+ */
1937
+ if (state != END || !is_valid_ascii (s , STRIDE_LENGTH ))
1938
+ utf8_advance (s , & state , STRIDE_LENGTH );
1939
+
1940
+ s += STRIDE_LENGTH ;
1941
+ len -= STRIDE_LENGTH ;
1942
+ }
1943
+
1944
+ /*
1945
+ * The error state persists, so we only need to check for it here. In
1946
+ * case of error we start over from the beginning with the slow path
1947
+ * so we can count the valid bytes.
1948
+ */
1949
+ if (state == ERR )
1950
+ {
1951
+ len = orig_len ;
1952
+ s = start ;
1953
+ }
1954
+
1955
+ /*
1956
+ * We treat all other states as success, but it's possible the fast
1957
+ * path exited in the middle of a multibyte sequence, since that
1958
+ * wouldn't have caused an error. Before checking the remaining bytes,
1959
+ * walk backwards to find the last byte that could have been the start
1960
+ * of a valid sequence.
1961
+ */
1962
+ while (s > start )
1963
+ {
1964
+ s -- ;
1965
+ len ++ ;
1966
+
1967
+ if (!IS_HIGHBIT_SET (* s ) || pg_utf_mblen (s ) > 1 )
1968
+ break ;
1969
+ }
1970
+ }
1757
1971
1972
+ /* check remaining bytes */
1758
1973
while (len > 0 )
1759
1974
{
1760
1975
int l ;
0 commit comments