Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit d54ceb9

Browse files
committed
Adjust string comparison in jsonpath
Until now, jsonpath string comparison was implemented using the default database locale. However, the standard requires us to compare Unicode codepoints. This commit implements that, but for performance reasons we still use per-byte comparison for the "==" operator. Thus, for consistency, the other comparison operators fall back to per-byte comparison when the Unicode codepoints turn out to be equal. In some edge cases, when the same Unicode codepoints have different binary representations in the database encoding, we diverge from the standard to achieve better performance of the "==" operator. In the future, to implement strict standard conformance, we could normalize input JSON strings. The original patch was written by Nikita Glukhov and rewritten by me. Reported-by: Markus Winand Discussion: https://postgr.es/m/8B7FA3B4-328D-43D7-95A8-37B8891B8C78%40winand.at Author: Nikita Glukhov, Alexander Korotkov Backpatch-through: 12
1 parent cabe0f2 commit d54ceb9

File tree

3 files changed

+248
-3
lines changed

3 files changed

+248
-3
lines changed

src/backend/utils/adt/jsonpath_exec.c

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1980,6 +1980,73 @@ executeComparison(JsonPathItem *cmp, JsonbValue *lv, JsonbValue *rv, void *p)
19801980
return compareItems(cmp->type, lv, rv);
19811981
}
19821982

1983+
/*
 * Byte-wise ordering of two length-counted strings.
 *
 * The common prefix is compared with memcmp(); when the prefixes are
 * identical, the shorter string sorts first.  Returns a negative value, zero,
 * or a positive value when s1 sorts before, equal to, or after s2.
 */
static int
binaryCompareStrings(const char *s1, int len1,
					 const char *s2, int len2)
{
	int			prefixlen = (len1 < len2) ? len1 : len2;
	int			diff = memcmp(s1, s2, prefixlen);

	/* Identical prefixes: the length decides the order. */
	if (diff == 0 && len1 != len2)
		diff = (len1 < len2) ? -1 : 1;

	return diff;
}
2002+
2003+
/*
2004+
* Compare two strings in the current server encoding using Unicode codepoint
2005+
* collation.
2006+
*/
2007+
static int
2008+
compareStrings(const char *mbstr1, int mblen1,
2009+
const char *mbstr2, int mblen2)
2010+
{
2011+
if (GetDatabaseEncoding() == PG_SQL_ASCII ||
2012+
GetDatabaseEncoding() == PG_UTF8)
2013+
{
2014+
/*
2015+
* It's known property of UTF-8 strings that their per-byte comparison
2016+
* result matches codepoints comparison result. ASCII can be
2017+
* considered as special case of UTF-8.
2018+
*/
2019+
return binaryCompareStrings(mbstr1, mblen1, mbstr2, mblen2);
2020+
}
2021+
else
2022+
{
2023+
/* We have to convert other encodings to UTF-8 first, then compare. */
2024+
char *utf8str1 = pg_server_to_any(mbstr1, mblen1, PG_UTF8),
2025+
*utf8str2 = pg_server_to_any(mbstr2, mblen2, PG_UTF8);
2026+
int cmp;
2027+
2028+
cmp = binaryCompareStrings(utf8str1, strlen(utf8str1),
2029+
utf8str2, strlen(utf8str2));
2030+
2031+
pfree(utf8str1);
2032+
pfree(utf8str2);
2033+
2034+
/*
2035+
* When all Unicode codepoints are equal, return result of binary
2036+
* comparison. In some edge cases, same characters may have different
2037+
* representations in encoding. Then our behavior could diverge from
2038+
* standard. However, that allow us to do simple binary comparison
2039+
* for "==" operator, which is performance critical in typical cases.
2040+
* In future to implement strict standard conformance, we can do
2041+
* normalization of input JSON strings.
2042+
*/
2043+
if (cmp == 0)
2044+
return binaryCompareStrings(mbstr1, mblen1, mbstr2, mblen2);
2045+
else
2046+
return cmp;
2047+
}
2048+
}
2049+
19832050
/*
19842051
* Compare two SQL/JSON items using comparison operation 'op'.
19852052
*/
@@ -2022,9 +2089,8 @@ compareItems(int32 op, JsonbValue *jb1, JsonbValue *jb2)
20222089
jb2->val.string.val,
20232090
jb1->val.string.len) ? jpbFalse : jpbTrue;
20242091

2025-
cmp = varstr_cmp(jb1->val.string.val, jb1->val.string.len,
2026-
jb2->val.string.val, jb2->val.string.len,
2027-
DEFAULT_COLLATION_OID);
2092+
cmp = compareStrings(jb1->val.string.val, jb1->val.string.len,
2093+
jb2->val.string.val, jb2->val.string.len);
20282094
break;
20292095

20302096
case jbvBinary:

src/test/regress/expected/jsonb_jsonpath.out

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1833,3 +1833,166 @@ SELECT jsonb_path_match('[{"a": 1}, {"a": 2}]', '$[*].a > 1');
18331833
t
18341834
(1 row)
18351835

1836+
-- test string comparison (Unicode codepoint collation)
1837+
WITH str(j, num) AS
1838+
(
1839+
SELECT jsonb_build_object('s', s), num
1840+
FROM unnest('{"", "a", "ab", "abc", "abcd", "b", "A", "AB", "ABC", "ABc", "ABcD", "B"}'::text[]) WITH ORDINALITY AS a(s, num)
1841+
)
1842+
SELECT
1843+
s1.j, s2.j,
1844+
jsonb_path_query_first(s1.j, '$.s < $s', vars => s2.j) lt,
1845+
jsonb_path_query_first(s1.j, '$.s <= $s', vars => s2.j) le,
1846+
jsonb_path_query_first(s1.j, '$.s == $s', vars => s2.j) eq,
1847+
jsonb_path_query_first(s1.j, '$.s >= $s', vars => s2.j) ge,
1848+
jsonb_path_query_first(s1.j, '$.s > $s', vars => s2.j) gt
1849+
FROM str s1, str s2
1850+
ORDER BY s1.num, s2.num;
1851+
j | j | lt | le | eq | ge | gt
1852+
---------------+---------------+-------+-------+-------+-------+-------
1853+
{"s": ""} | {"s": ""} | false | true | true | true | false
1854+
{"s": ""} | {"s": "a"} | true | true | false | false | false
1855+
{"s": ""} | {"s": "ab"} | true | true | false | false | false
1856+
{"s": ""} | {"s": "abc"} | true | true | false | false | false
1857+
{"s": ""} | {"s": "abcd"} | true | true | false | false | false
1858+
{"s": ""} | {"s": "b"} | true | true | false | false | false
1859+
{"s": ""} | {"s": "A"} | true | true | false | false | false
1860+
{"s": ""} | {"s": "AB"} | true | true | false | false | false
1861+
{"s": ""} | {"s": "ABC"} | true | true | false | false | false
1862+
{"s": ""} | {"s": "ABc"} | true | true | false | false | false
1863+
{"s": ""} | {"s": "ABcD"} | true | true | false | false | false
1864+
{"s": ""} | {"s": "B"} | true | true | false | false | false
1865+
{"s": "a"} | {"s": ""} | false | false | false | true | true
1866+
{"s": "a"} | {"s": "a"} | false | true | true | true | false
1867+
{"s": "a"} | {"s": "ab"} | true | true | false | false | false
1868+
{"s": "a"} | {"s": "abc"} | true | true | false | false | false
1869+
{"s": "a"} | {"s": "abcd"} | true | true | false | false | false
1870+
{"s": "a"} | {"s": "b"} | true | true | false | false | false
1871+
{"s": "a"} | {"s": "A"} | false | false | false | true | true
1872+
{"s": "a"} | {"s": "AB"} | false | false | false | true | true
1873+
{"s": "a"} | {"s": "ABC"} | false | false | false | true | true
1874+
{"s": "a"} | {"s": "ABc"} | false | false | false | true | true
1875+
{"s": "a"} | {"s": "ABcD"} | false | false | false | true | true
1876+
{"s": "a"} | {"s": "B"} | false | false | false | true | true
1877+
{"s": "ab"} | {"s": ""} | false | false | false | true | true
1878+
{"s": "ab"} | {"s": "a"} | false | false | false | true | true
1879+
{"s": "ab"} | {"s": "ab"} | false | true | true | true | false
1880+
{"s": "ab"} | {"s": "abc"} | true | true | false | false | false
1881+
{"s": "ab"} | {"s": "abcd"} | true | true | false | false | false
1882+
{"s": "ab"} | {"s": "b"} | true | true | false | false | false
1883+
{"s": "ab"} | {"s": "A"} | false | false | false | true | true
1884+
{"s": "ab"} | {"s": "AB"} | false | false | false | true | true
1885+
{"s": "ab"} | {"s": "ABC"} | false | false | false | true | true
1886+
{"s": "ab"} | {"s": "ABc"} | false | false | false | true | true
1887+
{"s": "ab"} | {"s": "ABcD"} | false | false | false | true | true
1888+
{"s": "ab"} | {"s": "B"} | false | false | false | true | true
1889+
{"s": "abc"} | {"s": ""} | false | false | false | true | true
1890+
{"s": "abc"} | {"s": "a"} | false | false | false | true | true
1891+
{"s": "abc"} | {"s": "ab"} | false | false | false | true | true
1892+
{"s": "abc"} | {"s": "abc"} | false | true | true | true | false
1893+
{"s": "abc"} | {"s": "abcd"} | true | true | false | false | false
1894+
{"s": "abc"} | {"s": "b"} | true | true | false | false | false
1895+
{"s": "abc"} | {"s": "A"} | false | false | false | true | true
1896+
{"s": "abc"} | {"s": "AB"} | false | false | false | true | true
1897+
{"s": "abc"} | {"s": "ABC"} | false | false | false | true | true
1898+
{"s": "abc"} | {"s": "ABc"} | false | false | false | true | true
1899+
{"s": "abc"} | {"s": "ABcD"} | false | false | false | true | true
1900+
{"s": "abc"} | {"s": "B"} | false | false | false | true | true
1901+
{"s": "abcd"} | {"s": ""} | false | false | false | true | true
1902+
{"s": "abcd"} | {"s": "a"} | false | false | false | true | true
1903+
{"s": "abcd"} | {"s": "ab"} | false | false | false | true | true
1904+
{"s": "abcd"} | {"s": "abc"} | false | false | false | true | true
1905+
{"s": "abcd"} | {"s": "abcd"} | false | true | true | true | false
1906+
{"s": "abcd"} | {"s": "b"} | true | true | false | false | false
1907+
{"s": "abcd"} | {"s": "A"} | false | false | false | true | true
1908+
{"s": "abcd"} | {"s": "AB"} | false | false | false | true | true
1909+
{"s": "abcd"} | {"s": "ABC"} | false | false | false | true | true
1910+
{"s": "abcd"} | {"s": "ABc"} | false | false | false | true | true
1911+
{"s": "abcd"} | {"s": "ABcD"} | false | false | false | true | true
1912+
{"s": "abcd"} | {"s": "B"} | false | false | false | true | true
1913+
{"s": "b"} | {"s": ""} | false | false | false | true | true
1914+
{"s": "b"} | {"s": "a"} | false | false | false | true | true
1915+
{"s": "b"} | {"s": "ab"} | false | false | false | true | true
1916+
{"s": "b"} | {"s": "abc"} | false | false | false | true | true
1917+
{"s": "b"} | {"s": "abcd"} | false | false | false | true | true
1918+
{"s": "b"} | {"s": "b"} | false | true | true | true | false
1919+
{"s": "b"} | {"s": "A"} | false | false | false | true | true
1920+
{"s": "b"} | {"s": "AB"} | false | false | false | true | true
1921+
{"s": "b"} | {"s": "ABC"} | false | false | false | true | true
1922+
{"s": "b"} | {"s": "ABc"} | false | false | false | true | true
1923+
{"s": "b"} | {"s": "ABcD"} | false | false | false | true | true
1924+
{"s": "b"} | {"s": "B"} | false | false | false | true | true
1925+
{"s": "A"} | {"s": ""} | false | false | false | true | true
1926+
{"s": "A"} | {"s": "a"} | true | true | false | false | false
1927+
{"s": "A"} | {"s": "ab"} | true | true | false | false | false
1928+
{"s": "A"} | {"s": "abc"} | true | true | false | false | false
1929+
{"s": "A"} | {"s": "abcd"} | true | true | false | false | false
1930+
{"s": "A"} | {"s": "b"} | true | true | false | false | false
1931+
{"s": "A"} | {"s": "A"} | false | true | true | true | false
1932+
{"s": "A"} | {"s": "AB"} | true | true | false | false | false
1933+
{"s": "A"} | {"s": "ABC"} | true | true | false | false | false
1934+
{"s": "A"} | {"s": "ABc"} | true | true | false | false | false
1935+
{"s": "A"} | {"s": "ABcD"} | true | true | false | false | false
1936+
{"s": "A"} | {"s": "B"} | true | true | false | false | false
1937+
{"s": "AB"} | {"s": ""} | false | false | false | true | true
1938+
{"s": "AB"} | {"s": "a"} | true | true | false | false | false
1939+
{"s": "AB"} | {"s": "ab"} | true | true | false | false | false
1940+
{"s": "AB"} | {"s": "abc"} | true | true | false | false | false
1941+
{"s": "AB"} | {"s": "abcd"} | true | true | false | false | false
1942+
{"s": "AB"} | {"s": "b"} | true | true | false | false | false
1943+
{"s": "AB"} | {"s": "A"} | false | false | false | true | true
1944+
{"s": "AB"} | {"s": "AB"} | false | true | true | true | false
1945+
{"s": "AB"} | {"s": "ABC"} | true | true | false | false | false
1946+
{"s": "AB"} | {"s": "ABc"} | true | true | false | false | false
1947+
{"s": "AB"} | {"s": "ABcD"} | true | true | false | false | false
1948+
{"s": "AB"} | {"s": "B"} | true | true | false | false | false
1949+
{"s": "ABC"} | {"s": ""} | false | false | false | true | true
1950+
{"s": "ABC"} | {"s": "a"} | true | true | false | false | false
1951+
{"s": "ABC"} | {"s": "ab"} | true | true | false | false | false
1952+
{"s": "ABC"} | {"s": "abc"} | true | true | false | false | false
1953+
{"s": "ABC"} | {"s": "abcd"} | true | true | false | false | false
1954+
{"s": "ABC"} | {"s": "b"} | true | true | false | false | false
1955+
{"s": "ABC"} | {"s": "A"} | false | false | false | true | true
1956+
{"s": "ABC"} | {"s": "AB"} | false | false | false | true | true
1957+
{"s": "ABC"} | {"s": "ABC"} | false | true | true | true | false
1958+
{"s": "ABC"} | {"s": "ABc"} | true | true | false | false | false
1959+
{"s": "ABC"} | {"s": "ABcD"} | true | true | false | false | false
1960+
{"s": "ABC"} | {"s": "B"} | true | true | false | false | false
1961+
{"s": "ABc"} | {"s": ""} | false | false | false | true | true
1962+
{"s": "ABc"} | {"s": "a"} | true | true | false | false | false
1963+
{"s": "ABc"} | {"s": "ab"} | true | true | false | false | false
1964+
{"s": "ABc"} | {"s": "abc"} | true | true | false | false | false
1965+
{"s": "ABc"} | {"s": "abcd"} | true | true | false | false | false
1966+
{"s": "ABc"} | {"s": "b"} | true | true | false | false | false
1967+
{"s": "ABc"} | {"s": "A"} | false | false | false | true | true
1968+
{"s": "ABc"} | {"s": "AB"} | false | false | false | true | true
1969+
{"s": "ABc"} | {"s": "ABC"} | false | false | false | true | true
1970+
{"s": "ABc"} | {"s": "ABc"} | false | true | true | true | false
1971+
{"s": "ABc"} | {"s": "ABcD"} | true | true | false | false | false
1972+
{"s": "ABc"} | {"s": "B"} | true | true | false | false | false
1973+
{"s": "ABcD"} | {"s": ""} | false | false | false | true | true
1974+
{"s": "ABcD"} | {"s": "a"} | true | true | false | false | false
1975+
{"s": "ABcD"} | {"s": "ab"} | true | true | false | false | false
1976+
{"s": "ABcD"} | {"s": "abc"} | true | true | false | false | false
1977+
{"s": "ABcD"} | {"s": "abcd"} | true | true | false | false | false
1978+
{"s": "ABcD"} | {"s": "b"} | true | true | false | false | false
1979+
{"s": "ABcD"} | {"s": "A"} | false | false | false | true | true
1980+
{"s": "ABcD"} | {"s": "AB"} | false | false | false | true | true
1981+
{"s": "ABcD"} | {"s": "ABC"} | false | false | false | true | true
1982+
{"s": "ABcD"} | {"s": "ABc"} | false | false | false | true | true
1983+
{"s": "ABcD"} | {"s": "ABcD"} | false | true | true | true | false
1984+
{"s": "ABcD"} | {"s": "B"} | true | true | false | false | false
1985+
{"s": "B"} | {"s": ""} | false | false | false | true | true
1986+
{"s": "B"} | {"s": "a"} | true | true | false | false | false
1987+
{"s": "B"} | {"s": "ab"} | true | true | false | false | false
1988+
{"s": "B"} | {"s": "abc"} | true | true | false | false | false
1989+
{"s": "B"} | {"s": "abcd"} | true | true | false | false | false
1990+
{"s": "B"} | {"s": "b"} | true | true | false | false | false
1991+
{"s": "B"} | {"s": "A"} | false | false | false | true | true
1992+
{"s": "B"} | {"s": "AB"} | false | false | false | true | true
1993+
{"s": "B"} | {"s": "ABC"} | false | false | false | true | true
1994+
{"s": "B"} | {"s": "ABc"} | false | false | false | true | true
1995+
{"s": "B"} | {"s": "ABcD"} | false | false | false | true | true
1996+
{"s": "B"} | {"s": "B"} | false | true | true | true | false
1997+
(144 rows)
1998+

src/test/regress/sql/jsonb_jsonpath.sql

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,3 +387,19 @@ SELECT jsonb_path_match('[true, true]', '$[*]', silent => false);
387387
SELECT jsonb '[{"a": 1}, {"a": 2}]' @@ '$[*].a > 1';
388388
SELECT jsonb '[{"a": 1}, {"a": 2}]' @@ '$[*].a > 2';
389389
SELECT jsonb_path_match('[{"a": 1}, {"a": 2}]', '$[*].a > 1');
390+
391+
-- test string comparison (Unicode codepoint collation)
392+
WITH str(j, num) AS
393+
(
394+
SELECT jsonb_build_object('s', s), num
395+
FROM unnest('{"", "a", "ab", "abc", "abcd", "b", "A", "AB", "ABC", "ABc", "ABcD", "B"}'::text[]) WITH ORDINALITY AS a(s, num)
396+
)
397+
SELECT
398+
s1.j, s2.j,
399+
jsonb_path_query_first(s1.j, '$.s < $s', vars => s2.j) lt,
400+
jsonb_path_query_first(s1.j, '$.s <= $s', vars => s2.j) le,
401+
jsonb_path_query_first(s1.j, '$.s == $s', vars => s2.j) eq,
402+
jsonb_path_query_first(s1.j, '$.s >= $s', vars => s2.j) ge,
403+
jsonb_path_query_first(s1.j, '$.s > $s', vars => s2.j) gt
404+
FROM str s1, str s2
405+
ORDER BY s1.num, s2.num;

0 commit comments

Comments
 (0)