Support PG_UNICODE_FAST locale in the builtin collation provider.

jeff-davis · jeff-davis · commit d3d098316913 · 2025-01-17T15:56:30.000-08:00
The PG_UNICODE_FAST locale uses code point sort order (fast, memcmp-based) combined with Unicode character semantics.

The character semantics are based on Unicode full case mapping. Full case mapping can map a single codepoint to multiple codepoints, such as "ß" uppercasing to "SS". Additionally, it handles context-sensitive mappings like the "final sigma", and it uses titlecase mappings such as "ǅ" when titlecasing (rather than plain uppercase mappings).

Importantly, the uppercasing of "ß" as "SS" is specifically mentioned by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics for case mapping and pattern matching, so if we changed it to use the PG_UNICODE_FAST locale, it would offer better compliance with the standard. For now, though, do not change the behavior of UCS_BASIC.

Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
@@ -377,8 +377,9 @@ initdb --locale-provider=icu --icu-locale=en
      <listitem>
       <para>
        The <literal>builtin</literal> provider uses built-in operations. Only
-       the <literal>C</literal> and <literal>C.UTF-8</literal> locales are
-       supported for this provider.
+       the <literal>C</literal>, <literal>C.UTF-8</literal>, and
+       <literal>PG_UNICODE_FAST</literal> locales are supported for this
+       provider.
       </para>
       <para>
        The <literal>C</literal> locale behavior is identical to the
@@ -392,6 +393,13 @@ initdb --locale-provider=icu --icu-locale=en
        regular expression character classes are based on the "POSIX
        Compatible" semantics, and the case mapping is the "simple" variant.
       </para>
+      <para>
+       The <literal>PG_UNICODE_FAST</literal> locale is available only when
+       the database encoding is <literal>UTF-8</literal>, and the behavior is
+       based on Unicode. The collation uses the code point values only. The
+       regular expression character classes are based on the "Standard"
+       semantics, and the case mapping is the "full" variant.
+      </para>
      </listitem>
     </varlistentry>
 
@@ -886,6 +894,23 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR";
       </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><literal>pg_unicode_fast</literal></term>
+      <listitem>
+       <para>
+        This collation sorts by Unicode code point values rather than natural
+        language order.  For the functions <function>lower</function>,
+        <function>initcap</function>, and <function>upper</function> it uses
+        Unicode full case mapping. For pattern matching (including regular
+        expressions), it uses the Standard variant of Unicode <ulink
+        url="https://www.unicode.org/reports/tr18/#Compatibility_Properties">Compatibility
+        Properties</ulink>.  Behavior is efficient and stable within a
+        <productname>Postgres</productname> major version.  It is only
+        available for encoding <literal>UTF8</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
       <term><literal>pg_c_utf8</literal></term>
       <listitem>
diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml
@@ -99,7 +99,8 @@ CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> FROM <replace
       <para>
        If <replaceable>provider</replaceable> is <literal>builtin</literal>,
        then <replaceable>locale</replaceable> must be specified and set to
-       either <literal>C</literal> or <literal>C.UTF-8</literal>.
+       either <literal>C</literal>, <literal>C.UTF-8</literal>, or
+       <literal>PG_UNICODE_FAST</literal>.
       </para>
      </listitem>
     </varlistentry>
diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml
@@ -168,7 +168,8 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable>
         If <xref linkend="create-database-locale-provider"/> is
         <literal>builtin</literal>, then <replaceable>locale</replaceable> or
         <replaceable>builtin_locale</replaceable> must be specified and set to
-        either <literal>C</literal> or <literal>C.UTF-8</literal>.
+        either <literal>C</literal>, <literal>C.UTF-8</literal>, or
+        <literal>PG_UNICODE_FAST</literal>.
        </para>
        <tip>
         <para>
@@ -233,7 +234,8 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable>
        </para>
        <para>
         The locales available for the <literal>builtin</literal> provider are
-        <literal>C</literal> and <literal>C.UTF-8</literal>.
+        <literal>C</literal>, <literal>C.UTF-8</literal>, and
+        <literal>PG_UNICODE_FAST</literal>.
        </para>
       </listitem>
      </varlistentry>
diff --git a/doc/src/sgml/ref/initdb.sgml b/doc/src/sgml/ref/initdb.sgml
@@ -295,8 +295,8 @@ PostgreSQL documentation
        <para>
         If <option>--locale-provider</option> is <literal>builtin</literal>,
         <option>--locale</option> or <option>--builtin-locale</option> must be
-        specified and set to <literal>C</literal> or
-        <literal>C.UTF-8</literal>.
+        specified and set to <literal>C</literal>, <literal>C.UTF-8</literal>,
+        or <literal>PG_UNICODE_FAST</literal>.
        </para>
       </listitem>
      </varlistentry>
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
@@ -307,7 +307,7 @@ pg_wc_isdigit(pg_wchar c)
 			return (c <= (pg_wchar) 127 &&
 					(pg_char_properties[c] & PG_ISDIGIT));
 		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isdigit(c, true);
+			return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full);
 		case PG_REGEX_STRATEGY_LIBC_WIDE:
 			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
 				return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
@@ -361,7 +361,7 @@ pg_wc_isalnum(pg_wchar c)
 			return (c <= (pg_wchar) 127 &&
 					(pg_char_properties[c] & PG_ISALNUM));
 		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isalnum(c, true);
+			return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full);
 		case PG_REGEX_STRATEGY_LIBC_WIDE:
 			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
 				return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
@@ -505,7 +505,7 @@ pg_wc_ispunct(pg_wchar c)
 			return (c <= (pg_wchar) 127 &&
 					(pg_char_properties[c] & PG_ISPUNCT));
 		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_ispunct(c, true);
+			return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full);
 		case PG_REGEX_STRATEGY_LIBC_WIDE:
 			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
 				return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
@@ -1590,8 +1590,11 @@ builtin_locale_encoding(const char *locale)
 {
 	if (strcmp(locale, "C") == 0)
 		return -1;
-	if (strcmp(locale, "C.UTF-8") == 0)
+	else if (strcmp(locale, "C.UTF-8") == 0)
 		return PG_UTF8;
+	else if (strcmp(locale, "PG_UNICODE_FAST") == 0)
+		return PG_UTF8;
+
 
 	ereport(ERROR,
 			(errcode(ERRCODE_WRONG_OBJECT_TYPE),
@@ -1616,6 +1619,8 @@ builtin_validate_locale(int encoding, const char *locale)
 		canonical_name = "C";
 	else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0)
 		canonical_name = "C.UTF-8";
+	else if (strcmp(locale, "PG_UNICODE_FAST") == 0)
+		canonical_name = "PG_UNICODE_FAST";
 
 	if (!canonical_name)
 		ereport(ERROR,
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
@@ -78,7 +78,8 @@ size_t
 strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 				 pg_locale_t locale)
 {
-	return unicode_strlower(dest, destsize, src, srclen, false);
+	return unicode_strlower(dest, destsize, src, srclen,
+							locale->info.builtin.casemap_full);
 }
 
 size_t
@@ -93,15 +94,17 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 		.prev_alnum = false,
 	};
 
-	return unicode_strtitle(dest, destsize, src, srclen, false,
+	return unicode_strtitle(dest, destsize, src, srclen,
+							locale->info.builtin.casemap_full,
 							initcap_wbnext, &wbstate);
 }
 
 size_t
 strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 				 pg_locale_t locale)
 {
-	return unicode_strupper(dest, destsize, src, srclen, false);
+	return unicode_strupper(dest, destsize, src, srclen,
+							locale->info.builtin.casemap_full);
 }
 
 pg_locale_t
@@ -142,6 +145,7 @@ create_pg_locale_builtin(Oid collid, MemoryContext context)
 	result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
 
 	result->info.builtin.locale = MemoryContextStrdup(context, locstr);
+	result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0);
 	result->provider = COLLPROVIDER_BUILTIN;
 	result->deterministic = true;
 	result->collate_is_c = true;
@@ -164,6 +168,8 @@ get_collation_actual_version_builtin(const char *collcollate)
 		return "1";
 	else if (strcmp(collcollate, "C.UTF-8") == 0)
 		return "1";
+	else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0)
+		return "1";
 	else
 		ereport(ERROR,
 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
@@ -2489,6 +2489,8 @@ setlocales(void)
 		else if (strcmp(datlocale, "C.UTF-8") == 0 ||
 				 strcmp(datlocale, "C.UTF8") == 0)
 			canonname = "C.UTF-8";
+		else if (strcmp(datlocale, "PG_UNICODE_FAST") == 0)
+			canonname = "PG_UNICODE_FAST";
 		else
 			pg_fatal("invalid locale name \"%s\" for builtin provider",
 					 datlocale);
@@ -2782,7 +2784,9 @@ setup_locale_encoding(void)
 
 	if (locale_provider == COLLPROVIDER_BUILTIN)
 	{
-		if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8)
+		if ((strcmp(datlocale, "C.UTF-8") == 0 ||
+			 strcmp(datlocale, "PG_UNICODE_FAST") == 0) &&
+			encodingid != PG_UTF8)
 			pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
 					 datlocale, "UTF-8");
 	}
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	202501162
+#define CATALOG_VERSION_NO	202501171
 
 #endif
diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat
@@ -33,5 +33,8 @@
   descr => 'sorts by Unicode code point; Unicode and POSIX character semantics',
   collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6',
   colllocale => 'C.UTF-8', collversion => '1' },
+{ oid => '9535', descr => 'sorts by Unicode code point; Unicode character semantics',
+  collname => 'pg_unicode_fast', collprovider => 'b', collencoding => '6',
+  colllocale => 'PG_UNICODE_FAST', collversion => '1' },
 
 ]
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
@@ -108,6 +108,7 @@ struct pg_locale_struct
 		struct
 		{
 			const char *locale;
+			bool		casemap_full;
 		}			builtin;
 		locale_t	lt;
 #ifdef USE_ICU
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
@@ -160,3 +160,163 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
  t
 (1 row)
 
+--
+-- Test PG_UNICODE_FAST
+--
+CREATE COLLATION regress_pg_unicode_fast (
+  provider = builtin, locale = 'unicode'); -- fails
+ERROR:  invalid locale name "unicode" for builtin provider
+CREATE COLLATION regress_pg_unicode_fast (
+  provider = builtin, locale = 'PG_UNICODE_FAST');
+CREATE TABLE test_pg_unicode_fast (
+  t TEXT COLLATE PG_UNICODE_FAST
+);
+INSERT INTO test_pg_unicode_fast VALUES
+  ('abc DEF 123abc'),
+  ('ábc sßs ßss DÉF'),
+  ('ǄxxǄ ǆxxǅ ǅxxǆ'),
+  ('ȺȺȺ'),
+  ('ⱥⱥⱥ'),
+  ('ⱥȺ');
+SELECT
+    t, lower(t), initcap(t), upper(t),
+    length(convert_to(t, 'UTF8')) AS t_bytes,
+    length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
+    length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
+    length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
+  FROM test_pg_unicode_fast;
+        t        |      lower      |     initcap      |       upper       | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes 
+-----------------+-----------------+------------------+-------------------+---------+---------------+-----------------+---------------
+ abc DEF 123abc  | abc def 123abc  | Abc Def 123abc   | ABC DEF 123ABC    |      14 |            14 |              14 |            14
+ ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF |      19 |            19 |              19 |            19
+ ǄxxǄ ǆxxǅ ǅxxǆ  | ǆxxǆ ǆxxǆ ǆxxǆ  | ǅxxǆ ǅxxǆ ǅxxǆ   | ǄXXǄ ǄXXǄ ǄXXǄ    |      20 |            20 |              20 |            20
+ ȺȺȺ             | ⱥⱥⱥ             | Ⱥⱥⱥ              | ȺȺȺ               |       6 |             9 |               8 |             6
+ ⱥⱥⱥ             | ⱥⱥⱥ             | Ⱥⱥⱥ              | ȺȺȺ               |       9 |             9 |               8 |             6
+ ⱥȺ              | ⱥⱥ              | Ⱥⱥ               | ȺȺ                |       5 |             6 |               5 |             4
+(6 rows)
+
+DROP TABLE test_pg_unicode_fast;
+-- test Final_Sigma
+SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3
+ lower 
+-------
+ ας
+(1 row)
+
+SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030
+ lower 
+-------
+ ας0
+(1 row)
+
+SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343
+ lower 
+-------
+ ἀς̓
+(1 row)
+
+SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345
+ lower 
+-------
+ ᾳςͅ
+(1 row)
+
+-- test !Final_Sigma
+SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3
+ lower 
+-------
+ σ
+(1 row)
+
+SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3
+ lower 
+-------
+ 0σ
+(1 row)
+
+SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391
+ lower 
+-------
+ ασα
+(1 row)
+
+SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391
+ lower 
+-------
+ ἀσ̓α
+(1 row)
+
+SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
+ lower 
+-------
+ ᾳσͅα
+(1 row)
+
+-- properties
+SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- case mapping
+SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed
+ ?column? 
+----------
+ t
+(1 row)
+
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql

Original file line number	Diff line number	Diff line change
`@@ -33,5 +33,8 @@`
`33`	`33`	`descr => 'sorts by Unicode code point; Unicode and POSIX character semantics',`
`34`	`34`	`collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6',`
`35`	`35`	`colllocale => 'C.UTF-8', collversion => '1' },`
	`36`	`+{ oid => '9535', descr => 'sorts by Unicode code point; Unicode character semantics',`
	`37`	`+ collname => 'pg_unicode_fast', collprovider => 'b', collencoding => '6',`
	`38`	`+ colllocale => 'PG_UNICODE_FAST', collversion => '1' },`
`36`	`39`
`37`	`40`	`]`
Original file line number	Diff line number	Diff line change
`@@ -108,6 +108,7 @@ struct pg_locale_struct`
`108`	`108`	`struct`
`109`	`109`	`{`
`110`	`110`	`const char *locale;`
	`111`	`+ bool casemap_full;`
`111`	`112`	`} builtin;`
`112`	`113`	`locale_t lt;`
`113`	`114`	`#ifdef USE_ICU`