Don't install ICU collation keyword variants

petere · petere · commit 2bfd1b1ee562 · 2017-08-21T19:21:07.000-04:00
Users can still create them themselves.  Instead, document the Unicode TR 35
collation options for ICU, so that users know how to define such collations.

Reviewed-by: Peter Geoghegan <pg@bowt.ie>
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
@@ -664,13 +664,6 @@ SELECT a COLLATE "C" &lt; b COLLATE "POSIX" FROM test1;
       </listitem>
      </varlistentry>
 
-     <varlistentry>
-      <term><literal>de-u-co-phonebk-x-icu</literal></term>
-      <listitem>
-       <para>German collation, phone book variant</para>
-      </listitem>
-     </varlistentry>
-
      <varlistentry>
       <term><literal>de-AT-x-icu</literal></term>
       <listitem>
@@ -683,13 +676,6 @@ SELECT a COLLATE "C" &lt; b COLLATE "POSIX" FROM test1;
       </listitem>
      </varlistentry>
 
-     <varlistentry>
-      <term><literal>de-AT-u-co-phonebk-x-icu</literal></term>
-      <listitem>
-       <para>German collation for Austria, phone book variant</para>
-      </listitem>
-     </varlistentry>
-
      <varlistentry>
       <term><literal>und-x-icu</literal> (for <quote>undefined</quote>)</term>
       <listitem>
@@ -709,6 +695,90 @@ SELECT a COLLATE "C" &lt; b COLLATE "POSIX" FROM test1;
     will draw an error along the lines of <quote>collation "de-x-icu" for
     encoding "WIN874" does not exist</>.
    </para>
+
+   <para>
+    ICU allows collations to be customized beyond the basic language+country
+    set that is preloaded by <command>initdb</command>.  Users are encouraged
+    to define their own collation objects that make use of these facilities to
+    suit the sorting behavior to their requirements.  Here are some examples:
+
+    <variablelist>
+     <varlistentry>
+      <term><literal>CREATE COLLATION "de-u-co-phonebk-x-icu" (provider = icu, locale = 'de-u-co-phonebk')</literal></term>
+      <listitem>
+       <para>German collation with phone book collation type</para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><literal>CREATE COLLATION "und-u-co-emoji-x-icu" (provider = icu, locale = 'und-u-co-emoji')</literal></term>
+      <listitem>
+       <para>
+        Root collation with Emoji collation type, per Unicode Technical Standard #51
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><literal>CREATE COLLATION digitslast (provider = icu, locale = 'en-u-kr-latn-digit')</literal></term>
+      <listitem>
+       <para>
+        Sort digits after Latin letters.  (The default is digits before letters.)
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><literal>CREATE COLLATION upperfirst (provider = icu, locale = 'en-u-kf-upper')</literal></term>
+      <listitem>
+       <para>
+        Sort upper-case letters before lower-case letters.  (The default is
+        lower-case letters first.)
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><literal>CREATE COLLATION special (provider = icu, locale = 'en-u-kf-upper-kr-latn-digit')</literal></term>
+      <listitem>
+       <para>
+        Combines both of the above options.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><literal>CREATE COLLATION numeric (provider = icu, locale = 'en-u-kn-true')</literal></term>
+      <listitem>
+       <para>
+        Numeric ordering, sorts sequences of digits by their numeric value,
+        for example: <literal>A-21</literal> &lt; <literal>A-123</literal>
+        (also known as natural sort).
+       </para>
+      </listitem>
+     </varlistentry>
+    </variablelist>
+
+    See <ulink url="http://unicode.org/reports/tr35/tr35-collation.html">Unicode
+    Technical Standard #35</ulink>
+    and <ulink url="https://tools.ietf.org/html/bcp47">BCP 47</ulink> for
+    details.  The list of possible collation types (<literal>co</literal>
+    subtag) can be found in
+    the <ulink url="http://www.unicode.org/repos/cldr/trunk/common/bcp47/collation.xml">CLDR
+    repository</ulink>.
+    The <ulink url="https://ssl.icu-project.org/icu-bin/locexp">ICU Locale
+    Explorer</ulink> can be used to check the details of a particular locale
+    definition.
+   </para>
+
+   <para>
+    Note that while this system allows creating collations that <quote>ignore
+    case</quote> or <quote>ignore accents</quote> or similar (using
+    the <literal>ks</literal> key), PostgreSQL does not at the moment allow
+    such collations to act in a truly case- or accent-insensitive manner.  Any
+    strings that compare equal according to the collation but are not
+    byte-wise equal will be sorted according to their byte values.
+   </para>
    </sect4>
    </sect3>
 
diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
@@ -687,30 +687,11 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
 		 */
 		for (i = -1; i < uloc_countAvailable(); i++)
 		{
-			/*
-			 * In ICU 4.2, ucol_getKeywordValuesForLocale() sometimes returns
-			 * values that will not be accepted by uloc_toLanguageTag().  Skip
-			 * loading keyword variants in that version.  (Both
-			 * ucol_getKeywordValuesForLocale() and uloc_toLanguageTag() are
-			 * new in ICU 4.2, so older versions are not supported at all.)
-			 *
-			 * XXX We have no information about ICU 4.3 through 4.7, but we
-			 * know the code below works with 4.8.
-			 */
-#if U_ICU_VERSION_MAJOR_NUM > 4 || (U_ICU_VERSION_MAJOR_NUM == 4 && U_ICU_VERSION_MINOR_NUM > 2)
-#define LOAD_ICU_KEYWORD_VARIANTS
-#endif
-
 			const char *name;
 			char	   *langtag;
 			char	   *icucomment;
 			const char *collcollate;
 			Oid			collid;
-#ifdef LOAD_ICU_KEYWORD_VARIANTS
-			UEnumeration *en;
-			UErrorCode	status;
-			const char *val;
-#endif
 
 			if (i == -1)
 				name = "";		/* ICU root locale */
@@ -744,58 +725,6 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
 					CreateComments(collid, CollationRelationId, 0,
 								   icucomment);
 			}
-
-			/*
-			 * Add keyword variants, if enabled.
-			 */
-#ifdef LOAD_ICU_KEYWORD_VARIANTS
-			status = U_ZERO_ERROR;
-			en = ucol_getKeywordValuesForLocale("collation", name, TRUE, &status);
-			if (U_FAILURE(status))
-				ereport(ERROR,
-						(errmsg("could not get keyword values for locale \"%s\": %s",
-								name, u_errorName(status))));
-
-			status = U_ZERO_ERROR;
-			uenum_reset(en, &status);
-			while ((val = uenum_next(en, NULL, &status)))
-			{
-				char	   *localeid = psprintf("%s@collation=%s", name, val);
-
-				langtag = get_icu_language_tag(localeid);
-				collcollate = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : localeid;
-
-				/*
-				 * Be paranoid about not allowing any non-ASCII strings into
-				 * pg_collation
-				 */
-				if (!is_all_ascii(langtag) || !is_all_ascii(collcollate))
-					continue;
-
-				collid = CollationCreate(psprintf("%s-x-icu", langtag),
-										 nspid, GetUserId(),
-										 COLLPROVIDER_ICU, -1,
-										 collcollate, collcollate,
-										 get_collation_actual_version(COLLPROVIDER_ICU, collcollate),
-										 true, true);
-				if (OidIsValid(collid))
-				{
-					ncreated++;
-
-					CommandCounterIncrement();
-
-					icucomment = get_icu_locale_comment(localeid);
-					if (icucomment)
-						CreateComments(collid, CollationRelationId, 0,
-									   icucomment);
-				}
-			}
-			if (U_FAILURE(status))
-				ereport(ERROR,
-						(errmsg("could not get keyword values for locale \"%s\": %s",
-								name, u_errorName(status))));
-			uenum_close(en);
-#endif							/* LOAD_ICU_KEYWORD_VARIANTS */
 		}
 	}
 #endif							/* USE_ICU */