postgrespro
diff --git a/‎contrib/bloom/bloom.h
Lines changed: 1 addition & 0 deletions b/‎contrib/bloom/bloom.h
Lines changed: 1 addition & 0 deletions
diff --git a/‎contrib/bloom/blutils.c
Lines changed: 2 additions & 1 deletion b/‎contrib/bloom/blutils.c
Lines changed: 2 additions & 1 deletion
diff --git a/‎doc/src/sgml/catalogs.sgml
Lines changed: 7 additions & 0 deletions b/‎doc/src/sgml/catalogs.sgml
Lines changed: 7 additions & 0 deletions
diff --git a/‎doc/src/sgml/charset.sgml
Lines changed: 56 additions & 5 deletions b/‎doc/src/sgml/charset.sgml
Lines changed: 56 additions & 5 deletions
diff --git a/‎doc/src/sgml/citext.sgml
Lines changed: 21 additions & 0 deletions b/‎doc/src/sgml/citext.sgml
Lines changed: 21 additions & 0 deletions
diff --git a/‎doc/src/sgml/func.sgml
Lines changed: 6 additions & 0 deletions b/‎doc/src/sgml/func.sgml
Lines changed: 6 additions & 0 deletions
diff --git a/‎doc/src/sgml/ref/create_collation.sgml
Lines changed: 22 additions & 0 deletions b/‎doc/src/sgml/ref/create_collation.sgml
Lines changed: 22 additions & 0 deletions
diff --git a/‎src/backend/access/hash/hashfunc.c
Lines changed: 89 additions & 11 deletions b/‎src/backend/access/hash/hashfunc.c
Lines changed: 89 additions & 11 deletions
diff --git a/‎src/backend/access/spgist/spgtextproc.c
Lines changed: 2 additions & 1 deletion b/‎src/backend/access/spgist/spgtextproc.c
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/backend/catalog/pg_collation.c
Lines changed: 2 additions & 0 deletions b/‎src/backend/catalog/pg_collation.c
Lines changed: 2 additions & 0 deletions
@@ -137,6 +137,7 @@ typedef struct BloomMetaPageData
 typedef struct BloomState
 {
 	FmgrInfo	hashFn[INDEX_MAX_KEYS];
+	Oid			collations[INDEX_MAX_KEYS];
 	BloomOptions opts;			/* copy of options on index's metapage */
 	int32		nColumns;
 
 
@@ -163,6 +163,7 @@ initBloomState(BloomState *state, Relation index)
 		fmgr_info_copy(&(state->hashFn[i]),
 					   index_getprocinfo(index, i + 1, BLOOM_HASH_PROC),
 					   CurrentMemoryContext);
+		state->collations[i] = index->rd_indcollation[i];
 	}
 
 	/* Initialize amcache if needed with options from metapage */
@@ -267,7 +268,7 @@ signValue(BloomState *state, BloomSignatureWord *sign, Datum value, int attno)
 	 * different columns will be mapped into different bits because of step
 	 * above
 	 */
-	hashVal = DatumGetInt32(FunctionCall1(&state->hashFn[attno], value));
+	hashVal = DatumGetInt32(FunctionCall1Coll(&state->hashFn[attno], state->collations[attno], value));
 	mySrand(hashVal ^ myRand());
 
 	for (j = 0; j < state->opts.bitSize[attno]; j++)
 
@@ -2077,6 +2077,13 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</replaceable>:<replaceable>&l
        default, <literal>c</literal> = libc, <literal>i</literal> = icu</entry>
      </row>
 
+     <row>
+      <entry><structfield>collisdeterministic</structfield></entry>
+      <entry><type>bool</type></entry>
+      <entry></entry>
+      <entry>Is the collation deterministic?</entry>
+     </row>
+
      <row>
       <entry><structfield>collencoding</structfield></entry>
       <entry><type>int4</type></entry>
 
@@ -847,11 +847,13 @@ CREATE COLLATION german (provider = libc, locale = 'de_DE');
 
    <para>
     Note that while this system allows creating collations that <quote>ignore
-    case</quote> or <quote>ignore accents</quote> or similar (using
-    the <literal>ks</literal> key), PostgreSQL does not at the moment allow
-    such collations to act in a truly case- or accent-insensitive manner.  Any
-    strings that compare equal according to the collation but are not
-    byte-wise equal will be sorted according to their byte values.
+    case</quote> or <quote>ignore accents</quote> or similar (using the
+    <literal>ks</literal> key), in order for such collations to act in a
+    truly case- or accent-insensitive manner, they also need to be declared as not
+    <firstterm>deterministic</firstterm> in <command>CREATE COLLATION</command>;
+    see <xref linkend="collation-nondeterministic"/>.
+    Otherwise, any strings that compare equal according to the collation but
+    are not byte-wise equal will be sorted according to their byte values.
    </para>
 
    <note>
@@ -883,6 +885,55 @@ CREATE COLLATION french FROM "fr-x-icu";
    </para>
    </sect4>
    </sect3>
+
+   <sect3 id="collation-nondeterministic">
+    <title>Nondeterministic Collations</title>
+
+    <para>
+     A collation is either <firstterm>deterministic</firstterm> or
+     <firstterm>nondeterministic</firstterm>.  A deterministic collation uses
+     deterministic comparisons, which means that it considers strings to be
+     equal only if they consist of the same byte sequence.  Nondeterministic
+     comparison may determine strings to be equal even if they consist of
+     different bytes.  Typical situations include case-insensitive comparison,
+     accent-insensitive comparison, as well as comparison of strings in
+     different Unicode normal forms.  It is up to the collation provider to
+     actually implement such insensitive comparisons; the deterministic flag
+     only determines whether ties are to be broken using bytewise comparison.
+     See also <ulink url="https://unicode.org/reports/tr10">Unicode Technical
+     Standard 10</ulink> for more information on the terminology.
+    </para>
+
+    <para>
+     To create a nondeterministic collation, specify the property
+     <literal>deterministic = false</literal> to <command>CREATE
+     COLLATION</command>, for example:
+<programlisting>
+CREATE COLLATION ndcoll (provider = icu, locale = 'und', deterministic = false);
+</programlisting>
+     This example would use the standard Unicode collation in a
+     nondeterministic way.  In particular, this would allow strings in
+     different normal forms to be compared correctly.  More interesting
+     examples make use of the ICU customization facilities explained above.
+     For example:
+<programlisting>
+CREATE COLLATION case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
+CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-true', deterministic = false);
+</programlisting>
+    </para>
+
+    <para>
+     All standard and predefined collations are deterministic; all
+     user-defined collations are deterministic by default.  While
+     nondeterministic collations give a more <quote>correct</quote> behavior,
+     especially when considering the full power of Unicode and its many
+     special cases, they also have some drawbacks.  Foremost, their use leads
+     to a performance penalty.  Also, certain operations are not possible with
+     nondeterministic collations, such as pattern matching operations.
+     Therefore, they should be used only in cases where they are specifically
+     wanted.
+    </para>
+   </sect3>
   </sect2>
  </sect1>
 
 
@@ -14,6 +14,16 @@
   exactly like <type>text</type>.
  </para>
 
+ <tip>
+  <para>
+   Consider using <firstterm>nondeterministic collations</firstterm> (see
+   <xref linkend="collation-nondeterministic"/>) instead of this module.  They
+   can be used for case-insensitive comparisons, accent-insensitive
+   comparisons, and other combinations, and they handle more Unicode special
+   cases correctly.
+  </para>
+ </tip>
+
  <sect2>
   <title>Rationale</title>
 
@@ -246,6 +256,17 @@ SELECT * FROM users WHERE nick = 'Larry';
       will be invoked instead.
     </para>
     </listitem>
+
+    <listitem>
+     <para>
+      The approach of lower-casing strings for comparison does not handle some
+      Unicode special cases correctly, for example when one upper-case letter
+      has two lower-case letter equivalents.  Unicode distinguishes between
+      <firstterm>case mapping</firstterm> and <firstterm>case
+      folding</firstterm> for this reason.  Use nondeterministic collations
+      instead of <type>citext</type> to handle that correctly.
+     </para>
+    </listitem>
    </itemizedlist>
  </sect2>
 
 
@@ -4065,6 +4065,12 @@ cast(-44 as bit(12))           <lineannotation>111111010100</lineannotation>
     </para>
    </caution>
 
+   <para>
+    The pattern matching operators of all three kinds do not support
+    nondeterministic collations.  If required, apply a different collation to
+    the expression to work around this limitation.
+   </para>
+
   <sect2 id="functions-like">
    <title><function>LIKE</function></title>
 
 
@@ -23,6 +23,7 @@ CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> (
     [ LC_COLLATE = <replaceable>lc_collate</replaceable>, ]
     [ LC_CTYPE = <replaceable>lc_ctype</replaceable>, ]
     [ PROVIDER = <replaceable>provider</replaceable>, ]
+    [ DETERMINISTIC = <replaceable>boolean</replaceable>, ]
     [ VERSION = <replaceable>version</replaceable> ]
 )
 CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> FROM <replaceable>existing_collation</replaceable>
@@ -124,6 +125,27 @@ CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> FROM <replace
      </listitem>
     </varlistentry>
 
+    <varlistentry>
+     <term><literal>DETERMINISTIC</literal></term>
+
+     <listitem>
+      <para>
+       Specifies whether the collation should use deterministic comparisons.
+       The default is true.  A deterministic comparison considers strings that
+       are not byte-wise equal to be unequal even if they are considered
+       logically equal by the comparison.  PostgreSQL breaks ties using a
+       byte-wise comparison.  Comparison that is not deterministic can make the
+       collation be, say, case- or accent-insensitive.  For that, you need to
+       choose an appropriate <literal>LC_COLLATE</literal> setting
+       <emphasis>and</emphasis> set the collation to not deterministic here.
+      </para>
+
+      <para>
+       Nondeterministic collations are only supported with the ICU provider.
+      </para>
+     </listitem>
+    </varlistentry>
+
     <varlistentry>
      <term><replaceable>version</replaceable></term>
 
 
@@ -27,8 +27,10 @@
 #include "postgres.h"
 
 #include "access/hash.h"
+#include "catalog/pg_collation.h"
 #include "utils/builtins.h"
 #include "utils/hashutils.h"
+#include "utils/pg_locale.h"
 
 /*
  * Datatype-specific hash functions.
@@ -243,15 +245,51 @@ Datum
 hashtext(PG_FUNCTION_ARGS)
 {
 	text	   *key = PG_GETARG_TEXT_PP(0);
+	Oid			collid = PG_GET_COLLATION();
+	pg_locale_t	mylocale = 0;
 	Datum		result;
 
-	/*
-	 * Note: this is currently identical in behavior to hashvarlena, but keep
-	 * it as a separate function in case we someday want to do something
-	 * different in non-C locales.  (See also hashbpchar, if so.)
-	 */
-	result = hash_any((unsigned char *) VARDATA_ANY(key),
-					  VARSIZE_ANY_EXHDR(key));
+	if (!collid)
+		ereport(ERROR,
+				(errcode(ERRCODE_INDETERMINATE_COLLATION),
+				 errmsg("could not determine which collation to use for string hashing"),
+				 errhint("Use the COLLATE clause to set the collation explicitly.")));
+
+	if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
+		mylocale = pg_newlocale_from_collation(collid);
+
+	if (!mylocale || mylocale->deterministic)
+	{
+		result = hash_any((unsigned char *) VARDATA_ANY(key),
+						  VARSIZE_ANY_EXHDR(key));
+	}
+	else
+	{
+#ifdef USE_ICU
+		if (mylocale->provider == COLLPROVIDER_ICU)
+		{
+			int32_t		ulen = -1;
+			UChar	   *uchar = NULL;
+			Size		bsize;
+			uint8_t	   *buf;
+
+			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
+
+			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
+									uchar, ulen, NULL, 0);
+			buf = palloc(bsize);
+			ucol_getSortKey(mylocale->info.icu.ucol,
+							uchar, ulen, buf, bsize);
+
+			result = hash_any(buf, bsize);
+
+			pfree(buf);
+		}
+		else
+#endif
+			/* shouldn't happen */
+			elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
+	}
 
 	/* Avoid leaking memory for toasted inputs */
 	PG_FREE_IF_COPY(key, 0);
@@ -263,12 +301,52 @@ Datum
 hashtextextended(PG_FUNCTION_ARGS)
 {
 	text	   *key = PG_GETARG_TEXT_PP(0);
+	Oid			collid = PG_GET_COLLATION();
+	pg_locale_t	mylocale = 0;
 	Datum		result;
 
-	/* Same approach as hashtext */
-	result = hash_any_extended((unsigned char *) VARDATA_ANY(key),
-							   VARSIZE_ANY_EXHDR(key),
-							   PG_GETARG_INT64(1));
+	if (!collid)
+		ereport(ERROR,
+				(errcode(ERRCODE_INDETERMINATE_COLLATION),
+				 errmsg("could not determine which collation to use for string hashing"),
+				 errhint("Use the COLLATE clause to set the collation explicitly.")));
+
+	if (!lc_collate_is_c(collid) && collid != DEFAULT_COLLATION_OID)
+		mylocale = pg_newlocale_from_collation(collid);
+
+	if (!mylocale || mylocale->deterministic)
+	{
+		result = hash_any_extended((unsigned char *) VARDATA_ANY(key),
+								   VARSIZE_ANY_EXHDR(key),
+								   PG_GETARG_INT64(1));
+	}
+	else
+	{
+#ifdef USE_ICU
+		if (mylocale->provider == COLLPROVIDER_ICU)
+		{
+			int32_t		ulen = -1;
+			UChar	   *uchar = NULL;
+			Size		bsize;
+			uint8_t	   *buf;
+
+			ulen = icu_to_uchar(&uchar, VARDATA_ANY(key), VARSIZE_ANY_EXHDR(key));
+
+			bsize = ucol_getSortKey(mylocale->info.icu.ucol,
+									uchar, ulen, NULL, 0);
+			buf = palloc(bsize);
+			ucol_getSortKey(mylocale->info.icu.ucol,
+							uchar, ulen, buf, bsize);
+
+			result = hash_any_extended(buf, bsize, PG_GETARG_INT64(1));
+
+			pfree(buf);
+		}
+		else
+#endif
+			/* shouldn't happen */
+			elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
+	}
 
 	PG_FREE_IF_COPY(key, 0);
 
 
@@ -630,7 +630,8 @@ spg_text_leaf_consistent(PG_FUNCTION_ARGS)
 			 * query (prefix) string, so we don't need to check it again.
 			 */
 			res = (level >= queryLen) ||
-				DatumGetBool(DirectFunctionCall2(text_starts_with,
+				DatumGetBool(DirectFunctionCall2Coll(text_starts_with,
+													 PG_GET_COLLATION(),
 												 out->leafValue,
 												 PointerGetDatum(query)));
 
 
@@ -46,6 +46,7 @@ Oid
 CollationCreate(const char *collname, Oid collnamespace,
 				Oid collowner,
 				char collprovider,
+				bool collisdeterministic,
 				int32 collencoding,
 				const char *collcollate, const char *collctype,
 				const char *collversion,
@@ -160,6 +161,7 @@ CollationCreate(const char *collname, Oid collnamespace,
 	values[Anum_pg_collation_collnamespace - 1] = ObjectIdGetDatum(collnamespace);
 	values[Anum_pg_collation_collowner - 1] = ObjectIdGetDatum(collowner);
 	values[Anum_pg_collation_collprovider - 1] = CharGetDatum(collprovider);
+	values[Anum_pg_collation_collisdeterministic - 1] = BoolGetDatum(collisdeterministic);
 	values[Anum_pg_collation_collencoding - 1] = Int32GetDatum(collencoding);
 	namestrcpy(&name_collate, collcollate);
 	values[Anum_pg_collation_collcollate - 1] = NameGetDatum(&name_collate);
Original file line number	Diff line number	Diff line change
`@@ -137,6 +137,7 @@ typedef struct BloomMetaPageData`
`137`	`137`	`typedef struct BloomState`
`138`	`138`	`{`
`139`	`139`	`FmgrInfo hashFn[INDEX_MAX_KEYS];`
	`140`	`+ Oid collations[INDEX_MAX_KEYS];`
`140`	`141`	`BloomOptions opts; /* copy of options on index's metapage */`
`141`	`142`	`int32 nColumns;`
`142`	`143`