postgrespro
diff --git a/‎doc/src/sgml/charset.sgml
Lines changed: 10 additions & 0 deletions b/‎doc/src/sgml/charset.sgml
Lines changed: 10 additions & 0 deletions
diff --git a/‎doc/src/sgml/func.sgml
Lines changed: 48 additions & 0 deletions b/‎doc/src/sgml/func.sgml
Lines changed: 48 additions & 0 deletions
diff --git a/‎src/backend/catalog/sql_features.txt
Lines changed: 1 addition & 1 deletion b/‎src/backend/catalog/sql_features.txt
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/backend/catalog/system_views.sql
Lines changed: 15 additions & 0 deletions b/‎src/backend/catalog/system_views.sql
Lines changed: 15 additions & 0 deletions
diff --git a/‎src/backend/parser/gram.y
Lines changed: 40 additions & 1 deletion b/‎src/backend/parser/gram.y
Lines changed: 40 additions & 1 deletion
diff --git a/‎src/backend/utils/adt/varlena.c
Lines changed: 150 additions & 0 deletions b/‎src/backend/utils/adt/varlena.c
Lines changed: 150 additions & 0 deletions
diff --git a/‎src/common/unicode/.gitignore
Lines changed: 1 addition & 0 deletions b/‎src/common/unicode/.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/common/unicode/Makefile
Lines changed: 6 additions & 3 deletions b/‎src/common/unicode/Makefile
Lines changed: 6 additions & 3 deletions
@@ -934,6 +934,16 @@ CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-tr
      such as pattern matching operations.  Therefore, they should be used
      only in cases where they are specifically wanted.
     </para>
+
+    <tip>
+     <para>
+      To deal with text in different Unicode normalization forms, you can
+      also use the function <function>normalize</function> and the
+      expression <literal>is normalized</literal> to preprocess or check
+      the strings, instead of using nondeterministic collations.  Each
+      approach has different trade-offs.
+     </para>
+    </tip>
    </sect3>
   </sect2>
  </sect1>
 
@@ -1560,6 +1560,30 @@
        <entry><literal>Value: 42</literal></entry>
       </row>
 
+      <row>
+       <entry>
+        <indexterm>
+         <primary>normalized</primary>
+        </indexterm>
+        <indexterm>
+         <primary>Unicode normalization</primary>
+        </indexterm>
+        <literal><parameter>string</parameter> is <optional>not</optional> <optional><parameter>form</parameter></optional> normalized</literal>
+       </entry>
+       <entry><type>boolean</type></entry>
+       <entry>
+        Checks whether the string is in the specified Unicode normalization
+        form.  The optional parameter specifies the form:
+        <literal>NFC</literal> (default), <literal>NFD</literal>,
+        <literal>NFKC</literal>, <literal>NFKD</literal>.  This expression can
+        only be used if the server encoding is <literal>UTF8</literal>.  Note
+        that checking for normalization using this expression is often faster
+        than normalizing possibly already normalized strings.
+       </entry>
+       <entry><literal>U&amp;'\0061\0308bc' IS NFD NORMALIZED</literal></entry>
+       <entry><literal>true</literal></entry>
+      </row>
+
       <row>
        <entry>
         <indexterm>
@@ -1610,6 +1634,30 @@
        <entry><literal>tom</literal></entry>
       </row>
 
+      <row>
+       <entry>
+        <indexterm>
+         <primary>normalize</primary>
+        </indexterm>
+        <indexterm>
+         <primary>Unicode normalization</primary>
+        </indexterm>
+        <literal><function>normalize(<parameter>string</parameter> <type>text</type>
+        <optional>, <parameter>form</parameter> </optional>)</function></literal>
+       </entry>
+       <entry><type>text</type></entry>
+       <entry>
+        Converts the string in the first argument to the specified Unicode
+        normalization form.  The optional second argument specifies the form
+        as an identifier: <literal>NFC</literal> (default),
+        <literal>NFD</literal>, <literal>NFKC</literal>,
+        <literal>NFKD</literal>.  This function can only be used if the server
+        encoding is <literal>UTF8</literal>.
+       </entry>
+       <entry><literal>normalize(U&amp;'\0061\0308bc', NFC)</literal></entry>
+       <entry><literal>U&amp;'\00E4bc'</literal></entry>
+      </row>
+
       <row>
        <entry>
         <indexterm>
 
@@ -257,7 +257,7 @@ F386	Set identity column generation clause			YES
 F391	Long identifiers			YES	
 F392	Unicode escapes in identifiers			YES	
 F393	Unicode escapes in literals			YES	
-F394	Optional normal form specification			NO	
+F394	Optional normal form specification			YES	
 F401	Extended joined table			YES	
 F401	Extended joined table	01	NATURAL JOIN	YES	
 F401	Extended joined table	02	FULL OUTER JOIN	YES	
 
@@ -1400,6 +1400,21 @@ LANGUAGE INTERNAL
 STRICT STABLE PARALLEL SAFE
 AS 'jsonb_path_query_first_tz';
 
+-- default normalization form is NFC, per SQL standard
+-- (gram.y turns the NORMALIZE(string [, form]) syntax into a call to this
+-- function; the name is quoted here presumably because NORMALIZE is made a
+-- col_name_keyword by the same change)
+CREATE OR REPLACE FUNCTION
+  "normalize"(text, text DEFAULT 'NFC')
+RETURNS text
+LANGUAGE internal
+STRICT IMMUTABLE PARALLEL SAFE
+AS 'unicode_normalize_func';
+
+-- Target of the "string IS [NOT] [form] NORMALIZED" syntax: gram.y turns
+-- that expression into a call to is_normalized(), wrapped in NOT when
+-- requested.  The form argument defaults to NFC when it is omitted.
+CREATE OR REPLACE FUNCTION
+  is_normalized(text, text DEFAULT 'NFC')
+RETURNS boolean
+LANGUAGE internal
+STRICT IMMUTABLE PARALLEL SAFE
+AS 'unicode_is_normalized';
+
 --
 -- The default permissions for functions mean that anyone can execute them.
 -- A number of functions shouldn't be executable by just anyone, but rather
 
@@ -444,6 +444,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 %type <list>	substr_list trim_list
 %type <list>	opt_interval interval_second
 %type <node>	overlay_placing substr_from substr_for
+%type <str>		unicode_normal_form
 
 %type <boolean> opt_instead
 %type <boolean> opt_unique opt_concurrently opt_verbose opt_full
@@ -664,7 +665,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 
 	MAPPING MATCH MATERIALIZED MAXVALUE METHOD MINUTE_P MINVALUE MODE MONTH_P MOVE
 
-	NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NO NONE
+	NAME_P NAMES NATIONAL NATURAL NCHAR NEW NEXT NFC NFD NFKC NFKD NO NONE
+	NORMALIZE NORMALIZED
 	NOT NOTHING NOTIFY NOTNULL NOWAIT NULL_P NULLIF
 	NULLS_P NUMERIC
 
@@ -13491,6 +13493,22 @@ a_expr:		c_expr									{ $$ = $1; }
 												 list_make1($1), @2),
 									 @2);
 				}
+			| a_expr IS NORMALIZED								%prec IS
+				{
+					$$ = (Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make1($1), @2);
+				}
+			| a_expr IS unicode_normal_form NORMALIZED			%prec IS
+				{
+					$$ = (Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make2($1, makeStringConst($3, @3)), @2);
+				}
+			| a_expr IS NOT NORMALIZED							%prec IS
+				{
+					$$ = makeNotExpr((Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make1($1), @2), @2);
+				}
+			| a_expr IS NOT unicode_normal_form NORMALIZED		%prec IS
+				{
+					$$ = makeNotExpr((Node *) makeFuncCall(SystemFuncName("is_normalized"), list_make2($1, makeStringConst($4, @4)), @2), @2);
+				}
 			| DEFAULT
 				{
 					/*
@@ -13934,6 +13952,14 @@ func_expr_common_subexpr:
 				{
 					$$ = (Node *) makeFuncCall(SystemFuncName("date_part"), $3, @1);
 				}
+			| NORMALIZE '(' a_expr ')'
+				{
+					$$ = (Node *) makeFuncCall(SystemFuncName("normalize"), list_make1($3), @1);
+				}
+			| NORMALIZE '(' a_expr ',' unicode_normal_form ')'
+				{
+					$$ = (Node *) makeFuncCall(SystemFuncName("normalize"), list_make2($3, makeStringConst($5, @5)), @1);
+				}
 			| OVERLAY '(' overlay_list ')'
 				{
 					/* overlay(A PLACING B FROM C FOR D) is converted to
@@ -14569,6 +14595,13 @@ extract_arg:
 			| Sconst								{ $$ = $1; }
 		;
 
+/*
+ * Unicode normalization form keyword, as used by NORMALIZE() and
+ * IS [NOT] ... NORMALIZED.  Each keyword yields the form name as a
+ * lower-case C string; the rules that use this nonterminal wrap it with
+ * makeStringConst() to pass it as the second function argument.
+ */
+unicode_normal_form:
+			NFC										{ $$ = "nfc"; }
+			| NFD									{ $$ = "nfd"; }
+			| NFKC									{ $$ = "nfkc"; }
+			| NFKD									{ $$ = "nfkd"; }
+		;
+
 /* OVERLAY() arguments
  * SQL99 defines the OVERLAY() function:
  * o overlay(text placing text from int for int)
@@ -15315,7 +15348,12 @@ unreserved_keyword:
 			| NAMES
 			| NEW
 			| NEXT
+			| NFC
+			| NFD
+			| NFKC
+			| NFKD
 			| NO
+			| NORMALIZED
 			| NOTHING
 			| NOTIFY
 			| NOWAIT
@@ -15494,6 +15532,7 @@ col_name_keyword:
 			| NATIONAL
 			| NCHAR
 			| NONE
+			| NORMALIZE
 			| NULLIF
 			| NUMERIC
 			| OUT_P
 
@@ -22,6 +22,7 @@
 #include "catalog/pg_type.h"
 #include "common/hashfn.h"
 #include "common/int.h"
+#include "common/unicode_norm.h"
 #include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
@@ -5976,3 +5977,152 @@ rest_of_char_same(const char *s1, const char *s2, int len)
 #include "levenshtein.c"
 #define LEVENSHTEIN_LESS_EQUAL
 #include "levenshtein.c"
+
+
+/*
+ * Unicode support
+ */
+
+/*
+ * Map a normalization form name to its UnicodeNormalizationForm value.
+ *
+ * The name is compared case-insensitively against NFC, NFD, NFKC and NFKD;
+ * any other name raises an "invalid normalization form" error.
+ *
+ * Before looking at the name, this also raises an error unless the database
+ * encoding is UTF8.
+ */
+static UnicodeNormalizationForm
+unicode_norm_form_from_string(const char *formstr)
+{
+	/*
+	 * The SQL-callable functions in this file all resolve their form
+	 * argument here, so this is a convenient spot for the encoding check.
+	 */
+	if (GetDatabaseEncoding() != PG_UTF8)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("Unicode normalization can only be performed if server encoding is UTF8")));
+
+	if (pg_strcasecmp(formstr, "NFC") == 0)
+		return UNICODE_NFC;
+	if (pg_strcasecmp(formstr, "NFD") == 0)
+		return UNICODE_NFD;
+	if (pg_strcasecmp(formstr, "NFKC") == 0)
+		return UNICODE_NFKC;
+	if (pg_strcasecmp(formstr, "NFKD") == 0)
+		return UNICODE_NFKD;
+
+	ereport(ERROR,
+			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+			 errmsg("invalid normalization form: %s", formstr)));
+
+	/* ereport(ERROR) is not expected to return; keep the compiler quiet */
+	return (UnicodeNormalizationForm) -1;
+}
+
+/*
+ * SQL-callable function: convert a text value to a Unicode normalization
+ * form.
+ *
+ * Argument 0 is the string to normalize, argument 1 the name of the form
+ * (resolved by unicode_norm_form_from_string(), which also rejects
+ * databases whose encoding is not UTF8).  Returns a newly allocated text
+ * value holding the normalized string.
+ */
+Datum
+unicode_normalize_func(PG_FUNCTION_ARGS)
+{
+	text	   *src = PG_GETARG_TEXT_PP(0);
+	char	   *form_name = text_to_cstring(PG_GETARG_TEXT_PP(1));
+	UnicodeNormalizationForm form;
+	int			nchars;
+	int			out_bytes = 0;
+	int			k;
+	pg_wchar   *codepoints;
+	pg_wchar   *normalized;
+	pg_wchar   *cp;
+	unsigned char *cursor;
+	text	   *result;
+
+	form = unicode_norm_form_from_string(form_name);
+
+	/* Decode the UTF-8 payload into a zero-terminated code point array. */
+	nchars = pg_mbstrlen_with_len(VARDATA_ANY(src), VARSIZE_ANY_EXHDR(src));
+	codepoints = palloc((nchars + 1) * sizeof(pg_wchar));
+	cursor = (unsigned char *) VARDATA_ANY(src);
+	k = 0;
+	while (k < nchars)
+	{
+		codepoints[k++] = utf8_to_unicode(cursor);
+		cursor += pg_utf_mblen(cursor);
+	}
+	codepoints[k] = (pg_wchar) '\0';
+	/* the decode loop must have consumed exactly the input bytes */
+	Assert((char *) cursor == VARDATA_ANY(src) + VARSIZE_ANY_EXHDR(src));
+
+	/* The normalization itself is done on the code point array. */
+	normalized = unicode_normalize(form, codepoints);
+
+	/* First pass over the output: add up the UTF-8 bytes it will need. */
+	for (cp = normalized; *cp != 0; cp++)
+	{
+		unsigned char scratch[4];
+
+		unicode_to_utf8(*cp, scratch);
+		out_bytes += pg_utf_mblen(scratch);
+	}
+
+	result = palloc(VARHDRSZ + out_bytes);
+	SET_VARSIZE(result, VARHDRSZ + out_bytes);
+
+	/* Second pass: encode each code point directly into the result. */
+	cursor = (unsigned char *) VARDATA_ANY(result);
+	for (cp = normalized; *cp != 0; cp++)
+	{
+		unicode_to_utf8(*cp, cursor);
+		cursor += pg_utf_mblen(cursor);
+	}
+	/* both passes must agree on the encoded length */
+	Assert((char *) cursor == (char *) result + VARHDRSZ + out_bytes);
+
+	PG_RETURN_TEXT_P(result);
+}
+
+/*
+ * Check whether the string is in the specified Unicode normalization form.
+ *
+ * This is done by converting the string to the specified normal form and then
+ * comparing that to the original string.  To speed that up, we also apply the
+ * "quick check" algorithm specified in UAX #15, which can give a yes or no
+ * answer for many strings by just scanning the string once.
+ *
+ * This function should generally be optimized for the case where the string
+ * is in fact normalized.  In that case, we'll end up looking at the entire
+ * string, so it's probably not worth doing any incremental conversion etc.
+ */
+Datum
+unicode_is_normalized(PG_FUNCTION_ARGS)
+{
+	text	   *src = PG_GETARG_TEXT_PP(0);
+	char	   *form_name = text_to_cstring(PG_GETARG_TEXT_PP(1));
+	UnicodeNormalizationForm form;
+	UnicodeNormalizationQC qc;
+	int			nchars;
+	int			normalized_len;
+	int			k;
+	pg_wchar   *codepoints;
+	pg_wchar   *normalized;
+	unsigned char *cursor;
+
+	/* resolves the form name; also errors out unless the encoding is UTF8 */
+	form = unicode_norm_form_from_string(form_name);
+
+	/* Decode the UTF-8 payload into a zero-terminated code point array. */
+	nchars = pg_mbstrlen_with_len(VARDATA_ANY(src), VARSIZE_ANY_EXHDR(src));
+	codepoints = palloc((nchars + 1) * sizeof(pg_wchar));
+	cursor = (unsigned char *) VARDATA_ANY(src);
+	k = 0;
+	while (k < nchars)
+	{
+		codepoints[k++] = utf8_to_unicode(cursor);
+		cursor += pg_utf_mblen(cursor);
+	}
+	codepoints[k] = (pg_wchar) '\0';
+	/* the decode loop must have consumed exactly the input bytes */
+	Assert((char *) cursor == VARDATA_ANY(src) + VARSIZE_ANY_EXHDR(src));
+
+	/*
+	 * Quick check (see UAX #15): a definite yes or no settles the question
+	 * right away; any other outcome falls through to the full comparison.
+	 */
+	qc = unicode_is_normalized_quickcheck(form, codepoints);
+	if (qc == UNICODE_NORM_QC_YES)
+		PG_RETURN_BOOL(true);
+	if (qc == UNICODE_NORM_QC_NO)
+		PG_RETURN_BOOL(false);
+
+	/* Undecided: normalize and compare the result with the original. */
+	normalized = unicode_normalize(form, codepoints);
+
+	normalized_len = 0;
+	while (normalized[normalized_len] != 0)
+		normalized_len++;
+
+	/* a different number of code points means the string was changed */
+	if (normalized_len != nchars)
+		PG_RETURN_BOOL(false);
+
+	PG_RETURN_BOOL(memcmp(codepoints, normalized,
+						  nchars * sizeof(pg_wchar)) == 0);
+}
@@ -3,5 +3,6 @@
 
 # Downloaded files
 /CompositionExclusions.txt
+/DerivedNormalizationProps.txt
 /NormalizationTest.txt
 /UnicodeData.txt
@@ -18,14 +18,14 @@ LIBS += $(PTHREAD_LIBS)
 # By default, do nothing.
 all:
 
-update-unicode: unicode_norm_table.h unicode_combining_table.h
+update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h
 	$(MAKE) normalization-check
-	mv unicode_norm_table.h unicode_combining_table.h ../../../src/include/common/
+	mv $^ ../../../src/include/common/
 
 # These files are part of the Unicode Character Database. Download
 # them on demand.  The dependency on Makefile.global is for
 # UNICODE_VERSION.
-UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
+UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
 	$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
 
 # Generation of conversion tables used for string normalization with
@@ -36,6 +36,9 @@ unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt Composition
 unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
 	$(PERL) $^ >$@
 
+# Generate unicode_normprops_table.h by running the generator script with
+# DerivedNormalizationProps.txt as its argument and capturing its output.
+unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizationProps.txt
+	$(PERL) $^ >$@
+
 # Test suite
 normalization-check: norm_test
 	./norm_test