|
22 | 22 | #include "catalog/pg_type.h"
|
23 | 23 | #include "common/hashfn.h"
|
24 | 24 | #include "common/int.h"
|
| 25 | +#include "common/unicode_norm.h" |
25 | 26 | #include "lib/hyperloglog.h"
|
26 | 27 | #include "libpq/pqformat.h"
|
27 | 28 | #include "miscadmin.h"
|
@@ -5976,3 +5977,152 @@ rest_of_char_same(const char *s1, const char *s2, int len)
|
5976 | 5977 | #include "levenshtein.c"
|
5977 | 5978 | #define LEVENSHTEIN_LESS_EQUAL
|
5978 | 5979 | #include "levenshtein.c"
|
| 5980 | + |
| 5981 | + |
| 5982 | +/* |
| 5983 | + * Unicode support |
| 5984 | + */ |
| 5985 | + |
| 5986 | +static UnicodeNormalizationForm |
| 5987 | +unicode_norm_form_from_string(const char *formstr) |
| 5988 | +{ |
| 5989 | + UnicodeNormalizationForm form = -1; |
| 5990 | + |
| 5991 | + /* |
| 5992 | + * Might as well check this while we're here. |
| 5993 | + */ |
| 5994 | + if (GetDatabaseEncoding() != PG_UTF8) |
| 5995 | + ereport(ERROR, |
| 5996 | + (errcode(ERRCODE_SYNTAX_ERROR), |
| 5997 | + errmsg("Unicode normalization can only be performed if server encoding is UTF8"))); |
| 5998 | + |
| 5999 | + if (pg_strcasecmp(formstr, "NFC") == 0) |
| 6000 | + form = UNICODE_NFC; |
| 6001 | + else if (pg_strcasecmp(formstr, "NFD") == 0) |
| 6002 | + form = UNICODE_NFD; |
| 6003 | + else if (pg_strcasecmp(formstr, "NFKC") == 0) |
| 6004 | + form = UNICODE_NFKC; |
| 6005 | + else if (pg_strcasecmp(formstr, "NFKD") == 0) |
| 6006 | + form = UNICODE_NFKD; |
| 6007 | + else |
| 6008 | + ereport(ERROR, |
| 6009 | + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| 6010 | + errmsg("invalid normalization form: %s", formstr))); |
| 6011 | + |
| 6012 | + return form; |
| 6013 | +} |
| 6014 | + |
| 6015 | +Datum |
| 6016 | +unicode_normalize_func(PG_FUNCTION_ARGS) |
| 6017 | +{ |
| 6018 | + text *input = PG_GETARG_TEXT_PP(0); |
| 6019 | + char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1)); |
| 6020 | + UnicodeNormalizationForm form; |
| 6021 | + int size; |
| 6022 | + pg_wchar *input_chars; |
| 6023 | + pg_wchar *output_chars; |
| 6024 | + unsigned char *p; |
| 6025 | + text *result; |
| 6026 | + int i; |
| 6027 | + |
| 6028 | + form = unicode_norm_form_from_string(formstr); |
| 6029 | + |
| 6030 | + /* convert to pg_wchar */ |
| 6031 | + size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); |
| 6032 | + input_chars = palloc((size + 1) * sizeof(pg_wchar)); |
| 6033 | + p = (unsigned char *) VARDATA_ANY(input); |
| 6034 | + for (i = 0; i < size; i++) |
| 6035 | + { |
| 6036 | + input_chars[i] = utf8_to_unicode(p); |
| 6037 | + p += pg_utf_mblen(p); |
| 6038 | + } |
| 6039 | + input_chars[i] = (pg_wchar) '\0'; |
| 6040 | + Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); |
| 6041 | + |
| 6042 | + /* action */ |
| 6043 | + output_chars = unicode_normalize(form, input_chars); |
| 6044 | + |
| 6045 | + /* convert back to UTF-8 string */ |
| 6046 | + size = 0; |
| 6047 | + for (pg_wchar *wp = output_chars; *wp; wp++) |
| 6048 | + { |
| 6049 | + unsigned char buf[4]; |
| 6050 | + |
| 6051 | + unicode_to_utf8(*wp, buf); |
| 6052 | + size += pg_utf_mblen(buf); |
| 6053 | + } |
| 6054 | + |
| 6055 | + result = palloc(size + VARHDRSZ); |
| 6056 | + SET_VARSIZE(result, size + VARHDRSZ); |
| 6057 | + |
| 6058 | + p = (unsigned char *) VARDATA_ANY(result); |
| 6059 | + for (pg_wchar *wp = output_chars; *wp; wp++) |
| 6060 | + { |
| 6061 | + unicode_to_utf8(*wp, p); |
| 6062 | + p += pg_utf_mblen(p); |
| 6063 | + } |
| 6064 | + Assert((char *) p == (char *) result + size + VARHDRSZ); |
| 6065 | + |
| 6066 | + PG_RETURN_TEXT_P(result); |
| 6067 | +} |
| 6068 | + |
| 6069 | +/* |
| 6070 | + * Check whether the string is in the specified Unicode normalization form. |
| 6071 | + * |
| 6072 | + * This is done by converting the string to the specified normal form and then |
| 6073 | + * comparing that to the original string. To speed that up, we also apply the |
| 6074 | + * "quick check" algorithm specified in UAX #15, which can give a yes or no |
| 6075 | + * answer for many strings by just scanning the string once. |
| 6076 | + * |
| 6077 | + * This function should generally be optimized for the case where the string |
| 6078 | + * is in fact normalized. In that case, we'll end up looking at the entire |
| 6079 | + * string, so it's probably not worth doing any incremental conversion etc. |
| 6080 | + */ |
| 6081 | +Datum |
| 6082 | +unicode_is_normalized(PG_FUNCTION_ARGS) |
| 6083 | +{ |
| 6084 | + text *input = PG_GETARG_TEXT_PP(0); |
| 6085 | + char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1)); |
| 6086 | + UnicodeNormalizationForm form; |
| 6087 | + int size; |
| 6088 | + pg_wchar *input_chars; |
| 6089 | + pg_wchar *output_chars; |
| 6090 | + unsigned char *p; |
| 6091 | + int i; |
| 6092 | + UnicodeNormalizationQC quickcheck; |
| 6093 | + int output_size; |
| 6094 | + bool result; |
| 6095 | + |
| 6096 | + form = unicode_norm_form_from_string(formstr); |
| 6097 | + |
| 6098 | + /* convert to pg_wchar */ |
| 6099 | + size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input)); |
| 6100 | + input_chars = palloc((size + 1) * sizeof(pg_wchar)); |
| 6101 | + p = (unsigned char *) VARDATA_ANY(input); |
| 6102 | + for (i = 0; i < size; i++) |
| 6103 | + { |
| 6104 | + input_chars[i] = utf8_to_unicode(p); |
| 6105 | + p += pg_utf_mblen(p); |
| 6106 | + } |
| 6107 | + input_chars[i] = (pg_wchar) '\0'; |
| 6108 | + Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input)); |
| 6109 | + |
| 6110 | + /* quick check (see UAX #15) */ |
| 6111 | + quickcheck = unicode_is_normalized_quickcheck(form, input_chars); |
| 6112 | + if (quickcheck == UNICODE_NORM_QC_YES) |
| 6113 | + PG_RETURN_BOOL(true); |
| 6114 | + else if (quickcheck == UNICODE_NORM_QC_NO) |
| 6115 | + PG_RETURN_BOOL(false); |
| 6116 | + |
| 6117 | + /* normalize and compare with original */ |
| 6118 | + output_chars = unicode_normalize(form, input_chars); |
| 6119 | + |
| 6120 | + output_size = 0; |
| 6121 | + for (pg_wchar *wp = output_chars; *wp; wp++) |
| 6122 | + output_size++; |
| 6123 | + |
| 6124 | + result = (size == output_size) && |
| 6125 | + (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0); |
| 6126 | + |
| 6127 | + PG_RETURN_BOOL(result); |
| 6128 | +} |
0 commit comments