diff options
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/utils/adt/formatting.c | 69 | ||||
-rw-r--r-- | src/backend/utils/adt/oracle_compat.c | 16 | ||||
-rw-r--r-- | src/backend/utils/adt/pg_locale.c | 24 | ||||
-rw-r--r-- | src/backend/utils/adt/pg_locale_builtin.c | 10 | ||||
-rw-r--r-- | src/backend/utils/adt/pg_locale_icu.c | 58 |
5 files changed, 177 insertions, 0 deletions
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 7c4c4aa07d5..2720d3902ab 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1820,6 +1820,75 @@ str_initcap(const char *buff, size_t nbytes, Oid collid) } /* + * collation-aware, wide-character-aware case folding + * + * We pass the number of bytes so we can pass varlena and char* + * to this function. The result is a palloc'd, null-terminated string. + */ +char * +str_casefold(const char *buff, size_t nbytes, Oid collid) +{ + char *result; + pg_locale_t mylocale; + + if (!buff) + return NULL; + + if (!OidIsValid(collid)) + { + /* + * This typically means that the parser could not resolve a conflict + * of implicit collations, so report it that way. + */ + ereport(ERROR, + (errcode(ERRCODE_INDETERMINATE_COLLATION), + errmsg("could not determine which collation to use for %s function", + "lower()"), + errhint("Use the COLLATE clause to set the collation explicitly."))); + } + + if (GetDatabaseEncoding() != PG_UTF8) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Unicode case folding can only be performed if server encoding is UTF8"))); + + mylocale = pg_newlocale_from_collation(collid); + + /* C/POSIX collations use this path regardless of database encoding */ + if (mylocale->ctype_is_c) + { + result = asc_tolower(buff, nbytes); + } + else + { + const char *src = buff; + size_t srclen = nbytes; + size_t dstsize; + char *dst; + size_t needed; + + /* first try buffer of equal size plus terminating NUL */ + dstsize = srclen + 1; + dst = palloc(dstsize); + + needed = pg_strfold(dst, dstsize, src, srclen, mylocale); + if (needed + 1 > dstsize) + { + /* grow buffer if needed and retry */ + dstsize = needed + 1; + dst = repalloc(dst, dstsize); + needed = pg_strfold(dst, dstsize, src, srclen, mylocale); + Assert(needed + 1 <= dstsize); + } + + Assert(dst[needed] == '\0'); + result = dst; + } + + return result; +} + +/* * ASCII-only lower function * * We pass the number of bytes so we can pass varlena and char* diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c index 2cba7cd1621..a24a2d208fb 100644 --- a/src/backend/utils/adt/oracle_compat.c +++ b/src/backend/utils/adt/oracle_compat.c @@ -126,6 +126,22 @@ initcap(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(result); } +Datum +casefold(PG_FUNCTION_ARGS) +{ + text *in_string = PG_GETARG_TEXT_PP(0); + char *out_string; + text *result; + + out_string = str_casefold(VARDATA_ANY(in_string), + VARSIZE_ANY_EXHDR(in_string), + PG_GET_COLLATION()); + result = cstring_to_text(out_string); + pfree(out_string); + + PG_RETURN_TEXT_P(result); +} + /******************************************************************** * diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 94444acd2c5..7d92f580a57 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -106,6 +106,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); extern size_t strlower_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); @@ -113,6 +115,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strfold_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); extern size_t strlower_libc(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); @@ -1447,6 +1451,26 @@ pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, return 0; /* keep compiler quiet */ } +size_t +pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + if (locale->provider == COLLPROVIDER_BUILTIN) + return strfold_builtin(dst, dstsize, src, srclen, locale); +#ifdef USE_ICU + else if (locale->provider == COLLPROVIDER_ICU) + return strfold_icu(dst, dstsize, src, srclen, locale); +#endif + /* for libc, just use strlower */ + else if (locale->provider == COLLPROVIDER_LIBC) + return strlower_libc(dst, dstsize, src, srclen, locale); + else + /* shouldn't happen */ + PGLOCALE_SUPPORT_ERROR(locale->provider); + + return 0; /* keep compiler quiet */ +} + /* * pg_strcoll * diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 436e32c0ca0..33ad20bbf07 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -31,6 +31,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); struct WordBoundaryState @@ -107,6 +109,14 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, locale->info.builtin.casemap_full); } +size_t +strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + return unicode_strfold(dest, destsize, src, srclen, + locale->info.builtin.casemap_full); +} + pg_locale_t create_pg_locale_builtin(Oid collid, MemoryContext context) { diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index 5185b0f7289..b0c73f2e43d 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -54,6 +54,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); extern size_t strupper_icu(char *dst, size_t dstsize, const char *src, ssize_t srclen, pg_locale_t locale); +extern size_t strfold_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); #ifdef USE_ICU @@ -117,6 +119,10 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, UErrorCode *pErrorCode); +static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const char *locale, + UErrorCode *pErrorCode); static const struct collate_methods collate_methods_icu = { .strncoll = strncoll_icu, @@ -439,6 +445,26 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_len; } +size_t +strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + int32_t len_uchar; + int32_t len_conv; + UChar *buff_uchar; + UChar *buff_conv; + size_t result_len; + + len_uchar = icu_to_uchar(&buff_uchar, src, srclen); + len_conv = icu_convert_case(u_strFoldCase_default, locale, + &buff_conv, buff_uchar, len_uchar); + result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv); + pfree(buff_uchar); + pfree(buff_conv); + + return result_len; +} + /* * strncoll_icu_utf8 * @@ -673,6 +699,38 @@ u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, NULL, locale, pErrorCode); } +static int32_t +u_strFoldCase_default(UChar *dest, int32_t destCapacity, + const UChar *src, int32_t srcLength, + const char *locale, + UErrorCode *pErrorCode) +{ + uint32 options = U_FOLD_CASE_DEFAULT; + char lang[3]; + UErrorCode status; + + /* + * Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case + * folding does not accept a locale. Instead it just supports a single + * option relevant to Turkic languages 'az' and 'tr'; check for those + * languages to enable the option. + */ + status = U_ZERO_ERROR; + uloc_getLanguage(locale, lang, 3, &status); + if (U_SUCCESS(status)) + { + /* + * The option name is confusing, but it causes u_strFoldCase to use + * the 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT. + */ + if (strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0) + options = U_FOLD_CASE_EXCLUDE_SPECIAL_I; + } + + return u_strFoldCase(dest, destCapacity, src, srcLength, + options, pErrorCode); +} + /* * strncoll_icu * |