Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
summary refs log tree commit diff
diff options
context:
space:
mode:
author Jeff Davis 2023-04-04 17:28:08 +0000
committer Jeff Davis 2023-04-04 17:38:58 +0000
commit ea1db8ae70e5f4ceaae34dc9c06a07d59aaa022e (patch)
tree 2900cfeae37b4c63d24221185bb05708885ef1c7 /src/backend
parent d3d53f955cf6ad755ba3682577e0f6fa10106438 (diff)
Canonicalize ICU locale names to language tags.
Convert to BCP47 language tags before storing in the catalog, except during binary upgrade or when the locale comes from an existing collation or template database. The resulting language tags can vary slightly between ICU versions. For instance, "@colBackwards=yes" is converted to "und-u-kb-true" in older versions of ICU, and to the simpler (but equivalent) "und-u-kb" in newer versions. The process of canonicalizing to a language tag also understands more input locale string formats than ucol_open(). For instance, "fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is ignored; effectively treating it the same as the locale "fr" and opening the wrong collator. Canonicalization properly interprets the language and region, resulting in the language tag "fr-CA", which can then be understood by ucol_open(). This commit fixes a problem in prior versions due to ucol_open() misinterpreting locale strings as described above. For instance, creating an ICU collation with locale "fr_CA.UTF-8" would store that string directly in the catalog, which would later be passed to (and misinterpreted by) ucol_open(). After this commit, the locale string will be canonicalized to language tag "fr-CA" in the catalog, which will be properly understood by ucol_open(). Because this fix affects the resulting collator, we cannot change the locale string stored in the catalog for existing databases or collations; otherwise we'd risk corrupting indexes. Therefore, only canonicalize locales for newly-created (not upgraded) collations/databases. For similar reasons, do not backport. Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com Reviewed-by: Peter Eisentraut
Diffstat (limited to 'src/backend')
-rw-r--r-- src/backend/commands/collationcmds.c 46
-rw-r--r-- src/backend/commands/dbcommands.c 20
-rw-r--r-- src/backend/utils/adt/pg_locale.c 85
3 files changed, 130 insertions, 21 deletions
diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
index 45de78352c7..c91fe66d9b2 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -165,6 +165,11 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
else
colliculocale = NULL;
+ /*
+ * When the ICU locale comes from an existing collation, do not
+ * canonicalize to a language tag.
+ */
+
datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
if (!isnull)
collicurules = TextDatumGetCString(datum);
@@ -259,6 +264,25 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
errmsg("parameter \"locale\" must be specified")));
+ /*
+ * During binary upgrade, preserve the locale string. Otherwise,
+ * canonicalize to a language tag.
+ */
+ if (!IsBinaryUpgrade)
+ {
+ char *langtag = icu_language_tag(colliculocale,
+ icu_validation_level);
+
+ if (langtag && strcmp(colliculocale, langtag) != 0)
+ {
+ ereport(NOTICE,
+ (errmsg("using standard form \"%s\" for locale \"%s\"",
+ langtag, colliculocale)));
+
+ colliculocale = langtag;
+ }
+ }
+
icu_validate_locale(colliculocale);
}
@@ -570,26 +594,6 @@ cmpaliases(const void *a, const void *b)
#ifdef USE_ICU
/*
- * Get the ICU language tag for a locale name.
- * The result is a palloc'd string.
- */
-static char *
-get_icu_language_tag(const char *localename)
-{
- char buf[ULOC_FULLNAME_CAPACITY];
- UErrorCode status;
-
- status = U_ZERO_ERROR;
- uloc_toLanguageTag(localename, buf, sizeof(buf), true, &status);
- if (U_FAILURE(status))
- ereport(ERROR,
- (errmsg("could not convert locale name \"%s\" to language tag: %s",
- localename, u_errorName(status))));
-
- return pstrdup(buf);
-}
-
-/*
* Get a comment (specifically, the display name) for an ICU locale.
* The result is a palloc'd string, or NULL if we can't get a comment
* or find that it's not all ASCII. (We can *not* accept non-ASCII
@@ -950,7 +954,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
else
name = uloc_getAvailable(i);
- langtag = get_icu_language_tag(name);
+ langtag = icu_language_tag(name, ERROR);
/*
* Be paranoid about not allowing any non-ASCII strings into
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 24bcc5adfe8..2e242eeff24 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1058,6 +1058,26 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("ICU locale must be specified")));
+ /*
+ * During binary upgrade, or when the locale came from the template
+ * database, preserve locale string. Otherwise, canonicalize to a
+ * language tag.
+ */
+ if (!IsBinaryUpgrade && dbiculocale != src_iculocale)
+ {
+ char *langtag = icu_language_tag(dbiculocale,
+ icu_validation_level);
+
+ if (langtag && strcmp(dbiculocale, langtag) != 0)
+ {
+ ereport(NOTICE,
+ (errmsg("using standard form \"%s\" for locale \"%s\"",
+ langtag, dbiculocale)));
+
+ dbiculocale = langtag;
+ }
+ }
+
icu_validate_locale(dbiculocale);
}
else
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 9497c20d123..06e73aa012f 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -2827,6 +2827,91 @@ icu_set_collation_attributes(UCollator *collator, const char *loc,
#endif
/*
+ * Return the BCP47 language tag representation of the requested locale.
+ *
+ * This function should be called before passing the string to ucol_open(),
+ * because conversion to a language tag also performs "level 2
+ * canonicalization". In addition to producing a consistent format, level 2
+ * canonicalization is able to more accurately interpret different input
+ * locale string formats, such as POSIX and .NET IDs.
+ */
+char *
+icu_language_tag(const char *loc_str, int elevel)
+{
+#ifdef USE_ICU
+ UErrorCode status;
+ char lang[ULOC_LANG_CAPACITY];
+ char *langtag;
+ size_t buflen = 32; /* arbitrary starting buffer size */
+ const bool strict = true;
+
+ status = U_ZERO_ERROR;
+ uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
+ if (U_FAILURE(status))
+ {
+ if (elevel > 0)
+ ereport(elevel,
+ (errmsg("could not get language from locale \"%s\": %s",
+ loc_str, u_errorName(status))));
+ return NULL;
+ }
+
+ /* C/POSIX locales aren't handled by uloc_toLanguageTag() */
+ if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
+ return pstrdup("en-US-u-va-posix");
+
+ /*
+ * A BCP47 language tag doesn't have a clearly-defined upper limit
+ * (cf. RFC5646 section 4.4). Additionally, in older ICU versions,
+ * uloc_toLanguageTag() doesn't always return the ultimate length on the
+ * first call, necessitating a loop.
+ */
+ langtag = palloc(buflen);
+ while (true)
+ {
+ int32_t len;
+
+ status = U_ZERO_ERROR;
+ len = uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
+
+ /*
+ * If the result fits in the buffer exactly (len == buflen),
+ * uloc_toLanguageTag() will return success without nul-terminating
+ * the result. Check for either U_BUFFER_OVERFLOW_ERROR or len >=
+ * buflen and try again.
+ */
+ if ((status == U_BUFFER_OVERFLOW_ERROR ||
+ (U_SUCCESS(status) && len >= buflen)) &&
+ buflen < MaxAllocSize)
+ {
+ buflen = Min(buflen * 2, MaxAllocSize);
+ langtag = repalloc(langtag, buflen);
+ continue;
+ }
+
+ break;
+ }
+
+ if (U_FAILURE(status))
+ {
+ pfree(langtag);
+
+ if (elevel > 0)
+ ereport(elevel,
+ (errmsg("could not convert locale name \"%s\" to language tag: %s",
+ loc_str, u_errorName(status))));
+ return NULL;
+ }
+
+ return langtag;
+#else /* not USE_ICU */
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("ICU is not supported in this build")));
+#endif /* not USE_ICU */
+}
+
+/*
* Perform best-effort check that the locale is a valid one.
*/
void