Be forgiving of variant spellings of locale names in pg_upgrade.

tglsfdc · tglsfdc · commit 58274728fb8e · 2014-01-30T19:07:06.000-05:00
Even though the server tries to canonicalize stored locale names, the
platform often doesn't cooperate, so it's entirely possible that one DB
thinks its locale is, say, "en_US.UTF-8" while the other has "en_US.utf8".
Rather than failing, we should try to allow this where it's clearly OK.

There is already pretty robust encoding lookup in encnames.c, so make
use of that to compare the encoding parts of the names.  The locale
identifier parts are just compared case-insensitively, which we were
already doing.  The major problem known to exist in the field is variant
encoding-name spellings, so hopefully this will be Good Enough.  If not,
we can try being even laxer.

Pavel Raiskup, reviewed by Rushabh Lathia
diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c
@@ -9,13 +9,16 @@
 
 #include "postgres_fe.h"
 
+#include "mb/pg_wchar.h"
 #include "pg_upgrade.h"
 
 
 static void set_locale_and_encoding(ClusterInfo *cluster);
 static void check_new_cluster_is_empty(void);
 static void check_locale_and_encoding(ControlData *oldctrl,
 						  ControlData *newctrl);
+static bool equivalent_locale(const char *loca, const char *locb);
+static bool equivalent_encoding(const char *chara, const char *charb);
 static void check_is_super_user(ClusterInfo *cluster);
 static void check_for_prepared_transactions(ClusterInfo *cluster);
 static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster);
@@ -397,27 +400,80 @@ set_locale_and_encoding(ClusterInfo *cluster)
 /*
  * check_locale_and_encoding()
  *
- *	locale is not in pg_controldata in 8.4 and later so
- *	we probably had to get via a database query.
+ * Check that old and new locale and encoding match.  Even though the backend
+ * tries to canonicalize stored locale names, the platform often doesn't
+ * cooperate, so it's entirely possible that one DB thinks its locale is
+ * "en_US.UTF-8" while the other says "en_US.utf8".  Try to be forgiving.
  */
 static void
 check_locale_and_encoding(ControlData *oldctrl,
 						  ControlData *newctrl)
 {
-	/*
-	 * These are often defined with inconsistent case, so use pg_strcasecmp().
-	 * They also often use inconsistent hyphenation, which we cannot fix, e.g.
-	 * UTF-8 vs. UTF8, so at least we display the mismatching values.
-	 */
-	if (pg_strcasecmp(oldctrl->lc_collate, newctrl->lc_collate) != 0)
+	if (!equivalent_locale(oldctrl->lc_collate, newctrl->lc_collate))
 		pg_fatal("lc_collate cluster values do not match:  old \"%s\", new \"%s\"\n",
-			   oldctrl->lc_collate, newctrl->lc_collate);
-	if (pg_strcasecmp(oldctrl->lc_ctype, newctrl->lc_ctype) != 0)
+				 oldctrl->lc_collate, newctrl->lc_collate);
+	if (!equivalent_locale(oldctrl->lc_ctype, newctrl->lc_ctype))
 		pg_fatal("lc_ctype cluster values do not match:  old \"%s\", new \"%s\"\n",
-			   oldctrl->lc_ctype, newctrl->lc_ctype);
-	if (pg_strcasecmp(oldctrl->encoding, newctrl->encoding) != 0)
+				 oldctrl->lc_ctype, newctrl->lc_ctype);
+	if (!equivalent_encoding(oldctrl->encoding, newctrl->encoding))
 		pg_fatal("encoding cluster values do not match:  old \"%s\", new \"%s\"\n",
-			   oldctrl->encoding, newctrl->encoding);
+				 oldctrl->encoding, newctrl->encoding);
+}
+
+/*
+ * equivalent_locale()
+ *
+ * Best effort locale-name comparison.  Return false if we are not 100% sure
+ * the locales are equivalent.
+ */
+static bool
+equivalent_locale(const char *loca, const char *locb)
+{
+	const char *chara = strrchr(loca, '.');
+	const char *charb = strrchr(locb, '.');
+	int			lencmp;
+
+	/* If they don't both contain an encoding part, just do strcasecmp(). */
+	if (!chara || !charb)
+		return (pg_strcasecmp(loca, locb) == 0);
+
+	/* Compare the encoding parts. */
+	if (!equivalent_encoding(chara + 1, charb + 1))
+		return false;
+
+	/*
+	 * OK, compare the locale identifiers (e.g. en_US part of en_US.utf8).
+	 *
+	 * It's tempting to ignore non-alphanumeric chars here, but for now it's
+	 * not clear that that's necessary; just do case-insensitive comparison.
+	 */
+	lencmp = chara - loca;
+	if (lencmp != charb - locb)
+		return false;
+
+	return (pg_strncasecmp(loca, locb, lencmp) == 0);
+}
+
+/*
+ * equivalent_encoding()
+ *
+ * Best effort encoding-name comparison.  Return true only if the encodings
+ * are valid server-side encodings and known equivalent.
+ *
+ * Because the lookup in pg_valid_server_encoding() does case folding and
+ * ignores non-alphanumeric characters, this will recognize many popular
+ * variant spellings as equivalent, eg "utf8" and "UTF-8" will match.
+ */
+static bool
+equivalent_encoding(const char *chara, const char *charb)
+{
+	int			enca = pg_valid_server_encoding(chara);
+	int			encb = pg_valid_server_encoding(charb);
+
+	if (enca < 0 || encb < 0)
+		return false;
+
+	return (enca == encb);
 }