
Commit bdb839c

Update unicode.org URLs

Use https, consistent host name, remove references to ftp. Also update the URLs for CLDR, which has moved from Trac to GitHub.

1 parent 9abb2bf commit bdb839c

File tree

9 files changed: +31 −31 lines changed


contrib/unaccent/generate_unaccent_rules.py

+8-8
@@ -24,9 +24,9 @@
 # Latin-ASCII.xml, the latest data sets released can be browsed directly
 # via [3]. Note that this script is compatible with at least release 29.
 #
-# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
-# [2] http://unicode.org/cldr/trac/export/14746/tags/release-34/common/transforms/Latin-ASCII.xml
-# [3] https://unicode.org/cldr/trac/browser/tags
+# [1] https://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt
+# [2] https://raw.githubusercontent.com/unicode-org/cldr/release-34/common/transforms/Latin-ASCII.xml
+# [3] https://github.com/unicode-org/cldr/tags

 # BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
 # The approach is to be Python3 compatible with Python2 "backports".
@@ -113,7 +113,7 @@ def is_mark(codepoint):

 def is_letter_with_marks(codepoint, table):
     """Returns true for letters combined with one or more marks."""
-    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
+    # See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values

     # Letter may have no combining characters, in which case it has
     # no marks.
@@ -226,7 +226,7 @@ def special_cases():
     return charactersSet

 def main(args):
-    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
+    # https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
     decomposition_type_pattern = re.compile(" *<[^>]*> *")

     table = {}
@@ -243,7 +243,7 @@ def main(args):
     for line in unicodeDataFile:
         fields = line.split(";")
         if len(fields) > 5:
-            # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
+            # https://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
             general_category = fields[2]
             decomposition = fields[5]
             decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
@@ -281,8 +281,8 @@ def main(args):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
-    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.", type=str, required=True, dest='unicodeDataFilePath')
-    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml>.", type=str, dest='latinAsciiFilePath')
+    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt.", type=str, required=True, dest='unicodeDataFilePath')
+    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml).", type=str, dest='latinAsciiFilePath')
     parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
     args = parser.parse_args()
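The comment URLs fixed above sit around the script's UnicodeData.txt parsing. As a hedged, standalone sketch of that parsing step (field positions per UAX #44: field 2 is the General_Category, field 5 the Decomposition_Mapping with an optional `<type>` tag the script strips; `parse_line` here is a hypothetical helper, not part of the script):

```python
import re

# Strip an optional decomposition-type tag such as "<compat>" (UAX #44).
decomposition_type_pattern = re.compile(" *<[^>]*> *")

def parse_line(line):
    """Parse one semicolon-separated UnicodeData.txt record."""
    fields = line.split(";")
    if len(fields) <= 5:
        return None
    codepoint = int(fields[0], 16)
    general_category = fields[2]   # e.g. "Lu" for an uppercase letter
    decomposition = re.sub(decomposition_type_pattern, " ", fields[5]).strip()
    return codepoint, general_category, decomposition

# U+00C0 LATIN CAPITAL LETTER A WITH GRAVE: category Lu, decomposes to 0041 0300.
print(parse_line("00C0;LATIN CAPITAL LETTER A WITH GRAVE;Lu;0;L;0041 0300;;;;N;;;;00E0;"))
```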

doc/src/sgml/acronyms.sgml

+1-1
@@ -728,7 +728,7 @@
     <term><acronym>UTF</acronym></term>
     <listitem>
      <para>
-      <ulink url="http://www.unicode.org/">Unicode Transformation
+      <ulink url="https://www.unicode.org/">Unicode Transformation
       Format</ulink>
      </para>
     </listitem>

doc/src/sgml/charset.sgml

+4-4
@@ -832,12 +832,12 @@ CREATE COLLATION german (provider = libc, locale = 'de_DE');
    </varlistentry>
   </variablelist>

-   See <ulink url="http://unicode.org/reports/tr35/tr35-collation.html">Unicode
+   See <ulink url="https://www.unicode.org/reports/tr35/tr35-collation.html">Unicode
   Technical Standard #35</ulink>
   and <ulink url="https://tools.ietf.org/html/bcp47">BCP 47</ulink> for
   details. The list of possible collation types (<literal>co</literal>
   subtag) can be found in
-   the <ulink url="http://www.unicode.org/repos/cldr/trunk/common/bcp47/collation.xml">CLDR
+   the <ulink url="https://github.com/unicode-org/cldr/blob/master/common/bcp47/collation.xml">CLDR
   repository</ulink>.
   The <ulink url="https://ssl.icu-project.org/icu-bin/locexp">ICU Locale
   Explorer</ulink> can be used to check the details of a particular locale
@@ -900,7 +900,7 @@ CREATE COLLATION french FROM "fr-x-icu";
   different Unicode normal forms. It is up to the collation provider to
   actually implement such insensitive comparisons; the deterministic flag
   only determines whether ties are to be broken using bytewise comparison.
-   See also <ulink url="https://unicode.org/reports/tr10">Unicode Technical
+   See also <ulink url="https://www.unicode.org/reports/tr10">Unicode Technical
   Standard 10</ulink> for more information on the terminology.
  </para>

@@ -1926,7 +1926,7 @@ RESET client_encoding;
 </varlistentry>

 <varlistentry>
-  <term><ulink url="http://www.unicode.org/"></ulink></term>
+  <term><ulink url="https://www.unicode.org/"></ulink></term>

  <listitem>
   <para>

src/backend/utils/mb/Unicode/Makefile

+7-7
@@ -119,7 +119,7 @@ DOWNLOAD = wget -O $@ --no-use-server-timestamps
 #DOWNLOAD = curl -o $@

 BIG5.TXT CNS11643.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/$(@F)

 euc-jis-2004-std.txt sjis-0213-2004-std.txt:
 	$(DOWNLOAD) http://x0213.org/codetable/$(@F)
@@ -131,19 +131,19 @@ GB2312.TXT:
 	$(DOWNLOAD) 'http://trac.greenstone.org/browser/trunk/gsdl/unicode/MAPPINGS/EASTASIA/GB/GB2312.TXT?rev=1842&format=txt'

 JIS0212.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/$(@F)

 JOHAB.TXT KSX1001.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/$(@F)

 KOI8-R.TXT KOI8-U.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MISC/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/$(@F)

 $(ISO8859TEXTS):
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/ISO8859/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/ISO8859/$(@F)

 $(filter-out CP8%,$(WINTEXTS)) CP932.TXT CP950.TXT:
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/$(@F)

 $(filter CP8%,$(WINTEXTS)):
-	$(DOWNLOAD) http://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/$(@F)
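Every rule in this Makefile follows the same pattern: the target's base name (`$(@F)`) appended to a mapping directory under the new canonical https host. A hedged Python sketch of that mapping (the `RULES` table and `download_url` helper are illustrative only, not part of the Makefile):

```python
# Base of the Unicode mapping-file tree after this commit's URL change.
BASE = "https://www.unicode.org/Public/MAPPINGS"

# A few target -> directory pairs taken from the Makefile rules above.
RULES = {
    "BIG5.TXT": "OBSOLETE/EASTASIA/OTHER",
    "JIS0212.TXT": "OBSOLETE/EASTASIA/JIS",
    "KOI8-R.TXT": "VENDORS/MISC",
    "8859-1.TXT": "ISO8859",
    "CP932.TXT": "VENDORS/MICSFT/WINDOWS",
}

def download_url(target):
    """Build the URL a rule would pass to $(DOWNLOAD) for a given target."""
    return f"{BASE}/{RULES[target]}/{target}"

print(download_url("KOI8-R.TXT"))
# → https://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/KOI8-R.TXT
```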

src/backend/utils/mb/Unicode/UCS_to_BIG5.pl

+2-2
@@ -8,8 +8,8 @@
 # map files provided by Unicode organization.
 # Unfortunately it is prohibited by the organization
 # to distribute the map files. So if you try to use this script,
-# you have to obtain the map files from the organization's ftp site.
-# ftp://www.unicode.org/Public/MAPPINGS/
+# you have to obtain the map files from the organization's download site.
+# https://www.unicode.org/Public/MAPPINGS/
 #
 # Our "big5" comes from BIG5.TXT, with the addition of the characters
 # in the range 0xf9d6-0xf9dc from CP950.TXT.

src/backend/utils/mb/Unicode/UCS_to_JOHAB.pl

+2-2
@@ -8,8 +8,8 @@
 # map files provided by Unicode organization.
 # Unfortunately it is prohibited by the organization
 # to distribute the map files. So if you try to use this script,
-# you have to obtain the map files from the organization's ftp site.
-# ftp://www.unicode.org/Public/MAPPINGS/
+# you have to obtain the map files from the organization's download site.
+# https://www.unicode.org/Public/MAPPINGS/
 # We assume the file include three tab-separated columns:
 #   JOHAB code in hex
 #   UCS-2 code in hex

src/backend/utils/mb/Unicode/UCS_to_most.pl

+2-2
@@ -8,8 +8,8 @@
 # map files provided by Unicode organization.
 # Unfortunately it is prohibited by the organization
 # to distribute the map files. So if you try to use this script,
-# you have to obtain the map files from the organization's ftp site.
-# ftp://www.unicode.org/Public/MAPPINGS/
+# you have to obtain the map files from the organization's download site.
+# https://www.unicode.org/Public/MAPPINGS/
 # We assume the file include three tab-separated columns:
 #   source character set code in hex
 #   UCS-2 code in hex

src/common/unicode/Makefile

+1-1
@@ -23,7 +23,7 @@ DOWNLOAD = wget -O $@ --no-use-server-timestamps
 # These files are part of the Unicode Character Database. Download
 # them on demand.
 UnicodeData.txt CompositionExclusions.txt NormalizationTest.txt:
-	$(DOWNLOAD) http://unicode.org/Public/UNIDATA/$(@F)
+	$(DOWNLOAD) https://www.unicode.org/Public/UNIDATA/$(@F)

 # Generation of conversion tables used for string normalization with
 # UTF-8 strings.

src/common/unicode_norm.c

+4-4
@@ -3,7 +3,7 @@
  * Normalize a Unicode string to NFKC form
  *
  * This implements Unicode normalization, per the documentation at
- * http://www.unicode.org/reports/tr15/.
+ * https://www.unicode.org/reports/tr15/.
  *
  * Portions Copyright (c) 2017-2019, PostgreSQL Global Development Group
  *
@@ -109,7 +109,7 @@ get_decomposed_size(pg_wchar code)
 	/*
 	 * Fast path for Hangul characters not stored in tables to save memory as
 	 * decomposition is algorithmic. See
-	 * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
+	 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
 	 * the matter.
 	 */
 	if (code >= SBASE && code < SBASE + SCOUNT)
@@ -234,7 +234,7 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
 	/*
 	 * Fast path for Hangul characters not stored in tables to save memory as
 	 * decomposition is algorithmic. See
-	 * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
+	 * https://www.unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
 	 * the matter.
 	 */
 	if (code >= SBASE && code < SBASE + SCOUNT)
@@ -362,7 +362,7 @@ unicode_normalize_kc(const pg_wchar *input)
 			continue;

 		/*
-		 * Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
+		 * Per Unicode (https://www.unicode.org/reports/tr15/tr15-18.html) annex 4,
 		 * a sequence of two adjacent characters in a string is an
 		 * exchangeable pair if the combining class (from the Unicode
 		 * Character Database) for the first character is greater than the
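The Hangul fast path these comments point at needs no lookup tables because TR15 (annex 10) defines the decomposition arithmetically. A hedged Python sketch of that arithmetic (the constants come from the standard; the C code's actual structure may differ):

```python
# Hangul decomposition constants, per Unicode TR15 annex 10.
SBASE, LBASE, VBASE, TBASE = 0xAC00, 0x1100, 0x1161, 0x11A7
LCOUNT, VCOUNT, TCOUNT = 19, 21, 28
NCOUNT = VCOUNT * TCOUNT   # 588 syllables per leading consonant
SCOUNT = LCOUNT * NCOUNT   # 11172 precomposed syllables in total

def decompose_hangul(code):
    """Decompose a precomposed Hangul syllable into its jamo code points."""
    if not (SBASE <= code < SBASE + SCOUNT):
        raise ValueError("not a precomposed Hangul syllable")
    sindex = code - SBASE
    lead = LBASE + sindex // NCOUNT
    vowel = VBASE + (sindex % NCOUNT) // TCOUNT
    trail = TBASE + sindex % TCOUNT
    # trail == TBASE means the syllable has no trailing consonant.
    return [lead, vowel, trail] if trail != TBASE else [lead, vowel]

print([hex(c) for c in decompose_hangul(0xAC01)])
# → ['0x1100', '0x1161', '0x11a8']
```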

0 commit comments
