Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 3d59da9

Browse files
committed
unaccent: Make generate_unaccent_rules.py Python 3 compatible
Python 2 is still supported.

Author: Hugh Ranalli <hugh@whtc.ca>
Discussion: https://www.postgresql.org/message-id/CAAhbUMNyZ+PhNr_mQ=G161K0-hvbq13Tz2is9M3WK+yX9cQOCw@mail.gmail.com
1 parent d33faa2 commit 3d59da9

File tree

1 file changed

+24
-6
lines changed

1 file changed

+24
-6
lines changed

contrib/unaccent/generate_unaccent_rules.py

+24-6
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/python2
1+
#!/usr/bin/python
22
# -*- coding: utf-8 -*-
33
#
44
# This script builds unaccent.rules on standard output when given the
@@ -23,6 +23,24 @@
2323
# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
2424
# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
2525

26+
# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
27+
# The approach is to be Python3 compatible with Python2 "backports".
28+
from __future__ import print_function
29+
from __future__ import unicode_literals
30+
import codecs
31+
import sys
32+
33+
if sys.version_info[0] <= 2:
34+
# Encode stdout as UTF-8, so we can just print to it
35+
sys.stdout = codecs.getwriter('utf8')(sys.stdout)
36+
37+
# Map Python 2's chr to unichr
38+
chr = unichr
39+
40+
# Python 2 and 3 compatible bytes call
41+
def bytes(source, encoding='ascii', errors='strict'):
42+
return source.encode(encoding=encoding, errors=errors)
43+
# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
2644

2745
import re
2846
import argparse
@@ -39,7 +57,7 @@
3957
(0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
4058

4159
def print_record(codepoint, letter):
42-
print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
60+
print (chr(codepoint) + "\t" + letter)
4361

4462
class Codepoint:
4563
def __init__(self, id, general_category, combining_ids):
@@ -116,7 +134,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
116134
charactersSet = set()
117135

118136
# RegEx to parse rules
119-
rulePattern = re.compile(ur'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
137+
rulePattern = re.compile(r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
120138

121139
# construct tree from XML
122140
transliterationTree = ET.parse(latinAsciiFilePath)
@@ -134,7 +152,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
134152
# Group 3: plain "trg" char. Empty if group 4 is not.
135153
# Group 4: plain "trg" char between quotes. Empty if group 3 is not.
136154
if matches is not None:
137-
src = matches.group(1) if matches.group(1) is not None else matches.group(2).decode('unicode-escape')
155+
src = matches.group(1) if matches.group(1) is not None else bytes(matches.group(2), 'UTF-8').decode('unicode-escape')
138156
trg = matches.group(3) if matches.group(3) is not None else matches.group(4)
139157

140158
# "'" and """ are escaped
@@ -195,10 +213,10 @@ def main(args):
195213
len(codepoint.combining_ids) > 1:
196214
if is_letter_with_marks(codepoint, table):
197215
charactersSet.add((codepoint.id,
198-
unichr(get_plain_letter(codepoint, table).id)))
216+
chr(get_plain_letter(codepoint, table).id)))
199217
elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
200218
charactersSet.add((codepoint.id,
201-
"".join(unichr(combining_codepoint.id)
219+
"".join(chr(combining_codepoint.id)
202220
for combining_codepoint \
203221
in get_plain_letters(codepoint, table))))
204222

0 commit comments

Comments
 (0)