1
- #!/usr/bin/python2
1
+ #!/usr/bin/python
2
2
# -*- coding: utf-8 -*-
3
3
#
4
4
# This script builds unaccent.rules on standard output when given the
23
23
# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
24
24
# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
25
25
26
+ # BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
27
+ # The approach is to be Python3 compatible with Python2 "backports".
28
+ from __future__ import print_function
29
+ from __future__ import unicode_literals
30
+ import codecs
31
+ import sys
32
+
33
if sys.version_info[0] <= 2:
    # Everything in this branch runs only on Python 2; on Python 3 the
    # built-in chr() and bytes() are used unchanged.

    # Encode stdout as UTF-8, so text can simply be printed to it.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    # Expose Python 2's unichr under the Python 3 name, so the rest of the
    # script can call chr() for any code point.
    chr = unichr

    def bytes(source, encoding='ascii', errors='strict'):
        """Python 2 stand-in for the Python 3 ``bytes(text, encoding, errors)`` call.

        source -- text string to encode.
        encoding -- codec name handed to ``source.encode``.
        errors -- error-handling scheme handed to ``source.encode``.

        Returns the encoded form of ``source``.
        """
        return source.encode(encoding=encoding, errors=errors)
43
+ # END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
26
44
27
45
import re
28
46
import argparse
39
57
(0x0391 , 0x03a9 )) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
40
58
41
59
def print_record(codepoint, letter):
    """Print one output record for a character and its replacement text.

    codepoint -- integer code point of the source character.
    letter -- replacement text written after the separator.

    The record is written to standard output as text; no explicit
    ``.encode()`` is applied here, so the function behaves the same way
    whether ``chr`` is the Python 3 built-in or an alias for ``unichr``.
    """
    print(chr(codepoint) + "\t " + letter)
43
61
44
62
class Codepoint :
45
63
def __init__ (self , id , general_category , combining_ids ):
@@ -116,7 +134,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
116
134
charactersSet = set ()
117
135
118
136
# RegEx to parse rules
119
- rulePattern = re .compile (ur '^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;' )
137
+ rulePattern = re .compile (r '^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;' )
120
138
121
139
# construct tree from XML
122
140
transliterationTree = ET .parse (latinAsciiFilePath )
@@ -134,7 +152,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
134
152
# Group 3: "trg" char between quotes. None if group 4 matched.
135
153
# Group 4: plain "trg" char. None if group 3 matched.
136
154
if matches is not None :
137
- src = matches .group (1 ) if matches .group (1 ) is not None else matches .group (2 ).decode ('unicode-escape' )
155
+ src = matches .group (1 ) if matches .group (1 ) is not None else bytes ( matches .group (2 ), 'UTF-8' ).decode ('unicode-escape' )
138
156
trg = matches .group (3 ) if matches .group (3 ) is not None else matches .group (4 )
139
157
140
158
# "'" and """ are escaped
@@ -195,10 +213,10 @@ def main(args):
195
213
len (codepoint .combining_ids ) > 1 :
196
214
if is_letter_with_marks (codepoint , table ):
197
215
charactersSet .add ((codepoint .id ,
198
- unichr (get_plain_letter (codepoint , table ).id )))
216
+ chr (get_plain_letter (codepoint , table ).id )))
199
217
elif args .noLigaturesExpansion is False and is_ligature (codepoint , table ):
200
218
charactersSet .add ((codepoint .id ,
201
- "" .join (unichr (combining_codepoint .id )
219
+ "" .join (chr (combining_codepoint .id )
202
220
for combining_codepoint \
203
221
in get_plain_letters (codepoint , table ))))
204
222
0 commit comments