1
- #!/usr/bin/python
1
+ #!/usr/bin/python2
2
+ # -*- coding: utf-8 -*-
2
3
#
3
4
# This script builds unaccent.rules on standard output when given the
4
- # contents of UnicodeData.txt[1] on standard input. Optionally includes
5
- # ligature expansion, if --expand-ligatures is given on the command line.
5
+ # contents of UnicodeData.txt [1] and Latin-ASCII.xml [2] given as
6
+ # arguments. Ligature expansion and the Unicode CLDR Latin-ASCII
7
+ # transliterator are enabled by default; both can be disabled with the
8
+ # "--no-ligatures-expansion" command line option.
6
9
#
7
10
# The approach is to use the Unicode decomposition data to identify
8
11
# precomposed codepoints that are equivalent to a ligature of several
9
12
# letters, or a base letter with any number of diacritical marks.
10
- # There is also a small set of special cases for codepoints that we
11
- # traditionally support even though Unicode doesn't consider them to
12
- # be ligatures or letters with marks.
13
13
#
14
- # [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
14
+ # This approach handles most letters with diacritical marks and some
15
+ # ligatures. However, several characters (notably a majority of
16
+ # ligatures) don't have decomposition. To handle all these cases, one can
17
+ # use a standard Unicode transliterator available in Common Locale Data
18
+ # Repository (CLDR): Latin-ASCII. This transliterator associates Unicode
19
+ # characters to ASCII-range equivalent. Unless "--no-ligatures-expansion"
20
+ # option is enabled, the XML file of this transliterator [2] -- given as a
21
+ # command line argument -- will be parsed and used.
22
+ #
23
+ # [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
24
+ # [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
25
+
15
26
16
27
import re
28
+ import argparse
17
29
import sys
30
+ import xml .etree .ElementTree as ET
18
31
19
32
def print_record (codepoint , letter ):
20
33
print (unichr (codepoint ) + "\t " + letter ).encode ("UTF-8" )
@@ -63,15 +76,73 @@ def get_plain_letters(codepoint, table):
63
76
assert (is_ligature (codepoint , table ))
64
77
return [get_plain_letter (table [id ], table ) for id in codepoint .combining_ids ]
65
78
66
- def main (expand_ligatures ):
79
def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
    """Parse the CLDR Latin-ASCII transliterator XML file.

    Every <tRule> element found under "./transforms/transform" is matched
    against a "src -> trg ;" pattern (the arrow being U+2192).  Elements
    without text, and text that does not match the pattern, are ignored.

    Args:
        latinAsciiFilePath: path to the Latin-ASCII.xml file.

    Returns:
        A set of tuples (src, trg), where "src" is the code point (an
        integer) of the original character and "trg" is the substitute
        string.
    """
    # The pattern captures four groups:
    #
    # Group 1: plain "src" char. None if group 2 matched.
    # Group 2: unicode-escaped "src" char (e.g. "\u0110"). None if group 1
    #          matched.
    #
    # Group 3: "trg" written between single quotes. None if group 4 matched.
    # Group 4: plain "trg". None if group 3 matched.
    #
    # Written as a regular (non-raw) unicode literal so that the U+2192
    # arrow escape is interpreted while the backslash of "\u" stays literal.
    rulePattern = re.compile(
        u"^(?:(.)|(\\\\u[0-9a-fA-F]{4})) \u2192 (?:'(.+)'|(.+)) ;")

    # construct tree from XML
    transliterationTreeRoot = ET.parse(latinAsciiFilePath).getroot()

    charactersSet = set()

    for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"):
        # An empty element (<tRule/>) has no text at all; searching None
        # would raise TypeError.
        if rule.text is None:
            continue

        matches = rulePattern.search(rule.text)
        if matches is None:
            continue

        plainSrc, escapedSrc, quotedTrg, plainTrg = matches.groups()

        if plainSrc is not None:
            src = plainSrc
        else:
            # Turn the six ASCII characters "\uXXXX" into the character
            # they denote.
            src = escapedSrc.encode('ascii').decode('unicode-escape')

        trg = quotedTrg if quotedTrg is not None else plainTrg

        # "'" and '"' are escaped with a backslash in the rules
        trg = trg.replace("\\'", "'").replace('\\"', '"')

        # the parser of unaccent only accepts non-whitespace characters
        # for "src" and "trg" (see unaccent.c)
        if not src.isspace() and not trg.isspace():
            charactersSet.add((ord(src), trg))

    return charactersSet
115
+
116
def special_cases():
    """Return the special cases which are not handled by other methods.

    Returns:
        A set of tuples (src, trg), where "src" is the code point (an
        integer) of the original character and "trg" is the substitute
        string.  No "trg" contains whitespace, because the parser of
        unaccent only accepts non-whitespace characters.
    """
    return {
        # Cyrillic
        (0x0401, u"\u0415"),  # CYRILLIC CAPITAL LETTER IO
        (0x0451, u"\u0435"),  # CYRILLIC SMALL LETTER IO

        # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
        (0x2103, u"\xb0C"),   # DEGREE CELSIUS
        (0x2109, u"\xb0F"),   # DEGREE FAHRENHEIT
        (0x2117, "(P)"),      # SOUND RECORDING COPYRIGHT
    }
130
+
131
+ def main (args ):
67
132
# http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
68
133
decomposition_type_pattern = re .compile (" *<[^>]*> *" )
69
134
70
135
table = {}
71
136
all = []
72
137
138
+ # unordered set to ensure uniqueness
139
+ charactersSet = set ()
140
+
141
+ # read file UnicodeData.txt
142
+ unicodeDataFile = open (args .unicodeDataFilePath , 'r' )
143
+
73
144
# read everything we need into memory
74
- for line in sys . stdin . readlines () :
145
+ for line in unicodeDataFile :
75
146
fields = line .split (";" )
76
147
if len (fields ) > 5 :
77
148
# http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
@@ -89,35 +160,34 @@ def main(expand_ligatures):
89
160
if codepoint .general_category .startswith ('L' ) and \
90
161
len (codepoint .combining_ids ) > 1 :
91
162
if is_letter_with_marks (codepoint , table ):
92
- print_record (codepoint .id ,
93
- chr (get_plain_letter (codepoint , table ).id ))
94
- elif expand_ligatures and is_ligature (codepoint , table ):
95
- print_record (codepoint .id ,
163
+ charactersSet . add ( (codepoint .id ,
164
+ chr (get_plain_letter (codepoint , table ).id )))
165
+ elif args . noLigaturesExpansion is False and is_ligature (codepoint , table ):
166
+ charactersSet . add ( (codepoint .id ,
96
167
"" .join (unichr (combining_codepoint .id )
97
168
for combining_codepoint \
98
- in get_plain_letters (codepoint , table )))
99
-
100
- # some special cases
101
- print_record (0x00d8 , "O" ) # LATIN CAPITAL LETTER O WITH STROKE
102
- print_record (0x00f8 , "o" ) # LATIN SMALL LETTER O WITH STROKE
103
- print_record (0x0110 , "D" ) # LATIN CAPITAL LETTER D WITH STROKE
104
- print_record (0x0111 , "d" ) # LATIN SMALL LETTER D WITH STROKE
105
- print_record (0x0131 , "i" ) # LATIN SMALL LETTER DOTLESS I
106
- print_record (0x0126 , "H" ) # LATIN CAPITAL LETTER H WITH STROKE
107
- print_record (0x0127 , "h" ) # LATIN SMALL LETTER H WITH STROKE
108
- print_record (0x0141 , "L" ) # LATIN CAPITAL LETTER L WITH STROKE
109
- print_record (0x0142 , "l" ) # LATIN SMALL LETTER L WITH STROKE
110
- print_record (0x0149 , "'n" ) # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
111
- print_record (0x0166 , "T" ) # LATIN CAPITAL LETTER T WITH STROKE
112
- print_record (0x0167 , "t" ) # LATIN SMALL LETTER t WITH STROKE
113
- print_record (0x0401 , u"\u0415 " ) # CYRILLIC CAPITAL LETTER IO
114
- print_record (0x0451 , u"\u0435 " ) # CYRILLIC SMALL LETTER IO
115
- if expand_ligatures :
116
- print_record (0x00c6 , "AE" ) # LATIN CAPITAL LETTER AE
117
- print_record (0x00df , "ss" ) # LATIN SMALL LETTER SHARP S
118
- print_record (0x00e6 , "ae" ) # LATIN SMALL LETTER AE
119
- print_record (0x0152 , "OE" ) # LATIN CAPITAL LIGATURE OE
120
- print_record (0x0153 , "oe" ) # LATIN SMALL LIGATURE OE
169
+ in get_plain_letters (codepoint , table ))))
170
+
171
+ # add CLDR Latin-ASCII characters
172
+ if not args .noLigaturesExpansion :
173
+ charactersSet |= parse_cldr_latin_ascii_transliterator (args .latinAsciiFilePath )
174
+ charactersSet |= special_cases ()
175
+
176
+ # sort for more convenient display
177
+ charactersList = sorted (charactersSet , key = lambda characterPair : characterPair [0 ])
178
+
179
+ for characterPair in charactersList :
180
+ print_record (characterPair [0 ], characterPair [1 ])
121
181
122
182
if __name__ == "__main__":
    # Command line entry point: parse the options, check that the CLDR file
    # was supplied whenever ligature expansion is enabled, then run main().
    parser = argparse.ArgumentParser(
        description='This script builds unaccent.rules on standard output '
                    'when given the contents of UnicodeData.txt and '
                    'Latin-ASCII.xml given as arguments.')
    parser.add_argument(
        "--unicode-data-file",
        help="Path to formatted text file corresponding to UnicodeData.txt. "
             "See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.",
        type=str, required=True, dest='unicodeDataFilePath')
    parser.add_argument(
        "--latin-ascii-file",
        help="Path to XML file from Unicode Common Locale Data Repository "
             "(CLDR) corresponding to Latin-ASCII transliterator "
             "(Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/"
             "12304/tags/release-28/common/transforms/Latin-ASCII.xml>.",
        type=str, dest='latinAsciiFilePath')
    parser.add_argument(
        "--no-ligatures-expansion",
        help="Do not expand ligatures and do not use Unicode CLDR "
             "Latin-ASCII transliterator. By default, this option is not "
             "enabled and \"--latin-ascii-file\" argument is required. If "
             "this option is enabled, \"--latin-ascii-file\" argument is "
             "optional and ignored.",
        action="store_true", dest='noLigaturesExpansion')
    args = parser.parse_args()

    # The CLDR file is only needed when ligature expansion is enabled.
    if not args.noLigaturesExpansion and args.latinAsciiFilePath is None:
        # End the message with a newline so the shell prompt does not end
        # up glued to it.
        sys.stderr.write(
            'You must specify the path to Latin-ASCII transliterator file '
            'with "--latin-ascii-file" option or use '
            '"--no-ligatures-expansion" option. Use "-h" option for help.\n')
        sys.exit(1)

    main(args)
0 commit comments