Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 1bbd52c

Browse files
committed
Make unaccent handle all diacritics known to Unicode, and expand ligatures correctly
Add Python script for building unaccent.rules from Unicode data. Don't backpatch because unaccent changes may require tsvector/index rebuild. Thomas Munro <thomas.munro@enterprisedb.com>
1 parent 4aec498 commit 1bbd52c

File tree

2 files changed

+415
-66
lines changed

2 files changed

+415
-66
lines changed
+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#!/usr/bin/python
2+
#
3+
# This script builds unaccent.rules on standard output when given the
4+
# contents of UnicodeData.txt[1] on standard input. Optionally includes
5+
# ligature expansion, if --expand-ligatures is given on the command line.
6+
#
7+
# The approach is to use the Unicode decomposition data to identify
8+
# precomposed codepoints that are equivalent to a ligature of several
9+
# letters, or a base letter with any number of diacritical marks.
10+
# There is also a small set of special cases for codepoints that we
11+
# traditionally support even though Unicode doesn't consider them to
12+
# be ligatures or letters with marks.
13+
#
14+
# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
15+
16+
import re
17+
import sys
18+
19+
def print_record(codepoint, letter):
    """Write one unaccent rule to standard output, encoded as UTF-8.

    The rule consists of the character for ``codepoint``, a tab, the
    replacement text ``letter`` and a newline.

    codepoint -- integer code point of the character to be replaced
    letter    -- replacement text
    """
    try:
        to_char = unichr  # Python 2: chr() only handles 0-255
    except NameError:
        to_char = chr  # Python 3: chr() handles every code point
    record = to_char(codepoint) + u"\t" + letter + u"\n"
    # Emit bytes so the output is UTF-8 whatever the locale is.  Python 3
    # exposes the underlying byte stream as sys.stdout.buffer; on Python 2
    # sys.stdout accepts byte strings directly.
    getattr(sys.stdout, "buffer", sys.stdout).write(record.encode("UTF-8"))
21+
22+
class Codepoint(object):
    """One character record, reduced to the fields this script needs.

    id               -- the code point
    general_category -- the general category code (compared against
                        values such as "Mn" elsewhere in this script)
    combining_ids    -- sequence of code points the character decomposes
                        into; empty when there is no decomposition
    """

    def __init__(self, id, general_category, combining_ids):
        self.id = id
        self.general_category = general_category
        self.combining_ids = combining_ids

    def __repr__(self):
        # Makes table dumps and assertion failures readable when debugging.
        return "Codepoint(%r, %r, %r)" % (
            self.id, self.general_category, self.combining_ids)
27+
28+
def is_plain_letter(codepoint):
    """Tell whether codepoint is one of the ASCII letters a-z or A-Z."""
    value = codepoint.id
    if ord('a') <= value <= ord('z'):
        return True
    return ord('A') <= value <= ord('Z')
32+
33+
def is_mark(codepoint):
    """Tell whether codepoint's general category is Mn, Me or Mc."""
    category = codepoint.general_category
    return category == "Mn" or category == "Me" or category == "Mc"
36+
37+
def is_letter_with_marks(codepoint, table):
    """Tell whether codepoint decomposes into a plain letter plus marks.

    The decomposition must have at least two elements: the first must be
    a plain ASCII letter and every following one a mark.  ``table`` maps
    code point numbers to the objects describing them.
    """
    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
    parts = codepoint.combining_ids
    if len(parts) <= 1:
        return False
    if not is_plain_letter(table[parts[0]]):
        return False
    for mark_id in parts[1:]:
        if not is_mark(table[mark_id]):
            return False
    return True
43+
44+
def is_letter(codepoint, table):
    """Tell whether codepoint is a plain letter, with or without marks."""
    if is_plain_letter(codepoint):
        return True
    return is_letter_with_marks(codepoint, table)
47+
48+
def get_plain_letter(codepoint, table):
    """Return the base codepoint without marks.

    For a letter with marks this is the table entry of the first element
    of its decomposition; a plain letter is returned unchanged.

    Raises ValueError if codepoint is neither of those.
    """
    if is_letter_with_marks(codepoint, table):
        return table[codepoint.combining_ids[0]]
    elif is_plain_letter(codepoint):
        return codepoint
    else:
        # String exceptions (the former 'raise "mu"') are not valid in
        # Python 2.6+ or 3; raise a real exception that names the culprit.
        raise ValueError("codepoint U+%04X is not a letter" % codepoint.id)
56+
57+
def is_ligature(codepoint, table):
    """Tell whether every element of the decomposition is a letter."""
    for part_id in codepoint.combining_ids:
        if not is_letter(table[part_id], table):
            return False
    return True
60+
61+
def get_plain_letters(codepoint, table):
    """Collect the base letters of a ligature, in decomposition order."""
    assert is_ligature(codepoint, table)
    letters = []
    for part_id in codepoint.combining_ids:
        letters.append(get_plain_letter(table[part_id], table))
    return letters
65+
66+
def main(expand_ligatures):
    """Read UnicodeData.txt on stdin and print unaccent rules on stdout.

    expand_ligatures -- when true, also emit rules that replace ligatures
                        by the letters they are made of
    """
    try:
        to_char = unichr  # Python 2: chr() only handles 0-255
    except NameError:
        to_char = chr  # Python 3: chr() handles every code point

    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
    decomposition_type_pattern = re.compile(" *<[^>]*> *")

    table = {}
    codepoints = []

    # read everything we need into memory
    for line in sys.stdin:
        fields = line.split(";")
        if len(fields) > 5:
            # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
            general_category = fields[2]
            # drop a decomposition type tag such as "<compat>", keeping
            # only the hexadecimal code points
            decomposition = re.sub(decomposition_type_pattern, ' ', fields[5])
            codepoint_id = int(fields[0], 16)
            combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
            codepoint = Codepoint(codepoint_id, general_category, combining_ids)
            table[codepoint_id] = codepoint
            codepoints.append(codepoint)

    # walk through all the codepoints looking for interesting mappings
    for codepoint in codepoints:
        if codepoint.general_category.startswith('L') and \
           len(codepoint.combining_ids) > 1:
            if is_letter_with_marks(codepoint, table):
                print_record(codepoint.id,
                             to_char(get_plain_letter(codepoint, table).id))
            elif expand_ligatures and is_ligature(codepoint, table):
                print_record(codepoint.id,
                             "".join(to_char(combining_codepoint.id)
                                     for combining_codepoint
                                     in get_plain_letters(codepoint, table)))

    # some special cases
    print_record(0x00d8, "O")  # LATIN CAPITAL LETTER O WITH STROKE
    print_record(0x00f8, "o")  # LATIN SMALL LETTER O WITH STROKE
    print_record(0x0110, "D")  # LATIN CAPITAL LETTER D WITH STROKE
    print_record(0x0111, "d")  # LATIN SMALL LETTER D WITH STROKE
    print_record(0x0131, "i")  # LATIN SMALL LETTER DOTLESS I
    print_record(0x0126, "H")  # LATIN CAPITAL LETTER H WITH STROKE
    print_record(0x0127, "h")  # LATIN SMALL LETTER H WITH STROKE
    print_record(0x0141, "L")  # LATIN CAPITAL LETTER L WITH STROKE
    print_record(0x0142, "l")  # LATIN SMALL LETTER L WITH STROKE
    print_record(0x0149, "'n")  # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
    print_record(0x0166, "T")  # LATIN CAPITAL LETTER T WITH STROKE
    print_record(0x0167, "t")  # LATIN SMALL LETTER T WITH STROKE
    print_record(0x0401, u"\u0415")  # CYRILLIC CAPITAL LETTER IO
    print_record(0x0451, u"\u0435")  # CYRILLIC SMALL LETTER IO
    if expand_ligatures:
        print_record(0x00c6, "AE")  # LATIN CAPITAL LETTER AE
        print_record(0x00df, "ss")  # LATIN SMALL LETTER SHARP S
        print_record(0x00e6, "ae")  # LATIN SMALL LETTER AE
        print_record(0x0152, "OE")  # LATIN CAPITAL LIGATURE OE
        print_record(0x0153, "oe")  # LATIN SMALL LIGATURE OE
121+
122+
if __name__ == "__main__":
    # Ligature expansion is enabled only when the sole command-line
    # argument is --expand-ligatures.
    arguments = sys.argv[1:]
    main(arguments == ["--expand-ligatures"])

0 commit comments

Comments
 (0)