Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 456e371

Browse files
committed
Add combining characters to unaccent.rules.
Strip certain classes of combining characters, so that accents encoded this way are removed. Author: Hugh Ranalli Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f%40postgresql.org
1 parent 80579f9 commit 456e371

File tree

4 files changed

+157
-1
lines changed

4 files changed

+157
-1
lines changed

contrib/unaccent/expected/unaccent.out

+18
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ SELECT unaccent('˃˖˗˜');
3131
>+-~
3232
(1 row)
3333

34+
SELECT unaccent('À'); -- Remove combining diacritical 0x0300
35+
unaccent
36+
----------
37+
A
38+
(1 row)
39+
3440
SELECT unaccent('unaccent', 'foobar');
3541
unaccent
3642
----------
@@ -55,6 +61,12 @@ SELECT unaccent('unaccent', '˃˖˗˜');
5561
>+-~
5662
(1 row)
5763

64+
SELECT unaccent('unaccent', 'À');
65+
unaccent
66+
----------
67+
A
68+
(1 row)
69+
5870
SELECT ts_lexize('unaccent', 'foobar');
5971
ts_lexize
6072
-----------
@@ -79,3 +91,9 @@ SELECT ts_lexize('unaccent', '˃˖˗˜');
7991
{>+-~}
8092
(1 row)
8193

94+
SELECT ts_lexize('unaccent', 'À');
95+
ts_lexize
96+
-----------
97+
{A}
98+
(1 row)
99+

contrib/unaccent/generate_unaccent_rules.py

+30-1
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,42 @@ def bytes(source, encoding='ascii', errors='strict'):
6161
(0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
6262
(0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
6363

64+
# Combining marks follow a "base" character, and result in a composite
65+
# character. Example: "U&'A\0300'" produces "À". There are three types of
66+
# combining marks: enclosing (Me), non-spacing combining (Mn), spacing
67+
# combining (Mc). We identify the ranges of marks we feel safe removing.
68+
# References:
69+
# https://en.wikipedia.org/wiki/Combining_character
70+
# https://www.unicode.org/charts/PDF/U0300.pdf
71+
# https://www.unicode.org/charts/PDF/U20D0.pdf
72+
COMBINING_MARK_RANGES = ((0x0300, 0x0362), # Mn: Accents, IPA
73+
(0x20dd, 0x20E0), # Me: Symbols
74+
(0x20e2, 0x20e4),) # Me: Screen, keycap, triangle
75+
6476
def print_record(codepoint, letter):
65-
print (chr(codepoint) + "\t" + letter)
77+
if letter:
78+
output = chr(codepoint) + "\t" + letter
79+
else:
80+
output = chr(codepoint)
81+
82+
print(output)
6683

6784
class Codepoint:
6885
def __init__(self, id, general_category, combining_ids):
6986
self.id = id
7087
self.general_category = general_category
7188
self.combining_ids = combining_ids
7289

90+
def is_mark_to_remove(codepoint):
91+
"""Return true if this is a combining mark to remove."""
92+
if not is_mark(codepoint):
93+
return False
94+
95+
for begin, end in COMBINING_MARK_RANGES:
96+
if codepoint.id >= begin and codepoint.id <= end:
97+
return True
98+
return False
99+
73100
def is_plain_letter(codepoint):
74101
"""Return true if codepoint represents a "plain letter"."""
75102
for begin, end in PLAIN_LETTER_RANGES:
@@ -234,6 +261,8 @@ def main(args):
234261
"".join(chr(combining_codepoint.id)
235262
for combining_codepoint \
236263
in get_plain_letters(codepoint, table))))
264+
elif is_mark_to_remove(codepoint):
265+
charactersSet.add((codepoint.id, None))
237266

238267
# add CLDR Latin-ASCII characters
239268
if not args.noLigaturesExpansion:

contrib/unaccent/sql/unaccent.sql

+3
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,16 @@ SELECT unaccent('foobar');
99
SELECT unaccent('ёлка');
1010
SELECT unaccent('ЁЖИК');
1111
SELECT unaccent('˃˖˗˜');
12+
SELECT unaccent('À'); -- Remove combining diacritical 0x0300
1213

1314
SELECT unaccent('unaccent', 'foobar');
1415
SELECT unaccent('unaccent', 'ёлка');
1516
SELECT unaccent('unaccent', 'ЁЖИК');
1617
SELECT unaccent('unaccent', '˃˖˗˜');
18+
SELECT unaccent('unaccent', 'À');
1719

1820
SELECT ts_lexize('unaccent', 'foobar');
1921
SELECT ts_lexize('unaccent', 'ёлка');
2022
SELECT ts_lexize('unaccent', 'ЁЖИК');
2123
SELECT ts_lexize('unaccent', '˃˖˗˜');
24+
SELECT ts_lexize('unaccent', 'À');

contrib/unaccent/unaccent.rules

+106
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,105 @@
414414
˖ +
415415
˗ -
416416
˜ ~
417+
̀
418+
́
419+
̂
420+
̃
421+
̄
422+
̅
423+
̆
424+
̇
425+
̈
426+
̉
427+
̊
428+
̋
429+
̌
430+
̍
431+
̎
432+
̏
433+
̐
434+
̑
435+
̒
436+
̓
437+
̔
438+
̕
439+
̖
440+
̗
441+
̘
442+
̙
443+
̚
444+
̛
445+
̜
446+
̝
447+
̞
448+
̟
449+
̠
450+
̡
451+
̢
452+
̣
453+
̤
454+
̥
455+
̦
456+
̧
457+
̨
458+
̩
459+
̪
460+
̫
461+
̬
462+
̭
463+
̮
464+
̯
465+
̰
466+
̱
467+
̲
468+
̳
469+
̴
470+
̵
471+
̶
472+
̷
473+
̸
474+
̹
475+
̺
476+
̻
477+
̼
478+
̽
479+
̾
480+
̿
481+
̀
482+
́
483+
͂
484+
̓
485+
̈́
486+
ͅ
487+
͆
488+
͇
489+
͈
490+
͉
491+
͊
492+
͋
493+
͌
494+
͍
495+
͎
496+
͏
497+
͐
498+
͑
499+
͒
500+
͓
501+
͔
502+
͕
503+
͖
504+
͗
505+
͘
506+
͙
507+
͚
508+
͛
509+
͜
510+
͝
511+
͞
512+
͟
513+
͠
514+
͡
515+
͢
417516
Ά Α
418517
Έ Ε
419518
Ή Η
@@ -982,6 +1081,13 @@
9821081
₧ Pts
9831082
₹ Rs
9841083
₺ TL
1084+
⃝
1085+
⃞
1086+
⃟
1087+
⃠
1088+
⃢
1089+
⃣
1090+
⃤
9851091
℀ a/c
9861092
℁ a/s
9871093
ℂ C

0 commit comments

Comments
 (0)