Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit ec0a69e

Browse files
committed
Extend the default rules file for contrib/unaccent with Vietnamese letters.
Improve generate_unaccent_rules.py to handle composed characters whose base is another composed character rather than a plain letter. The net effect of this is to add a bunch of multi-accented Vietnamese characters to unaccent.rules. Original complaint from Kha Nguyen, diagnosis of the script's shortcoming by Thomas Munro. Dang Minh Huong and Michael Paquier. Discussion: https://postgr.es/m/CALo3sF6EC8cy1F2JUz=GRf5h4LMUJTaG3qpdoiLrNbWEXL-tRg@mail.gmail.com
1 parent 2b74303 commit ec0a69e

File tree

2 files changed

+145
-8
lines changed

2 files changed

+145
-8
lines changed

contrib/unaccent/generate_unaccent_rules.py

+31-8
Original file line numberDiff line numberDiff line change
@@ -48,24 +48,47 @@ def is_mark(codepoint):
4848
return codepoint.general_category in ("Mn", "Me", "Mc")
4949

5050
def is_letter_with_marks(codepoint, table):
51-
"""Returns true for plain letters combined with one or more marks."""
51+
"""Returns true for letters combined with one or more marks."""
5252
# See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
53-
return len(codepoint.combining_ids) > 1 and \
54-
is_plain_letter(table[codepoint.combining_ids[0]]) and \
55-
all(is_mark(table[i]) for i in codepoint.combining_ids[1:])
53+
54+
# Letter may have no combining characters, in which case it has
55+
# no marks.
56+
if len(codepoint.combining_ids) == 1:
57+
return False
58+
59+
# A letter without diacritical marks has none of them.
60+
if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False:
61+
return False
62+
63+
# Check if the base letter of this letter has marks.
64+
codepoint_base = codepoint.combining_ids[0]
65+
if (is_plain_letter(table[codepoint_base]) is False and \
66+
is_letter_with_marks(table[codepoint_base], table) is False):
67+
return False
68+
69+
return True
5670

5771
def is_letter(codepoint, table):
5872
"""Return true for letter with or without diacritical marks."""
5973
return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
6074

6175
def get_plain_letter(codepoint, table):
62-
"""Return the base codepoint without marks."""
76+
"""Return the base codepoint without marks. If this codepoint has more
77+
than one combining character, do a recursive lookup on the table to
78+
find out its plain base letter."""
6379
if is_letter_with_marks(codepoint, table):
64-
return table[codepoint.combining_ids[0]]
80+
if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
81+
return get_plain_letter(table[codepoint.combining_ids[0]], table)
82+
elif is_plain_letter(table[codepoint.combining_ids[0]]):
83+
return table[codepoint.combining_ids[0]]
84+
85+
# Should not come here
86+
assert(False)
6587
elif is_plain_letter(codepoint):
6688
return codepoint
67-
else:
68-
raise "mu"
89+
90+
# Should not come here
91+
assert(False)
6992

7093
def is_ligature(codepoint, table):
7194
"""Return true for letters combined with letters."""

contrib/unaccent/unaccent.rules

+114
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,18 @@
254254
ǒ o
255255
Ǔ U
256256
ǔ u
257+
Ǖ U
258+
ǖ u
259+
Ǘ U
260+
ǘ u
261+
Ǚ U
262+
ǚ u
263+
Ǜ U
264+
ǜ u
265+
Ǟ A
266+
ǟ a
267+
Ǡ A
268+
ǡ a
257269
Ǥ G
258270
ǥ g
259271
Ǧ G
@@ -262,6 +274,8 @@
262274
ǩ k
263275
Ǫ O
264276
ǫ o
277+
Ǭ O
278+
ǭ o
265279
ǰ j
266280
DZ DZ
267281
Dz Dz
@@ -270,6 +284,8 @@
270284
ǵ g
271285
Ǹ N
272286
ǹ n
287+
Ǻ A
288+
ǻ a
273289
Ȁ A
274290
ȁ a
275291
Ȃ A
@@ -307,8 +323,14 @@
307323
ȧ a
308324
Ȩ E
309325
ȩ e
326+
Ȫ O
327+
ȫ o
328+
Ȭ O
329+
ȭ o
310330
Ȯ O
311331
ȯ o
332+
Ȱ O
333+
ȱ o
312334
Ȳ Y
313335
ȳ y
314336
ȴ l
@@ -441,6 +463,8 @@
441463
ḅ b
442464
Ḇ B
443465
ḇ b
466+
Ḉ C
467+
ḉ c
444468
Ḋ D
445469
ḋ d
446470
Ḍ D
@@ -451,10 +475,16 @@
451475
ḑ d
452476
Ḓ D
453477
ḓ d
478+
Ḕ E
479+
ḕ e
480+
Ḗ E
481+
ḗ e
454482
Ḙ E
455483
ḙ e
456484
Ḛ E
457485
ḛ e
486+
Ḝ E
487+
ḝ e
458488
Ḟ F
459489
ḟ f
460490
Ḡ G
@@ -471,6 +501,8 @@
471501
ḫ h
472502
Ḭ I
473503
ḭ i
504+
Ḯ I
505+
ḯ i
474506
Ḱ K
475507
ḱ k
476508
Ḳ K
@@ -479,6 +511,8 @@
479511
ḵ k
480512
Ḷ L
481513
ḷ l
514+
Ḹ L
515+
ḹ l
482516
Ḻ L
483517
ḻ l
484518
Ḽ L
@@ -497,6 +531,14 @@
497531
ṉ n
498532
Ṋ N
499533
ṋ n
534+
Ṍ O
535+
ṍ o
536+
Ṏ O
537+
ṏ o
538+
Ṑ O
539+
ṑ o
540+
Ṓ O
541+
ṓ o
500542
Ṕ P
501543
ṕ p
502544
Ṗ P
@@ -505,12 +547,20 @@
505547
ṙ r
506548
Ṛ R
507549
ṛ r
550+
Ṝ R
551+
ṝ r
508552
Ṟ R
509553
ṟ r
510554
Ṡ S
511555
ṡ s
512556
Ṣ S
513557
ṣ s
558+
Ṥ S
559+
ṥ s
560+
Ṧ S
561+
ṧ s
562+
Ṩ S
563+
ṩ s
514564
Ṫ T
515565
ṫ t
516566
Ṭ T
@@ -525,6 +575,10 @@
525575
ṵ u
526576
Ṷ U
527577
ṷ u
578+
Ṹ U
579+
ṹ u
580+
Ṻ U
581+
ṻ u
528582
Ṽ V
529583
ṽ v
530584
Ṿ V
@@ -563,12 +617,42 @@
563617
ạ a
564618
Ả A
565619
ả a
620+
Ấ A
621+
ấ a
622+
Ầ A
623+
ầ a
624+
Ẩ A
625+
ẩ a
626+
Ẫ A
627+
ẫ a
628+
Ậ A
629+
ậ a
630+
Ắ A
631+
ắ a
632+
Ằ A
633+
ằ a
634+
Ẳ A
635+
ẳ a
636+
Ẵ A
637+
ẵ a
638+
Ặ A
639+
ặ a
566640
Ẹ E
567641
ẹ e
568642
Ẻ E
569643
ẻ e
570644
Ẽ E
571645
ẽ e
646+
Ế E
647+
ế e
648+
Ề E
649+
ề e
650+
Ể E
651+
ể e
652+
Ễ E
653+
ễ e
654+
Ệ E
655+
ệ e
572656
Ỉ I
573657
ỉ i
574658
Ị I
@@ -577,10 +661,40 @@
577661
ọ o
578662
Ỏ O
579663
ỏ o
664+
Ố O
665+
ố o
666+
Ồ O
667+
ồ o
668+
Ổ O
669+
ổ o
670+
Ỗ O
671+
ỗ o
672+
Ộ O
673+
ộ o
674+
Ớ O
675+
ớ o
676+
Ờ O
677+
ờ o
678+
Ở O
679+
ở o
680+
Ỡ O
681+
ỡ o
682+
Ợ O
683+
ợ o
580684
Ụ U
581685
ụ u
582686
Ủ U
583687
ủ u
688+
Ứ U
689+
ứ u
690+
Ừ U
691+
ừ u
692+
Ử U
693+
ử u
694+
Ữ U
695+
ữ u
696+
Ự U
697+
ự u
584698
Ỳ Y
585699
ỳ y
586700
Ỵ Y

0 commit comments

Comments
 (0)