Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 0bd9c62

Browse files
committed
Treat Unicode codepoints of category "Format" as non-spacing
Commit d8594d1 updated the list of non-spacing codepoints used for calculating display width, but in doing so inadvertently removed some, since the script used for that commit only considered combining characters. For complete coverage of zero-width characters, include codepoints in the category Cf (Format). To reflect the wider purpose, also rename files and update comments that referred specifically to combining characters. Some of these ranges have been missing since v12, but due to lack of field complaints it was determined not important enough to justify adding special-case logic to the backbranches.
Kyotaro Horiguchi
Report by Pavel Stehule
Discussion: https://www.postgresql.org/message-id/flat/CAFj8pRBE8yvpQ0FSkPCoe0Ny1jAAsAQ6j3qMgVwWvkqAoaaNmQ%40mail.gmail.com
1 parent bb629c2 commit 0bd9c62

File tree

4 files changed

+34
-23
lines changed

4 files changed

+34
-23
lines changed

src/common/unicode/Makefile

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ LIBS += $(PTHREAD_LIBS)
1818
# By default, do nothing.
1919
all:
2020

21-
update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
21+
update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
2222
mv $^ $(top_srcdir)/src/include/common/
2323
$(MAKE) normalization-check
2424

@@ -35,7 +35,7 @@ unicode_norm_hashfunc.h: unicode_norm_table.h
3535
unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt
3636
$(PERL) $<
3737

38-
unicode_combining_table.h: generate-unicode_combining_table.pl UnicodeData.txt
38+
unicode_nonspacing_table.h: generate-unicode_nonspacing_table.pl UnicodeData.txt
3939
$(PERL) $^ >$@
4040

4141
unicode_east_asian_fw_table.h: generate-unicode_east_asian_fw_table.pl EastAsianWidth.txt

src/common/unicode/generate-unicode_combining_table.pl renamed to src/common/unicode/generate-unicode_nonspacing_table.pl

+7-5
Original file line numberDiff line numberDiff line change
@@ -15,27 +15,29 @@
1515
my $count = 0;
1616

1717
print
18-
"/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */\n\n";
18+
"/* generated by src/common/unicode/generate-unicode_nonspacing_table.pl, do not edit */\n\n";
1919

20-
print "static const struct mbinterval combining[] = {\n";
20+
print "static const struct mbinterval nonspacing[] = {\n";
2121

2222
foreach my $line (<ARGV>)
2323
{
2424
chomp $line;
2525
my @fields = split ';', $line;
2626
$codepoint = hex $fields[0];
2727

28-
if ($fields[2] eq 'Me' || $fields[2] eq 'Mn')
28+
# Me and Mn refer to combining characters
29+
# Cf refers to format characters
30+
if ($fields[2] eq 'Me' || $fields[2] eq 'Mn' || $fields[2] eq 'Cf')
2931
{
30-
# combining character, save for start of range
32+
# non-spacing character, save for start of range
3133
if (!defined($range_start))
3234
{
3335
$range_start = $codepoint;
3436
}
3537
}
3638
else
3739
{
38-
# not a combining character, print out previous range if any
40+
# not a non-spacing character, print out previous range if any
3941
if (defined($range_start))
4042
{
4143
printf "\t{0x%04X, 0x%04X},\n", $range_start, $prev_codepoint;

src/common/wchar.c

+4-4
Original file line numberDiff line numberDiff line change
@@ -620,7 +620,7 @@ mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
620620
* value of -1.
621621
*
622622
* - Non-spacing and enclosing combining characters (general
623-
* category code Mn or Me in the Unicode database) have a
623+
* category code Mn, Me or Cf in the Unicode database) have a
624624
* column width of 0.
625625
*
626626
* - Spacing characters in the East Asian Wide (W) or East Asian
@@ -638,7 +638,7 @@ mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
638638
static int
639639
ucs_wcwidth(pg_wchar ucs)
640640
{
641-
#include "common/unicode_combining_table.h"
641+
#include "common/unicode_nonspacing_table.h"
642642
#include "common/unicode_east_asian_fw_table.h"
643643

644644
/* test for 8-bit control characters */
@@ -657,8 +657,8 @@ ucs_wcwidth(pg_wchar ucs)
657657
* factor for display width leads to the correct behavior, so do that
658658
* search first.
659659
*/
660-
if (mbbisearch(ucs, combining,
661-
sizeof(combining) / sizeof(struct mbinterval) - 1))
660+
if (mbbisearch(ucs, nonspacing,
661+
sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
662662
return 0;
663663

664664
/* binary search in table of wide characters */

src/include/common/unicode_combining_table.h renamed to src/include/common/unicode_nonspacing_table.h

+21-12
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,24 @@
1-
/* generated by src/common/unicode/generate-unicode_combining_table.pl, do not edit */
1+
/* generated by src/common/unicode/generate-unicode_nonspacing_table.pl, do not edit */
22

3-
static const struct mbinterval combining[] = {
3+
static const struct mbinterval nonspacing[] = {
4+
{0x00AD, 0x00AD},
45
{0x0300, 0x036F},
56
{0x0483, 0x0489},
67
{0x0591, 0x05BD},
78
{0x05BF, 0x05BF},
89
{0x05C1, 0x05C2},
910
{0x05C4, 0x05C5},
1011
{0x05C7, 0x05C7},
12+
{0x0600, 0x0605},
1113
{0x0610, 0x061A},
14+
{0x061C, 0x061C},
1215
{0x064B, 0x065F},
1316
{0x0670, 0x0670},
14-
{0x06D6, 0x06DC},
17+
{0x06D6, 0x06DD},
1518
{0x06DF, 0x06E4},
1619
{0x06E7, 0x06E8},
1720
{0x06EA, 0x06ED},
21+
{0x070F, 0x070F},
1822
{0x0711, 0x0711},
1923
{0x0730, 0x074A},
2024
{0x07A6, 0x07B0},
@@ -25,9 +29,8 @@ static const struct mbinterval combining[] = {
2529
{0x0825, 0x0827},
2630
{0x0829, 0x082D},
2731
{0x0859, 0x085B},
28-
{0x0898, 0x089F},
29-
{0x08CA, 0x08E1},
30-
{0x08E3, 0x0902},
32+
{0x0890, 0x089F},
33+
{0x08CA, 0x0902},
3134
{0x093A, 0x093A},
3235
{0x093C, 0x093C},
3336
{0x0941, 0x0948},
@@ -114,8 +117,7 @@ static const struct mbinterval combining[] = {
114117
{0x17C6, 0x17C6},
115118
{0x17C9, 0x17D3},
116119
{0x17DD, 0x17DD},
117-
{0x180B, 0x180D},
118-
{0x180F, 0x180F},
120+
{0x180B, 0x180F},
119121
{0x1885, 0x1886},
120122
{0x18A9, 0x18A9},
121123
{0x1920, 0x1922},
@@ -152,6 +154,9 @@ static const struct mbinterval combining[] = {
152154
{0x1CF4, 0x1CF4},
153155
{0x1CF8, 0x1CF9},
154156
{0x1DC0, 0x1DFF},
157+
{0x200B, 0x200F},
158+
{0x202A, 0x202E},
159+
{0x2060, 0x206F},
155160
{0x20D0, 0x20F0},
156161
{0x2CEF, 0x2CF1},
157162
{0x2D7F, 0x2D7F},
@@ -196,6 +201,8 @@ static const struct mbinterval combining[] = {
196201
{0xFB1E, 0xFB1E},
197202
{0xFE00, 0xFE0F},
198203
{0xFE20, 0xFE2F},
204+
{0xFEFF, 0xFEFF},
205+
{0xFFF9, 0xFFFB},
199206
{0x101FD, 0x101FD},
200207
{0x102E0, 0x102E0},
201208
{0x10376, 0x1037A},
@@ -213,7 +220,8 @@ static const struct mbinterval combining[] = {
213220
{0x1107F, 0x11081},
214221
{0x110B3, 0x110B6},
215222
{0x110B9, 0x110BA},
216-
{0x110C2, 0x110C2},
223+
{0x110BD, 0x110BD},
224+
{0x110C2, 0x110CD},
217225
{0x11100, 0x11102},
218226
{0x11127, 0x1112B},
219227
{0x1112D, 0x11134},
@@ -281,15 +289,16 @@ static const struct mbinterval combining[] = {
281289
{0x11D95, 0x11D95},
282290
{0x11D97, 0x11D97},
283291
{0x11EF3, 0x11EF4},
292+
{0x13430, 0x13438},
284293
{0x16AF0, 0x16AF4},
285294
{0x16B30, 0x16B36},
286295
{0x16F4F, 0x16F4F},
287296
{0x16F8F, 0x16F92},
288297
{0x16FE4, 0x16FE4},
289298
{0x1BC9D, 0x1BC9E},
290-
{0x1CF00, 0x1CF46},
299+
{0x1BCA0, 0x1CF46},
291300
{0x1D167, 0x1D169},
292-
{0x1D17B, 0x1D182},
301+
{0x1D173, 0x1D182},
293302
{0x1D185, 0x1D18B},
294303
{0x1D1AA, 0x1D1AD},
295304
{0x1D242, 0x1D244},
@@ -304,5 +313,5 @@ static const struct mbinterval combining[] = {
304313
{0x1E2EC, 0x1E2EF},
305314
{0x1E8D0, 0x1E8D6},
306315
{0x1E944, 0x1E94A},
307-
{0xE0100, 0xE01EF},
316+
{0xE0001, 0xE01EF},
308317
};

0 commit comments

Comments
 (0)