Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 4e7f62b

Browse files
committed
Add support for Unicode case folding.
Expand case mapping tables to include entries for case folding, which are parsed from CaseFolding.txt. Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com
1 parent 7921927 commit 4e7f62b

File tree

7 files changed

+3280
-3125
lines changed

7 files changed

+3280
-3125
lines changed

src/common/unicode/Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,13 @@ update-unicode: unicode_case_table.h unicode_category_table.h unicode_east_asian
3030
# These files are part of the Unicode Character Database. Download
3131
# them on demand. The dependency on Makefile.global is for
3232
# UNICODE_VERSION.
33-
CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
33+
CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt: $(top_builddir)/src/Makefile.global
3434
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
3535

3636
unicode_version.h: generate-unicode_version.pl
3737
$(PERL) $< --version $(UNICODE_VERSION)
3838

39-
unicode_case_table.h: generate-unicode_case_table.pl UnicodeData.txt
39+
unicode_case_table.h: generate-unicode_case_table.pl CaseFolding.txt UnicodeData.txt
4040
$(PERL) $<
4141

4242
unicode_category_table.h: generate-unicode_category_table.pl DerivedCoreProperties.txt PropList.txt UnicodeData.txt
@@ -91,4 +91,4 @@ clean:
9191
rm -f $(OBJS) case_test case_test.o category_test category_test.o norm_test norm_test.o
9292

9393
distclean: clean
94-
rm -f CompositionExclusions.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h
94+
rm -f CompositionExclusions.txt CaseFolding.txt DerivedCoreProperties.txt DerivedNormalizationProps.txt EastAsianWidth.txt NormalizationTest.txt PropList.txt SpecialCasing.txt UnicodeData.txt norm_test_table.h unicode_case_table.h unicode_category_table.h unicode_norm_table.h

src/common/unicode/case_test.c

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,17 +81,20 @@ icu_test_simple(pg_wchar code)
8181
pg_wchar lower = unicode_lowercase_simple(code);
8282
pg_wchar title = unicode_titlecase_simple(code);
8383
pg_wchar upper = unicode_uppercase_simple(code);
84+
pg_wchar fold = unicode_casefold_simple(code);
8485
pg_wchar iculower = u_tolower(code);
8586
pg_wchar icutitle = u_totitle(code);
8687
pg_wchar icuupper = u_toupper(code);
88+
pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
8789

88-
if (lower != iculower || title != icutitle || upper != icuupper)
90+
if (lower != iculower || title != icutitle || upper != icuupper ||
91+
fold != icufold)
8992
{
9093
printf("case_test: FAILURE for codepoint 0x%06x\n", code);
91-
printf("case_test: Postgres lower/title/upper: 0x%06x/0x%06x/0x%06x\n",
92-
lower, title, upper);
93-
printf("case_test: ICU lower/title/upper: 0x%06x/0x%06x/0x%06x\n",
94-
iculower, icutitle, icuupper);
94+
printf("case_test: Postgres lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
95+
lower, title, upper, fold);
96+
printf("case_test: ICU lower/title/upper/fold: 0x%06x/0x%06x/0x%06x/0x%06x\n",
97+
iculower, icutitle, icuupper, icufold);
9598
printf("\n");
9699
exit(1);
97100
}
@@ -103,9 +106,11 @@ icu_test_full(char *str)
103106
char lower[BUFSZ];
104107
char title[BUFSZ];
105108
char upper[BUFSZ];
109+
char fold[BUFSZ];
106110
char icu_lower[BUFSZ];
107111
char icu_title[BUFSZ];
108112
char icu_upper[BUFSZ];
113+
char icu_fold[BUFSZ];
109114
UErrorCode status;
110115
struct WordBoundaryState wbstate = {
111116
.str = str,
@@ -118,12 +123,15 @@ icu_test_full(char *str)
118123
unicode_strlower(lower, BUFSZ, str, -1, true);
119124
unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate);
120125
unicode_strupper(upper, BUFSZ, str, -1, true);
126+
unicode_strfold(fold, BUFSZ, str, -1, true);
121127
status = U_ZERO_ERROR;
122128
ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status);
123129
status = U_ZERO_ERROR;
124130
ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status);
125131
status = U_ZERO_ERROR;
126132
ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status);
133+
status = U_ZERO_ERROR;
134+
ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, -1, &status);
127135

128136
if (strcmp(lower, icu_lower) != 0)
129137
{
@@ -143,6 +151,12 @@ icu_test_full(char *str)
143151
icu_upper);
144152
exit(1);
145153
}
154+
if (strcmp(fold, icu_fold) != 0)
155+
{
156+
printf("case_test: str='%s' fold='%s' icu_fold='%s'\n", str, fold,
157+
icu_fold);
158+
exit(1);
159+
}
146160
}
147161

148162
/*
@@ -302,6 +316,12 @@ tfunc_upper(char *dst, size_t dstsize, const char *src,
302316
return unicode_strupper(dst, dstsize, src, srclen, true);
303317
}
304318

319+
/*
 * Adapter with the callback signature expected by test_convert(): case
 * fold src into dst using unicode_strfold(), always requesting the full
 * (rather than simple) mapping. Returns whatever unicode_strfold() returns.
 */
static size_t
tfunc_fold(char *dst, size_t dstsize, const char *src,
		   ssize_t srclen)
{
	const bool	full = true;

	return unicode_strfold(dst, dstsize, src, srclen, full);
}
305325

306326
static void
307327
test_convert_case()
@@ -318,10 +338,12 @@ test_convert_case()
318338
test_convert(tfunc_upper, "ß", "SS");
319339
test_convert(tfunc_lower, "ıiIİ", "ıiii\u0307");
320340
test_convert(tfunc_upper, "ıiIİ", "IIIİ");
341+
test_convert(tfunc_fold, "ıiIİ", "ıiii\u0307");
321342
/* test final sigma */
322343
test_convert(tfunc_lower, "σςΣ ΣΣΣ", "σςς σσς");
323344
test_convert(tfunc_lower, "σς'Σ' ΣΣ'Σ'", "σς'ς' σσ'ς'");
324345
test_convert(tfunc_title, "σςΣ ΣΣΣ", "Σςς Σσς");
346+
test_convert(tfunc_fold, "σςΣ ΣΣΣ", "σσσ σσσ");
325347

326348
#ifdef USE_ICU
327349
icu_test_full("");

src/common/unicode/generate-unicode_case_table.pl

Lines changed: 103 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@
4949
$simple{$code} = {
5050
Simple_Lowercase => ($simple_lowercase || $code),
5151
Simple_Titlecase => ($simple_titlecase || $code),
52-
Simple_Uppercase => ($simple_uppercase || $code)
52+
Simple_Uppercase => ($simple_uppercase || $code),
53+
Simple_Foldcase => $code,
5354
};
5455
}
5556
}
@@ -87,6 +88,7 @@
8788
my @lower = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[1]));
8889
my @title = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));
8990
my @upper = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[3]));
91+
my @fold = ();
9092
my @conditions = map {
9193
# supporting negated conditions may require storing a
9294
# mask of relevant conditions for a given rule to differentiate
@@ -101,6 +103,7 @@
101103
push @lower, $code if (scalar @lower == 0);
102104
push @title, $code if (scalar @title == 0);
103105
push @upper, $code if (scalar @upper == 0);
106+
push @fold, $code;
104107

105108
# none should map to more than 3 codepoints
106109
die "lowercase expansion for 0x$elts[0] exceeds maximum: '$elts[1]'"
@@ -114,13 +117,15 @@
114117
while (scalar @upper < $MAX_CASE_EXPANSION) { push @upper, 0x000000 }
115118
while (scalar @lower < $MAX_CASE_EXPANSION) { push @lower, 0x000000 }
116119
while (scalar @title < $MAX_CASE_EXPANSION) { push @title, 0x000000 }
120+
while (scalar @fold < $MAX_CASE_EXPANSION) { push @fold, 0x000000 }
117121

118122
# Characters with special mappings may not have simple mappings;
119123
# ensure that an entry exists.
120124
$simple{$code} ||= {
121125
Simple_Lowercase => $code,
122126
Simple_Titlecase => $code,
123-
Simple_Uppercase => $code
127+
Simple_Uppercase => $code,
128+
Simple_Foldcase => $code
124129
};
125130

126131
# Multiple special case rules for a single codepoint could be
@@ -135,11 +140,96 @@
135140
Lowercase => \@lower,
136141
Titlecase => \@title,
137142
Uppercase => \@upper,
143+
Foldcase => \@fold,
138144
Conditions => $cond_str
139145
};
140146
}
141147
close $FH;
142148

149+
# Read CaseFolding.txt and merge its case folding mappings into %simple
# and %special. Each data line has the form "<code>; <status>; <mapping>;".
# Status 'S' and 'C' entries supply the single-codepoint simple folding;
# status 'F' entries may fold to multiple codepoints and are stored as
# special mappings. Status 'T' is unsupported and skipped.
open($FH, '<', "$output_path/CaseFolding.txt")
  or die "Could not open $output_path/CaseFolding.txt: $!.";
while (my $line = <$FH>)
{
	# remove comments
	$line =~ s/^(.*?)#.*$/$1/s;

	# ignore empty lines
	next unless $line =~ /;/;

	my @elts = split(';', $line);
	my $code = hex($elts[0]);
	my $status = $elts[1] =~ s/^\s+|\s+$//rg;

	# Codepoint may map to multiple characters when folding. Split
	# each mapping on whitespace and extract the hexadecimal into an
	# array of codepoints. The pattern is anchored, matching the way the
	# lower/title/upper mappings are parsed earlier in this script, so
	# that only tokens made up entirely of hex digits are accepted.
	my @fold = map { hex $_ } (grep /^[0-9A-F]+$/, (split /\s+/, $elts[2]));

	die "codepoint $code out of range" if $code > 0x10FFFF;

	# status 'T' unsupported; skip
	next if $status eq 'T';

	# encountered unrecognized status type
	die "unsupported status type '$status'"
	  if $status ne 'S' && $status ne 'C' && $status ne 'F';

	# initialize simple case mappings if they don't exist
	$simple{$code} ||= {
		Simple_Lowercase => $code,
		Simple_Titlecase => $code,
		Simple_Uppercase => $code,
		Simple_Foldcase => $code
	};

	if ($status eq 'S' || $status eq 'C')
	{
		die
		  "Simple case folding for $code has multiple codepoints: '$line' '$elts[2]'"
		  if scalar @fold != 1;
		my $simple_foldcase = $fold[0];

		die "Simple_Foldcase $code out of range"
		  if $simple_foldcase > 0x10FFFF;

		$simple{$code}{Simple_Foldcase} = $simple_foldcase;
	}

	# A 'C' entry only needs a special mapping when the codepoint already
	# has one (so that its Foldcase slot is filled in); 'F' entries always
	# need one.
	if ($status eq 'F' || ($status eq 'C' && defined $special{$code}))
	{
		# none should map to more than $MAX_CASE_EXPANSION codepoints;
		# fail here with a useful message rather than at output time
		die sprintf(
			"case folding expansion for 0x%06x exceeds maximum: '%s'",
			$code, $elts[2])
		  if scalar @fold > $MAX_CASE_EXPANSION;

		# pad with zeros up to the fixed expansion width
		while (scalar @fold < $MAX_CASE_EXPANSION) { push @fold, 0x000000 }

		# initialize special case mappings if they don't exist, seeding
		# lower/title/upper from the simple mappings
		if (!defined $special{$code})
		{
			my @lower = ($simple{$code}{Simple_Lowercase});
			my @title = ($simple{$code}{Simple_Titlecase});
			my @upper = ($simple{$code}{Simple_Uppercase});
			while (scalar @lower < $MAX_CASE_EXPANSION)
			{
				push @lower, 0x000000;
			}
			while (scalar @title < $MAX_CASE_EXPANSION)
			{
				push @title, 0x000000;
			}
			while (scalar @upper < $MAX_CASE_EXPANSION)
			{
				push @upper, 0x000000;
			}
			$special{$code} = {
				Lowercase => \@lower,
				Titlecase => \@title,
				Uppercase => \@upper,
				Conditions => '0'
			};
		}

		$special{$code}{Foldcase} = \@fold;
	}
}
close $FH;
232+
143233
# assign sequential array indexes to the special mappings
144234
my $special_idx = 0;
145235
foreach my $code (sort { $a <=> $b } (keys %special))
@@ -202,6 +292,7 @@
202292
CaseLower = 0,
203293
CaseTitle = 1,
204294
CaseUpper = 2,
295+
CaseFold = 3,
205296
NCaseKind
206297
} CaseKind;
207298
@@ -232,14 +323,17 @@
232323
die if scalar @{ $special{$code}{Lowercase} } != $MAX_CASE_EXPANSION;
233324
die if scalar @{ $special{$code}{Titlecase} } != $MAX_CASE_EXPANSION;
234325
die if scalar @{ $special{$code}{Uppercase} } != $MAX_CASE_EXPANSION;
326+
die if scalar @{ $special{$code}{Foldcase} } != $MAX_CASE_EXPANSION;
235327
my $lower = join ", ",
236328
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Lowercase} });
237329
my $title = join ", ",
238330
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Titlecase} });
239331
my $upper = join ", ",
240332
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Uppercase} });
333+
my $fold = join ", ",
334+
(map { sprintf "0x%06x", $_ } @{ $special{$code}{Foldcase} });
241335
printf $OT "\t{0x%06x, %s, ", $code, $special{$code}{Conditions};
242-
printf $OT "{{%s}, {%s}, {%s}}},\n", $lower, $title, $upper;
336+
printf $OT "{{%s}, {%s}, {%s}, {%s}}},\n", $lower, $title, $upper, $fold;
243337
}
244338

245339
print $OT "\t{0, 0, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}}\n";
@@ -260,11 +354,13 @@
260354
my $lc = ($simple{$code}{Simple_Lowercase} || $code);
261355
my $tc = ($simple{$code}{Simple_Titlecase} || $code);
262356
my $uc = ($simple{$code}{Simple_Uppercase} || $code);
357+
my $fc = ($simple{$code}{Simple_Foldcase} || $code);
358+
263359
die "unexpected special case for code $code"
264360
if defined $special{$code};
265361
printf $OT
266-
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, NULL},\n",
267-
$code, $lc, $tc, $uc;
362+
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, NULL},\n",
363+
$code, $lc, $tc, $uc, $fc;
268364
}
269365
printf $OT "\n";
270366

@@ -280,8 +376,8 @@
280376
$special_case = sprintf "&special_case[%d]", $special{$code}{Index};
281377
}
282378
printf $OT
283-
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x}, %s},\n",
379+
"\t{0x%06x, {[CaseLower] = 0x%06x,[CaseTitle] = 0x%06x,[CaseUpper] = 0x%06x,[CaseFold] = 0x%06x}, %s},\n",
284380
$code, $map->{Simple_Lowercase}, $map->{Simple_Titlecase},
285-
$map->{Simple_Uppercase}, $special_case;
381+
$map->{Simple_Uppercase}, $map->{Simple_Foldcase}, $special_case;
286382
}
287383
print $OT "};\n";

src/common/unicode/meson.build

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ endif
1111

1212
# These files are part of the Unicode Character Database. Download them on
1313
# demand.
14-
foreach f : ['CompositionExclusions.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
14+
foreach f : ['CompositionExclusions.txt', 'CaseFolding.txt', 'DerivedCoreProperties.txt', 'DerivedNormalizationProps.txt', 'EastAsianWidth.txt', 'NormalizationTest.txt', 'PropList.txt', 'SpecialCasing.txt', 'UnicodeData.txt']
1515
url = unicode_baseurl.format(UNICODE_VERSION, f)
1616
target = custom_target(f,
1717
output: f,
@@ -26,7 +26,7 @@ update_unicode_targets = []
2626

2727
update_unicode_targets += \
2828
custom_target('unicode_case_table.h',
29-
input: [unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
29+
input: [unicode_data['CaseFolding.txt'], unicode_data['SpecialCasing.txt'], unicode_data['UnicodeData.txt']],
3030
output: ['unicode_case_table.h'],
3131
command: [
3232
perl, files('generate-unicode_case_table.pl'),

src/common/unicode_case.c

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,14 @@ unicode_uppercase_simple(pg_wchar code)
5151
return map ? map->simplemap[CaseUpper] : code;
5252
}
5353

54+
pg_wchar
55+
unicode_casefold_simple(pg_wchar code)
56+
{
57+
const pg_case_map *map = find_case_map(code);
58+
59+
return map ? map->simplemap[CaseFold] : code;
60+
}
61+
5462
/*
5563
* unicode_strlower()
5664
*
@@ -142,6 +150,30 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
142150
NULL);
143151
}
144152

153+
/*
154+
* unicode_strfold()
155+
*
156+
* Case fold src, and return the result length (not including terminating
157+
* NUL).
158+
*
159+
* String src must be encoded in UTF-8. If srclen < 0, src must be
160+
* NUL-terminated.
161+
*
162+
* Result string is stored in dst, truncating if larger than dstsize. If
163+
* dstsize is greater than the result length, dst will be NUL-terminated;
164+
* otherwise not.
165+
*
166+
* If dstsize is zero, dst may be NULL. This is useful for calculating the
167+
* required buffer size before allocating.
168+
*/
169+
size_t
170+
unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
171+
bool full)
172+
{
173+
return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
174+
NULL);
175+
}
176+
145177
/*
146178
* Implement Unicode Default Case Conversion algorithm.
147179
*

src/include/common/unicode_case.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,15 @@ typedef size_t (*WordBoundaryNext) (void *wbstate);
2121
pg_wchar unicode_lowercase_simple(pg_wchar code);
2222
pg_wchar unicode_titlecase_simple(pg_wchar code);
2323
pg_wchar unicode_uppercase_simple(pg_wchar code);
24+
pg_wchar unicode_casefold_simple(pg_wchar code);
2425
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
2526
ssize_t srclen, bool full);
2627
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
2728
ssize_t srclen, bool full,
2829
WordBoundaryNext wbnext, void *wbstate);
2930
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
3031
ssize_t srclen, bool full);
32+
size_t unicode_strfold(char *dst, size_t dstsize, const char *src,
33+
ssize_t srclen, bool full);
3134

3235
#endif /* UNICODE_CASE_H */

0 commit comments

Comments
 (0)