Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 80f8eb7

Browse files
committed
Use perfect hash for NFC and NFKC Unicode Normalization quick check
This makes the normalization quick check about 30% faster for NFC and 50% faster for NFKC than the binary search used previously. The hash lookup reuses the existing array of bit fields used for the binary search to get the quick check property and is generated as part of "make update-unicode" in src/common/unicode/. Author: John Naylor Reviewed-by: Mark Dilger, Michael Paquier Discussion: https://postgr.es/m/CACPNZCt4fbJ0_bGrN5QPt34N4whv=mszM0LMVQdoa2rC9UMRXA@mail.gmail.com
1 parent 85d08b8 commit 80f8eb7

File tree

5 files changed

+1681
-24
lines changed

5 files changed

+1681
-24
lines changed

src/common/unicode/generate-unicode_normprops_table.pl

+39-2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
use strict;
1010
use warnings;
1111

12+
use FindBin;
13+
use lib "$FindBin::RealBin/../../tools/";
14+
use PerfectHash;
15+
1216
my %data;
1317

1418
print
@@ -18,13 +22,25 @@
1822
#include "common/unicode_norm.h"
1923
2024
/*
21-
* We use a bit field here to save space.
25+
* Normalization quick check entry for codepoint. We use a bit field
26+
* here to save space.
2227
*/
2328
typedef struct
2429
{
2530
unsigned int codepoint:21;
2631
signed int quickcheck:4; /* really UnicodeNormalizationQC */
27-
} pg_unicode_normprops;
32+
} pg_unicode_normprops;
33+
34+
/* Typedef for hash function on quick check table */
35+
typedef int (*qc_hash_func) (const void *key);
36+
37+
/* Information for quick check lookup with perfect hash function */
38+
typedef struct
39+
{
40+
const pg_unicode_normprops *normprops;
41+
qc_hash_func hash;
42+
int num_normprops;
43+
} pg_unicode_norminfo;
2844
EOS
2945

3046
foreach my $line (<ARGV>)
@@ -66,6 +82,7 @@
6682
"static const pg_unicode_normprops UnicodeNormProps_${prop}[] = {\n";
6783

6884
my %subdata = %{ $data{$prop} };
85+
my @cp_packed;
6986
foreach my $cp (sort { $a <=> $b } keys %subdata)
7087
{
7188
my $qc;
@@ -82,7 +99,27 @@
8299
die;
83100
}
84101
printf "\t{0x%04X, %s},\n", $cp, $qc;
102+
103+
# Save the bytes as a string in network order.
104+
push @cp_packed, pack('N', $cp);
85105
}
86106

87107
print "};\n";
108+
109+
# Emit the definition of the perfect hash function.
110+
my $funcname = $prop . '_hash_func';
111+
my $f = PerfectHash::generate_hash_function(\@cp_packed, $funcname,
112+
fixed_key_length => 4);
113+
printf "\n/* Perfect hash function for %s */", $prop;
114+
print "\nstatic $f\n";
115+
116+
# Emit the structure that wraps the hash lookup information into
117+
# one variable.
118+
printf "/* Hash lookup information for %s */", $prop;
119+
printf "\nstatic const pg_unicode_norminfo ";
120+
printf "UnicodeNormInfo_%s = {\n", $prop;
121+
printf "\tUnicodeNormProps_%s,\n", $prop;
122+
printf "\t%s,\n", $funcname;
123+
printf "\t%d\n", scalar @cp_packed;
124+
printf "};\n";
88125
}

src/common/unicode_norm.c

+27-21
Original file line numberDiff line numberDiff line change
@@ -465,15 +465,32 @@ get_canonical_class(pg_wchar ch)
465465
return entry->comb_class;
466466
}
467467

468-
static int
469-
qc_compare(const void *p1, const void *p2)
468+
static const pg_unicode_normprops *
469+
qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
470470
{
471-
uint32 v1,
472-
v2;
471+
int h;
472+
uint32 hashkey;
473473

474-
v1 = ((const pg_unicode_normprops *) p1)->codepoint;
475-
v2 = ((const pg_unicode_normprops *) p2)->codepoint;
476-
return (v1 - v2);
474+
/*
475+
* Compute the hash function. The hash key is the codepoint with the bytes
476+
* in network order.
477+
*/
478+
hashkey = htonl(ch);
479+
h = norminfo->hash(&hashkey);
480+
481+
/* An out-of-range result implies no match */
482+
if (h < 0 || h >= norminfo->num_normprops)
483+
return NULL;
484+
485+
/*
486+
* Since it's a perfect hash, we need only match to the specific codepoint
487+
* it identifies.
488+
*/
489+
if (ch != norminfo->normprops[h].codepoint)
490+
return NULL;
491+
492+
/* Success! */
493+
return &norminfo->normprops[h];
477494
}
478495

479496
/*
@@ -482,26 +499,15 @@ qc_compare(const void *p1, const void *p2)
482499
static UnicodeNormalizationQC
483500
qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
484501
{
485-
pg_unicode_normprops key;
486-
pg_unicode_normprops *found = NULL;
487-
488-
key.codepoint = ch;
502+
const pg_unicode_normprops *found = NULL;
489503

490504
switch (form)
491505
{
492506
case UNICODE_NFC:
493-
found = bsearch(&key,
494-
UnicodeNormProps_NFC_QC,
495-
lengthof(UnicodeNormProps_NFC_QC),
496-
sizeof(pg_unicode_normprops),
497-
qc_compare);
507+
found = qc_hash_lookup(ch, &UnicodeNormInfo_NFC_QC);
498508
break;
499509
case UNICODE_NFKC:
500-
found = bsearch(&key,
501-
UnicodeNormProps_NFKC_QC,
502-
lengthof(UnicodeNormProps_NFKC_QC),
503-
sizeof(pg_unicode_normprops),
504-
qc_compare);
510+
found = qc_hash_lookup(ch, &UnicodeNormInfo_NFKC_QC);
505511
break;
506512
default:
507513
Assert(false);

0 commit comments

Comments
 (0)