Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 2dbbf33

Browse files
committed
Add seven kanji characters defined in the Windows 950 codepage to our
big5/win950 <-> UTF8 conversion tables. Per report by Roger Chang.
1 parent 8e1a8fe commit 2dbbf33

File tree

4 files changed

+195
-5
lines changed

4 files changed

+195
-5
lines changed
+177
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#! /usr/bin/perl
2+
#
3+
# Copyright (c) 2001-2009, PostgreSQL Global Development Group
4+
#
5+
# $PostgreSQL: pgsql/src/backend/utils/mb/Unicode/UCS_to_big5.pl,v 1.1 2009/03/18 16:17:58 heikki Exp $
6+
#
7+
# Generate UTF-8 <--> BIG5 conversion tables from
8+
# map files provided by the Unicode organization.
9+
# Unfortunately it is prohibited by the organization
10+
# to distribute the map files. So if you try to use this script,
11+
# you have to obtain the map files from the organization's ftp site.
12+
# ftp://www.unicode.org/Public/MAPPINGS/
13+
#
14+
# Our "big5" comes from BIG5.TXT, with the addition of the characters
15+
# in the range 0xf9d6-0xf9dc from CP950.TXT.
16+
#
17+
# BIG5.TXT format:
18+
# BIG5 code in hex
19+
# UCS-2 code in hex
20+
# # and Unicode name (not used in this script)
21+
#
22+
# CP950.TXT format:
23+
# CP950 code in hex
24+
# UCS-2 code in hex
25+
# # and Unicode name (not used in this script)
26+
27+
28+
require "ucs2utf.pl";
29+
30+
31+
#
32+
# first, generate UTF8 --> BIG5 table
33+
#
34+
# Read BIG5.TXT and fill %array with one entry per mapping, keyed by the
# value ucs2utf() returns for the Unicode code point and holding the BIG5
# code as the value.  $count tracks the number of entries stored.
$in_file = "BIG5.TXT";

open( FILE, $in_file ) || die( "cannot open $in_file: $!" );

# Start from an empty table and an explicit zero count, so the array size
# printed later is a number even if no line qualifies.
%array = ();
$count = 0;

while( <FILE> ){
	# chomp removes only a trailing newline; chop would remove the last
	# hex digit of a final line that is not newline-terminated.
	chomp;
	next if /^#/;

	( $c, $u, $rest ) = split;
	$ucs = hex($u);
	$code = hex($c);

	# Only mappings where both sides are >= 0x80 are stored.
	next unless $code >= 0x80 && $ucs >= 0x0080;

	$utf = &ucs2utf($ucs);
	if( exists $array{ $utf } ){
		# The first mapping seen for a given UTF-8 value wins.
		printf STDERR "Warning: duplicate UTF8: %04x\n",$ucs;
		next;
	}
	$count++;
	$array{ $utf } = $code;
}
close( FILE );
59+
60+
# Read CP950.TXT and add its mappings for codes 0xf9d6 - 0xf9dc to %array
# (keyed by the ucs2utf() value, holding the code), continuing the running
# entry count in $count.
$in_file = "CP950.TXT";

open( FILE, $in_file ) || die( "cannot open $in_file: $!" );

while( <FILE> ){
	# chomp removes only a trailing newline; chop would remove the last
	# hex digit of a final line that is not newline-terminated.
	chomp;
	next if /^#/;

	( $c, $u, $rest ) = split;
	$ucs = hex($u);
	$code = hex($c);

	# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
	# from CP950.TXT.  The range test already implies $code >= 0x80.
	next unless $code >= 0xf9d6 && $code <= 0xf9dc && $ucs >= 0x0080;

	$utf = &ucs2utf($ucs);
	if( exists $array{ $utf } ){
		# An entry already stored for this UTF-8 value is kept.
		printf STDERR "Warning: duplicate UTF8: %04x\n",$ucs;
		next;
	}
	$count++;
	$array{ $utf } = $code;
}
close( FILE );
87+
88+
# Write utf8_to_big5.map: a C array declaration sized by $count, followed
# by one {key, value} initializer per %array entry in ascending numeric
# key order.
$file = "utf8_to_big5.map";
open( FILE, "> $file" ) or die( "cannot open $file" );
print FILE "static pg_utf_to_local ULmapBIG5[ $count ] = {\n";

# $count is counted down as entries are written; the entry that brings it
# to zero is emitted without a trailing comma.
foreach $utf_key ( sort { $a <=> $b } keys %array ){
	$code = $array{ $utf_key };
	$count -= 1;
	$separator = ( $count == 0 ) ? "" : ",";
	printf FILE " {0x%04x, 0x%04x}%s\n", $utf_key, $code, $separator;
}

print FILE "};\n";
close( FILE );
104+
105+
#
106+
# then generate BIG5 --> UTF8 table
107+
#
108+
# Read BIG5.TXT again and fill %array in the opposite direction: keyed by
# BIG5 code, holding the value ucs2utf() returns for the Unicode code
# point.  $count tracks the number of entries stored.
$in_file = "BIG5.TXT";

open( FILE, $in_file ) || die( "cannot open $in_file: $!" );

# Start from an empty table and an explicit zero count.
%array = ();
$count = 0;

while( <FILE> ){
	# chomp removes only a trailing newline; chop would remove the last
	# hex digit of a final line that is not newline-terminated.
	chomp;
	next if /^#/;

	( $c, $u, $rest ) = split;
	$ucs = hex($u);
	$code = hex($c);

	# Only mappings where both sides are >= 0x80 are stored.
	next unless $code >= 0x80 && $ucs >= 0x0080;

	$utf = &ucs2utf($ucs);

	# This table is keyed by BIG5 code, so the code is what must be
	# unique.  Looking up $utf here would test an unrelated key: it would
	# miss a repeated code and could reject a line whose UTF-8 value
	# happens to equal a code already stored.  Several codes mapping to
	# the same UTF-8 value are allowed in this direction.
	if( exists $array{ $code } ){
		printf STDERR "Warning: duplicate BIG5: %04x\n",$code;
		next;
	}
	$count++;
	$array{ $code } = $utf;
}
close( FILE );
133+
134+
# Read CP950.TXT and add its mappings for codes 0xf9d6 - 0xf9dc to %array
# (keyed by code, holding the ucs2utf() value), continuing the running
# entry count in $count.
$in_file = "CP950.TXT";

open( FILE, $in_file ) || die( "cannot open $in_file: $!" );

while( <FILE> ){
	# chomp removes only a trailing newline; chop would remove the last
	# hex digit of a final line that is not newline-terminated.
	chomp;
	next if /^#/;

	( $c, $u, $rest ) = split;
	$ucs = hex($u);
	$code = hex($c);

	# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
	# from CP950.TXT.  The range test already implies $code >= 0x80.
	next unless $code >= 0xf9d6 && $code <= 0xf9dc && $ucs >= 0x0080;

	$utf = &ucs2utf($ucs);

	# The table is keyed by code, so the duplicate test must look up
	# $code; an entry already stored for this code is kept.
	if( exists $array{ $code } ){
		printf STDERR "Warning: duplicate BIG5: %04x\n",$code;
		next;
	}
	$count++;
	$array{ $code } = $utf;
}
close( FILE );
161+
162+
# Write big5_to_utf8.map: a C array declaration sized by $count, followed
# by one {key, value} initializer per %array entry in ascending numeric
# key order.
$file = "big5_to_utf8.map";
open( FILE, "> $file" ) or die( "cannot open $file" );
print FILE "static pg_local_to_utf LUmapBIG5[ $count ] = {\n";

# $count is counted down as entries are written; the entry that brings it
# to zero is emitted without a trailing comma.
foreach $big5_key ( sort { $a <=> $b } keys %array ){
	$utf = $array{ $big5_key };
	$count -= 1;
	$separator = ( $count == 0 ) ? "" : ",";
	printf FILE " {0x%04x, 0x%04x}%s\n", $big5_key, $utf, $separator;
}

print FILE "};\n";
close( FILE );
177+

src/backend/utils/mb/Unicode/UCS_to_most.pl

+1-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#
33
# Copyright (c) 2001-2009, PostgreSQL Global Development Group
44
#
5-
# $PostgreSQL: pgsql/src/backend/utils/mb/Unicode/UCS_to_most.pl,v 1.7 2009/02/10 19:29:39 petere Exp $
5+
# $PostgreSQL: pgsql/src/backend/utils/mb/Unicode/UCS_to_most.pl,v 1.8 2009/03/18 16:17:28 heikki Exp $
66
#
77
# Generate UTF-8 <--> character code conversion tables from
88
# map files provided by Unicode organization.
@@ -47,7 +47,6 @@
4747
'GBK' => 'CP936.TXT',
4848
'UHC' => 'CP949.TXT',
4949
'JOHAB' => 'JOHAB.TXT',
50-
'BIG5' => 'BIG5.TXT',
5150
);
5251

5352
@charsets = keys(filename);

src/backend/utils/mb/Unicode/big5_to_utf8.map

+9-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
static pg_local_to_utf LUmapBIG5[ 13710 ] = {
1+
static pg_local_to_utf LUmapBIG5[ 13717 ] = {
22
{0xa140, 0xe38080},
33
{0xa141, 0xefbc8c},
44
{0xa142, 0xe38081},
@@ -13708,5 +13708,12 @@ static pg_local_to_utf LUmapBIG5[ 13710 ] = {
1370813708
{0xf9d2, 0xe9baa4},
1370913709
{0xf9d3, 0xe9bdbe},
1371013710
{0xf9d4, 0xe9bd89},
13711-
{0xf9d5, 0xe9be98}
13711+
{0xf9d5, 0xe9be98},
13712+
{0xf9d6, 0xe7a281},
13713+
{0xf9d7, 0xe98ab9},
13714+
{0xf9d8, 0xe8a38f},
13715+
{0xf9d9, 0xe5a2bb},
13716+
{0xf9da, 0xe68192},
13717+
{0xf9db, 0xe7b2a7},
13718+
{0xf9dc, 0xe5abba}
1371213719
};

src/backend/utils/mb/Unicode/utf8_to_big5.map

+8-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
static pg_utf_to_local ULmapBIG5[ 13704 ] = {
1+
static pg_utf_to_local ULmapBIG5[ 13711 ] = {
22
{0xc2a2, 0xa246},
33
{0xc2a3, 0xa247},
44
{0xc2a5, 0xa244},
@@ -2140,6 +2140,7 @@ static pg_utf_to_local ULmapBIG5[ 13704 ] = {
21402140
{0xe5a2b1, 0xe54c},
21412141
{0xe5a2b3, 0xbc58},
21422142
{0xe5a2ba, 0xe94d},
2143+
{0xe5a2bb, 0xf9d9},
21432144
{0xe5a2bc, 0xe94f},
21442145
{0xe5a2bd, 0xe94a},
21452146
{0xe5a2be, 0xbec1},
@@ -2508,6 +2509,7 @@ static pg_utf_to_local ULmapBIG5[ 13704 ] = {
25082509
{0xe5abb7, 0xe557},
25092510
{0xe5abb8, 0xe55a},
25102511
{0xe5abb9, 0xe55c},
2512+
{0xe5abba, 0xf9dc},
25112513
{0xe5abbb, 0xbc5f},
25122514
{0xe5abbd, 0xe556},
25132515
{0xe5abbf, 0xe554},
@@ -3370,6 +3372,7 @@ static pg_utf_to_local ULmapBIG5[ 13704 ] = {
33703372
{0xe6818c, 0xcec9},
33713373
{0xe6818d, 0xabe9},
33723374
{0xe68190, 0xaea3},
3375+
{0xe68192, 0xf9da},
33733376
{0xe68193, 0xcec5},
33743377
{0xe68194, 0xcec1},
33753378
{0xe68195, 0xaea4},
@@ -7482,6 +7485,7 @@ static pg_utf_to_local ULmapBIG5[ 13704 ] = {
74827485
{0xe7a1be, 0xe2f0},
74837486
{0xe7a1bf, 0xb851},
74847487
{0xe7a280, 0xdef0},
7488+
{0xe7a281, 0xf9d6},
74857489
{0xe7a283, 0xdeed},
74867490
{0xe7a284, 0xdee8},
74877491
{0xe7a285, 0xdeea},
@@ -8142,6 +8146,7 @@ static pg_utf_to_local ULmapBIG5[ 13704 ] = {
81428146
{0xe7b2a2, 0xdae7},
81438147
{0xe7b2a3, 0xd6e1},
81448148
{0xe7b2a5, 0xb5b0},
8149+
{0xe7b2a7, 0xf9db},
81458150
{0xe7b2a8, 0xdae9},
81468151
{0xe7b2af, 0xdf56},
81478152
{0xe7b2b1, 0xb864},
@@ -10208,6 +10213,7 @@ static pg_utf_to_local ULmapBIG5[ 13704 ] = {
1020810213
{0xe8a38c, 0xdff8},
1020910214
{0xe8a38d, 0xdff3},
1021010215
{0xe8a38e, 0xdff4},
10216+
{0xe8a38f, 0xf9d8},
1021110217
{0xe8a390, 0xdff9},
1021210218
{0xe8a392, 0xb8cf},
1021310219
{0xe8a394, 0xb8c7},
@@ -11806,6 +11812,7 @@ static pg_utf_to_local ULmapBIG5[ 13704 ] = {
1180611812
{0xe98ab5, 0xe8a1},
1180711813
{0xe98ab6, 0xe867},
1180811814
{0xe98ab7, 0xbe50},
11815+
{0xe98ab9, 0xf9d7},
1180911816
{0xe98abb, 0xbe4f},
1181011817
{0xe98abc, 0xbe56},
1181111818
{0xe98b80, 0xe865},

0 commit comments

Comments
 (0)