Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 02faeb4

Browse files
committed
Surrogate pair support for U& string and identifier syntax
This is mainly to make the functionality consistent with the proposed \u escape syntax.
1 parent c6bc0fe commit 02faeb4

File tree

2 files changed

+81
-6
lines changed

2 files changed

+81
-6
lines changed

doc/src/sgml/syntax.sgml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.134 2009/08/27 20:08:02 tgl Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ -->
22

33
<chapter id="sql-syntax">
44
<title>SQL Syntax</title>
@@ -238,6 +238,10 @@ U&amp;"d!0061t!+000061" UESCAPE '!'
238238
The Unicode escape syntax works only when the server encoding is
239239
UTF8. When other server encodings are used, only code points in
240240
the ASCII range (up to <literal>\007F</literal>) can be specified.
241+
Both the 4-digit and the 6-digit form can be used to specify
242+
UTF-16 surrogate pairs to compose characters with code points
243+
larger than <literal>\FFFF</literal> (although the availability of
244+
the 6-digit form technically makes this unnecessary).
241245
</para>
242246

243247
<para>
@@ -497,6 +501,10 @@ U&amp;'d!0061t!+000061' UESCAPE '!'
497501
UTF8. When other server encodings are used, only code points in
498502
the ASCII range (up to <literal>\007F</literal>) can be
499503
specified.
504+
Both the 4-digit and the 6-digit form can be used to specify
505+
UTF-16 surrogate pairs to compose characters with code points
506+
larger than <literal>\FFFF</literal> (although the availability
507+
of the 6-digit form technically makes this unnecessary).
500508
</para>
501509

502510
<para>

src/backend/parser/scan.l

Lines changed: 72 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
* Portions Copyright (c) 1994, Regents of the University of California
2525
*
2626
* IDENTIFICATION
27-
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.157 2009/07/14 20:24:10 tgl Exp $
27+
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
2828
*
2929
*-------------------------------------------------------------------------
3030
*/
@@ -1097,11 +1097,30 @@ check_unicode_value(pg_wchar c, char *loc, base_yyscan_t yyscanner)
10971097
}
10981098
}
10991099

1100+
/*
 * is_utf16_surrogate_first
 *		Report whether c lies in the range 0xD800..0xDBFF, i.e. whether it is
 *		the first half of a UTF-16 surrogate pair.
 *
 * Returns true if c is in that range (bounds inclusive), false otherwise.
 */
static bool
1101+
is_utf16_surrogate_first(pg_wchar c)
1102+
{
1103+
return (c >= 0xD800 && c <= 0xDBFF);
1104+
}
1105+
1106+
/*
 * is_utf16_surrogate_second
 *		Report whether c lies in the range 0xDC00..0xDFFF, i.e. whether it is
 *		the second half of a UTF-16 surrogate pair.
 *
 * Returns true if c is in that range (bounds inclusive), false otherwise.
 */
static bool
1107+
is_utf16_surrogate_second(pg_wchar c)
1108+
{
1109+
return (c >= 0xDC00 && c <= 0xDFFF);
1110+
}
1111+
1112+
/*
 * surrogate_pair_to_codepoint
 *		Combine the two halves of a UTF-16 surrogate pair into one code point.
 *
 * The low 10 bits of "first" become the upper 10 bits of the offset and the
 * low 10 bits of "second" become the lower 10 bits; 0x10000 is then added.
 *
 * No range checking is done here: both arguments are simply masked with
 * 0x3FF.  Callers are expected to have validated the halves beforehand
 * (the visible call sites test is_utf16_surrogate_first() and
 * is_utf16_surrogate_second() before calling this).
 */
static pg_wchar
1113+
surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
1114+
{
1115+
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
1116+
}
1117+
11001118
static char *
11011119
litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
11021120
{
11031121
char *new;
11041122
char *litbuf, *in, *out;
1123+
pg_wchar pair_first = 0;
11051124

11061125
if (isxdigit(escape)
11071126
|| escape == '+'
@@ -1131,16 +1150,39 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
11311150
{
11321151
if (in[1] == escape)
11331152
{
1153+
if (pair_first)
1154+
{
1155+
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1156+
yyerror("invalid Unicode surrogate pair");
1157+
}
11341158
*out++ = escape;
11351159
in += 2;
11361160
}
11371161
else if (isxdigit(in[1]) && isxdigit(in[2]) && isxdigit(in[3]) && isxdigit(in[4]))
11381162
{
11391163
pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]);
11401164
check_unicode_value(unicode, in, yyscanner);
1141-
unicode_to_utf8(unicode, (unsigned char *) out);
1165+
if (pair_first)
1166+
{
1167+
if (is_utf16_surrogate_second(unicode))
1168+
{
1169+
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1170+
pair_first = 0;
1171+
}
1172+
else
1173+
{
1174+
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1175+
yyerror("invalid Unicode surrogate pair");
1176+
}
1177+
}
1178+
if (is_utf16_surrogate_first(unicode))
1179+
pair_first = unicode;
1180+
else
1181+
{
1182+
unicode_to_utf8(unicode, (unsigned char *) out);
1183+
out += pg_mblen(out);
1184+
}
11421185
in += 5;
1143-
out += pg_mblen(out);
11441186
}
11451187
else if (in[1] == '+'
11461188
&& isxdigit(in[2]) && isxdigit(in[3])
@@ -1150,9 +1192,27 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
11501192
pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16
11511193
+ hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]);
11521194
check_unicode_value(unicode, in, yyscanner);
1153-
unicode_to_utf8(unicode, (unsigned char *) out);
1195+
if (pair_first)
1196+
{
1197+
if (is_utf16_surrogate_second(unicode))
1198+
{
1199+
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1200+
pair_first = 0;
1201+
}
1202+
else
1203+
{
1204+
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1205+
yyerror("invalid Unicode surrogate pair");
1206+
}
1207+
}
1208+
if (is_utf16_surrogate_first(unicode))
1209+
pair_first = unicode;
1210+
else
1211+
{
1212+
unicode_to_utf8(unicode, (unsigned char *) out);
1213+
out += pg_mblen(out);
1214+
}
11541215
in += 8;
1155-
out += pg_mblen(out);
11561216
}
11571217
else
11581218
{
@@ -1161,7 +1221,14 @@ litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner)
11611221
}
11621222
}
11631223
else
1224+
{
1225+
if (pair_first)
1226+
{
1227+
ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1228+
yyerror("invalid Unicode surrogate pair");
1229+
}
11641230
*out++ = *in++;
1231+
}
11651232
}
11661233

11671234
*out = '\0';

0 commit comments

Comments
 (0)