Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit c2bb037

Browse files
committed
Unicode escapes in E'...' strings
Author: Marko Kreen <markokr@gmail.com>
1 parent 9048b73 commit c2bb037

File tree

3 files changed

+98
-9
lines changed

3 files changed

+98
-9
lines changed

doc/src/sgml/syntax.sgml

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.135 2009/09/21 22:22:07 petere Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/syntax.sgml,v 1.136 2009/09/22 23:52:53 petere Exp $ -->
22

33
<chapter id="sql-syntax">
44
<title>SQL Syntax</title>
@@ -398,6 +398,14 @@ SELECT 'foo' 'bar';
398398
</entry>
399399
<entry>hexadecimal byte value</entry>
400400
</row>
401+
<row>
402+
<entry>
403+
<literal>\u<replaceable>xxxx</replaceable></literal>,
404+
<literal>\U<replaceable>xxxxxxxx</replaceable></literal>
405+
(<replaceable>x</replaceable> = 0 - 9, A - F)
406+
</entry>
407+
<entry>16- or 32-bit hexadecimal Unicode character value</entry>
408+
</row>
401409
</tbody>
402410
</tgroup>
403411
</table>
@@ -411,13 +419,25 @@ SELECT 'foo' 'bar';
411419
</para>
412420

413421
<para>
414-
It is your responsibility that the byte sequences you create are
422+
It is your responsibility that the byte sequences you create,
423+
especially when using the octal or hexadecimal escapes, compose
415424
valid characters in the server character set encoding. When the
416-
server encoding is UTF-8, then the alternative Unicode escape
417-
syntax, explained in <xref linkend="sql-syntax-strings-uescape">,
418-
should be used instead. (The alternative would be doing the
419-
UTF-8 encoding by hand and writing out the bytes, which would be
420-
very cumbersome.)
425+
server encoding is UTF-8, then the Unicode escapes or the
426+
alternative Unicode escape syntax, explained
427+
in <xref linkend="sql-syntax-strings-uescape">, should be used
428+
instead. (The alternative would be doing the UTF-8 encoding by
429+
hand and writing out the bytes, which would be very cumbersome.)
430+
</para>
431+
432+
<para>
433+
The Unicode escape syntax works fully only when the server
434+
encoding is UTF-8. When other server encodings are used, only
435+
code points in the ASCII range (up to <literal>\u007F</>) can be
436+
specified. Both the 4-digit and the 8-digit form can be used to
437+
specify UTF-16 surrogate pairs to compose characters with code
438+
points larger than <literal>\uFFFF</literal> (although the
439+
availability of the 8-digit form technically makes this
440+
unnecessary).
421441
</para>
422442

423443
<caution>

src/backend/parser/scan.l

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
* Portions Copyright (c) 1994, Regents of the University of California
2525
*
2626
* IDENTIFICATION
27-
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
27+
* $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.159 2009/09/22 23:52:53 petere Exp $
2828
*
2929
*-------------------------------------------------------------------------
3030
*/
@@ -80,6 +80,9 @@ static void addlitchar(unsigned char ychar, base_yyscan_t yyscanner);
8080
static char *litbufdup(base_yyscan_t yyscanner);
8181
static char *litbuf_udeescape(unsigned char escape, base_yyscan_t yyscanner);
8282
static unsigned char unescape_single_char(unsigned char c, base_yyscan_t yyscanner);
83+
static bool is_utf16_surrogate_first(pg_wchar c);
84+
static bool is_utf16_surrogate_second(pg_wchar c);
85+
static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
8386

8487
#define yyerror(msg) scanner_yyerror(msg, yyscanner)
8588

@@ -97,6 +100,8 @@ static void check_escape_warning(base_yyscan_t yyscanner);
97100
extern int base_yyget_column(yyscan_t yyscanner);
98101
extern void base_yyset_column(int column_no, yyscan_t yyscanner);
99102

103+
static void addunicode(pg_wchar c, yyscan_t yyscanner);
104+
100105
%}
101106

102107
%option reentrant
@@ -134,6 +139,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
134139
* <xdolq> $foo$ quoted strings
135140
* <xui> quoted identifier with Unicode escapes
136141
* <xus> quoted string with Unicode escapes
142+
* <xeu> Unicode surrogate pair in extended quoted string
137143
*/
138144

139145
%x xb
@@ -145,6 +151,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
145151
%x xdolq
146152
%x xui
147153
%x xus
154+
%x xeu
148155

149156
/*
150157
* In order to make the world safe for Windows and Mac clients as well as
@@ -223,6 +230,8 @@ xeinside [^\\']+
223230
xeescape [\\][^0-7]
224231
xeoctesc [\\][0-7]{1,3}
225232
xehexesc [\\]x[0-9A-Fa-f]{1,2}
233+
xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
234+
xeunicodebad [\\]([uU])
226235

227236
/* Extended quote
228237
* xqdouble implements embedded quote, ''''
@@ -535,6 +544,45 @@ other .
535544
<xe>{xeinside} {
536545
addlit(yytext, yyleng, yyscanner);
537546
}
547+
<xe>{xeunicode} {
/*
 * Well-formed Unicode escape (backslash, u + 4 hex digits or
 * U + 8 hex digits) inside an extended quoted string.
 * yytext+2 skips the backslash and the u/U letter, so strtoul
 * parses only the hexadecimal digits.
 */
548+
pg_wchar c = strtoul(yytext+2, NULL, 16);
549+
550+
check_escape_warning(yyscanner);
551+
552+
/*
 * First half of a UTF-16 surrogate pair: nothing is emitted yet.
 * The value is saved and the scanner switches to <xeu>, where the
 * second half is expected.
 */
if (is_utf16_surrogate_first(c))
553+
{
554+
yyextra->utf16_first_part = c;
555+
BEGIN(xeu);
556+
}
557+
/* A second half with no preceding first half is rejected. */
else if (is_utf16_surrogate_second(c))
558+
yyerror("invalid Unicode surrogate pair");
559+
/* Any other value is handed to addunicode() as a complete code point. */
else
560+
addunicode(c, yyscanner);
561+
}
562+
<xeu>{xeunicode} {
/*
 * Reached only after a first surrogate half was saved in
 * yyextra->utf16_first_part; this escape must be the second half.
 */
563+
pg_wchar c = strtoul(yytext+2, NULL, 16);
564+
565+
/*
 * NOTE(review): the code below relies on yyerror() not returning;
 * otherwise an invalid value would still be combined -- confirm.
 */
if (!is_utf16_surrogate_second(c))
566+
yyerror("invalid Unicode surrogate pair");
567+
568+
/* Merge both halves into one code point and emit it. */
c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
569+
570+
addunicode(c, yyscanner);
571+
572+
/* Pair complete: resume normal extended-string scanning. */
BEGIN(xe);
573+
}
	/*
	 * Anything other than a Unicode escape while in <xeu> (any
	 * character, a newline, or end of input) is reported as an
	 * invalid surrogate pair.
	 */
574+
<xeu>. |
575+
<xeu>\n |
576+
<xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); }
577+
578+
<xe>{xeunicodebad} {
/*
 * xeunicodebad matches just a backslash followed by u or U.
 * Because flex prefers the longest match, this rule fires only
 * when the full xeunicode pattern (4 or 8 hex digits) did not
 * match, i.e. the escape is malformed. Raise an error with a
 * hint describing the accepted forms.
 */
579+
ereport(ERROR,
580+
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
581+
errmsg("invalid Unicode escape"),
582+
errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
583+
lexer_errposition()));
584+
}
585+
538586
<xe>{xeescape} {
539587
if (yytext[1] == '\'')
540588
{
@@ -1330,3 +1378,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner)
13301378
if (ptr)
13311379
pfree(ptr);
13321380
}
1381+
1382+
/*
 * addunicode
 *		Validate the Unicode code point c, convert it with
 *		unicode_to_utf8() and pass the resulting bytes to addlit().
 *
 * Errors are reported through yyerror() when c is zero or greater than
 * 0x10FFFF, and when c is above 0x7F while GetDatabaseEncoding() is not
 * PG_UTF8. For any accepted c above 0x7F, yyextra->saw_non_ascii is set.
 *
 * NOTE(review): the checks assume yyerror() does not return -- confirm.
 */
static void
1383+
addunicode(pg_wchar c, base_yyscan_t yyscanner)
1384+
{
1385+
/* scratch buffer that receives the converted bytes of c */
char buf[8];
1386+
1387+
/* zero and values beyond the Unicode range are never acceptable */
if (c == 0 || c > 0x10FFFF)
1388+
yyerror("invalid Unicode escape value");
1389+
if (c > 0x7F)
1390+
{
1391+
/* non-ASCII code points are accepted only with a UTF8 database encoding */
if (GetDatabaseEncoding() != PG_UTF8)
1392+
yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1393+
yyextra->saw_non_ascii = true;
1394+
}
1395+
unicode_to_utf8(c, (unsigned char *)buf);
1396+
/* pg_mblen() supplies the byte length of the character just written to buf */
addlit(buf, pg_mblen(buf), yyscanner);
1397+
}
1398+

src/include/parser/gramparse.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
1212
* Portions Copyright (c) 1994, Regents of the University of California
1313
*
14-
* $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.47 2009/07/14 20:24:10 tgl Exp $
14+
* $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.48 2009/09/22 23:52:53 petere Exp $
1515
*
1616
*-------------------------------------------------------------------------
1717
*/
@@ -71,6 +71,9 @@ typedef struct base_yy_extra_type
7171
int xcdepth; /* depth of nesting in slash-star comments */
7272
char *dolqstart; /* current $foo$ quote start string */
7373

74+
/* first part of UTF16 surrogate pair for Unicode escapes */
75+
int32 utf16_first_part;
76+
7477
/* state variables for literal-lexing warnings */
7578
bool warn_on_first_escape;
7679
bool saw_non_ascii;

0 commit comments

Comments
 (0)