24
24
* Portions Copyright (c) 1994, Regents of the University of California
25
25
*
26
26
* IDENTIFICATION
27
- * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.158 2009/09/21 22:22:07 petere Exp $
27
+ * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.159 2009/09/22 23:52:53 petere Exp $
28
28
*
29
29
*-------------------------------------------------------------------------
30
30
*/
@@ -80,6 +80,9 @@ static void addlitchar(unsigned char ychar, base_yyscan_t yyscanner);
80
80
/*
 * Forward declarations of file-local helpers; their definitions are outside
 * this excerpt.  The three pg_wchar helpers at the end are called from the
 * <xe>/<xeu> Unicode-escape rules: judging by their names and call sites
 * they classify a value as the first or second half of a UTF-16 surrogate
 * pair and combine two halves into a single code point.
 */
static char *litbufdup (base_yyscan_t yyscanner);
81
81
static char *litbuf_udeescape (unsigned char escape, base_yyscan_t yyscanner);
82
82
static unsigned char unescape_single_char (unsigned char c, base_yyscan_t yyscanner);
83
+ static bool is_utf16_surrogate_first (pg_wchar c);
84
+ static bool is_utf16_surrogate_second (pg_wchar c);
85
+ static pg_wchar surrogate_pair_to_codepoint (pg_wchar first, pg_wchar second);
83
86
84
87
/*
 * Lets scanner code write yyerror(msg): the expansion forwards to
 * scanner_yyerror() and uses whatever `yyscanner` is in scope at the call
 * site, so callers must have a variable of that name available.
 * NOTE(review): as shown here there is whitespace between `yyerror` and
 * `(msg )`, which would define an object-like macro rather than a
 * function-like one -- presumably a rendering artifact; confirm against the
 * real file.
 */
#define yyerror (msg ) scanner_yyerror(msg, yyscanner)
85
88
@@ -97,6 +100,8 @@ static void check_escape_warning(base_yyscan_t yyscanner);
97
100
/* Column accessors of the generated scanner, declared here by hand. */
extern int base_yyget_column (yyscan_t yyscanner);
98
101
extern void base_yyset_column (int column_no, yyscan_t yyscanner);
99
102
103
/*
 * Prototype for addunicode(), which is defined at the end of the file.
 * NOTE(review): the scanner argument is declared as yyscan_t here but as
 * base_yyscan_t in the definition; this only compiles cleanly if both
 * typedefs name the same type -- confirm, and consider using one spelling.
 */
+ static void addunicode (pg_wchar c, yyscan_t yyscanner);
104
+
100
105
%}
101
106
102
107
%option reentrant
@@ -134,6 +139,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
134
139
* <xdolq> $foo$ quoted strings
135
140
* <xui> quoted identifier with Unicode escapes
136
141
* <xus> quoted string with Unicode escapes
142
+ * <xeu> Unicode surrogate pair in extended quoted string
137
143
*/
138
144
139
145
%x xb
@@ -145,6 +151,7 @@ extern void base_yyset_column(int column_no, yyscan_t yyscanner);
145
151
/*
 * Exclusive start conditions.  xeu is entered from <xe> once a Unicode
 * escape holding a first UTF-16 surrogate has been scanned; while in xeu
 * only a second Unicode escape is accepted, anything else raises an error.
 */
%x xdolq
146
152
%x xui
147
153
%x xus
154
+ %x xeu
148
155
149
156
/*
150
157
* In order to make the world safe for Windows and Mac clients as well as
@@ -223,6 +230,8 @@ xeinside [^\\']+
223
230
/*
 * Backslash-escape patterns used inside extended (xe) strings.
 *
 * xeunicode matches a backslash followed by either 'u' and exactly four hex
 * digits or 'U' and exactly eight hex digits.  xeunicodebad matches just the
 * backslash plus 'u' or 'U'; because flex prefers the longest match it only
 * wins when the required hex digits are missing, which is how malformed
 * Unicode escapes are detected.  xeescape also matches backslash-u (same
 * length as xeunicodebad), so the xeunicodebad rule must stay ahead of the
 * xeescape rule in the rules section for the tie to resolve in its favour.
 *
 * NOTE(review): the blanks inside the bracket expressions and repeat counts
 * look like rendering artifacts -- confirm against the real file.
 */
xeescape [\\ ][^ 0 -7 ]
224
231
xeoctesc [\\ ][0 -7 ]{1,3 }
225
232
xehexesc [\\ ]x[0 -9A -Fa -f ]{1,2 }
233
+ xeunicode [\\ ](u[0 -9A -Fa -f ]{4 }| U[0 -9A -Fa -f ]{8 })
234
+ xeunicodebad [\\ ]([uU ])
226
235
227
236
/* Extended quote
228
237
* xqdouble implements embedded quote, ''''
@@ -535,6 +544,45 @@ other .
535
544
<xe >{xeinside } {
536
545
    /* A run with no backslash or quote: pass the matched text to addlit() unchanged. */
addlit (yytext, yyleng, yyscanner);
537
546
}
547
+ <xe >{xeunicode } {
548
    /*
     * Unicode escape inside an extended string.  yytext+2 skips the
     * backslash and the u/U letter; the remainder is the 4 or 8 hex digits
     * guaranteed by the xeunicode pattern.
     */
+ pg_wchar c = strtoul (yytext+2 , NULL , 16 );
549
+
550
+ check_escape_warning (yyscanner);
551
+
    /*
     * A first surrogate cannot be emitted on its own: remember it in the
     * scanner state and switch to <xeu>, which insists on a second half.
     */
552
+ if (is_utf16_surrogate_first (c))
553
+ {
554
+ yyextra->utf16_first_part = c;
555
+ BEGIN (xeu);
556
+ }
    /* A second surrogate with no first half before it is an error. */
557
+ else if (is_utf16_surrogate_second (c))
558
+ yyerror (" invalid Unicode surrogate pair" );
    /* Anything else is handed to addunicode() as a complete value. */
559
+ else
560
+ addunicode (c, yyscanner);
561
+ }
562
+ <xeu >{xeunicode } {
563
    /*
     * Second escape of a surrogate pair; the first half was saved in
     * yyextra->utf16_first_part by the <xe> rule that entered this state.
     */
+ pg_wchar c = strtoul (yytext+2 , NULL , 16 );
564
+
    /*
     * NOTE(review): the statements after this check run unconditionally, so
     * this relies on yyerror() not returning -- confirm that
     * scanner_yyerror() always raises an error.
     */
565
+ if (!is_utf16_surrogate_second (c))
566
+ yyerror (" invalid Unicode surrogate pair" );
567
+
    /* Replace c with the value built from both halves, then emit it. */
568
+ c = surrogate_pair_to_codepoint (yyextra->utf16_first_part , c);
569
+
570
+ addunicode (c, yyscanner);
571
+
    /* Pair consumed: resume ordinary extended-string scanning. */
572
+ BEGIN (xe);
573
+ }
574
+ <xeu >. |
575
+ <xeu >\n |
576
+ <xeu ><<EOF>> { /* any other character, a newline, or end of input while a second surrogate is pending */ yyerror (" invalid Unicode surrogate pair" ); }
577
+
578
+ <xe >{xeunicodebad } {
579
    /*
     * Reached only when a backslash-u / backslash-U is not followed by the
     * hex digits xeunicode requires (the longer xeunicode match would win
     * otherwise).  Reports the malformed escape with a hint and the lexer's
     * error position.
     */
+ ereport (ERROR,
580
+ (errcode (ERRCODE_INVALID_ESCAPE_SEQUENCE),
581
+ errmsg (" invalid Unicode escape" ),
582
+ errhint (" Unicode escapes must be \\ uXXXX or \\ UXXXXXXXX." ),
583
+ lexer_errposition ()));
584
+ }
585
+
538
586
<xe >{xeescape } {
539
587
if (yytext[1 ] == ' \' ' )
540
588
{
@@ -1330,3 +1378,21 @@ base_yyfree(void *ptr, base_yyscan_t yyscanner)
1330
1378
if (ptr)
1331
1379
pfree (ptr);
1332
1380
}
1381
+
1382
/*
 * addunicode
 *		Validate the value c taken from a Unicode escape, convert it with
 *		unicode_to_utf8() and append the resulting bytes to the literal
 *		being built via addlit().
 *
 * c          value to append; 0 and anything above 0x10FFFF is rejected.
 * yyscanner  scanner handle; also used implicitly by the yyerror() and
 *            yyextra macros.
 *
 * Values above 0x7F are accepted only when GetDatabaseEncoding() reports
 * PG_UTF8, and they set yyextra->saw_non_ascii.
 *
 * NOTE(review): each check falls through to the next statement, so this
 * relies on yyerror() not returning -- confirm.  Surrogate values are not
 * screened here; the calling rules deal with them before calling in.
 */
+ static void
1383
+ addunicode (pg_wchar c, base_yyscan_t yyscanner)
1384
+ {
    /* Scratch space for the converted bytes of a single character. */
1385
+ char buf[8 ];
1386
+
1387
+ if (c == 0 || c > 0x10FFFF )
1388
+ yyerror (" invalid Unicode escape value" );
1389
+ if (c > 0x7F )
1390
+ {
1391
+ if (GetDatabaseEncoding () != PG_UTF8)
1392
+ yyerror (" Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8" );
1393
+ yyextra->saw_non_ascii = true ;
1394
+ }
    /*
     * buf is not NUL-terminated here; the number of bytes to append comes
     * from pg_mblen(buf).  Presumably that is derived from the lead byte
     * under the database encoding -- confirm.
     */
1395
+ unicode_to_utf8 (c, (unsigned char *)buf);
1396
+ addlit (buf, pg_mblen (buf), yyscanner);
1397
+ }
1398
+
0 commit comments