33
33
* Portions Copyright (c) 1994, Regents of the University of California
34
34
*
35
35
* IDENTIFICATION
36
- * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.28 2009/01/01 17:23:55 momjian Exp $
36
+ * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.29 2009/09/27 03:27:24 tgl Exp $
37
37
*
38
38
*-------------------------------------------------------------------------
39
39
*/
@@ -117,6 +117,7 @@ static void push_new_buffer(const char *newstr);
117
117
static YY_BUFFER_STATE prepare_buffer (const char *txt, int len,
118
118
char **txtcopy);
119
119
static void emit (const char *txt, int len);
120
+ static bool is_utf16_surrogate_first (uint32 c);
120
121
121
122
#define ECHO emit (yytext, yyleng)
122
123
@@ -158,6 +159,7 @@ static void emit(const char *txt, int len);
158
159
* <xdolq> $foo$ quoted strings
159
160
* <xui> quoted identifier with Unicode escapes
160
161
* <xus> quoted string with Unicode escapes
162
+ * <xeu> Unicode surrogate pair in extended quoted string
161
163
*/
162
164
163
165
%x xb
@@ -169,6 +171,7 @@ static void emit(const char *txt, int len);
169
171
%x xdolq
170
172
%x xui
171
173
%x xus
174
+ %x xeu
172
175
/* Additional exclusive states for psql only: lex backslash commands */
173
176
%x xslashcmd
174
177
%x xslasharg
@@ -192,6 +195,9 @@ static void emit(const char *txt, int len);
192
195
* did not end with a newline.
193
196
*
194
197
* XXX perhaps \f (formfeed) should be treated as a newline as well?
198
+ *
199
+ * XXX if you change the set of whitespace characters, fix scanner_isspace()
200
+ * to agree, and see also the plpgsql lexer.
195
201
*/
196
202
197
203
space [ \t\n\r\f]
@@ -253,6 +259,8 @@ xeinside [^\\']+
253
259
xeescape [\\ ][^0-7]
254
260
xeoctesc [\\ ][0-7]{1,3}
255
261
xehexesc [\\ ]x[0-9A-Fa-f]{1,2}
262
+ xeunicode [\\ ](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
263
+ xeunicodefail [\\ ](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
256
264
257
265
/* Extended quote
258
266
* xqdouble implements embedded quote, ''''
@@ -334,6 +342,10 @@ identifier {ident_start}{ident_cont}*
334
342
335
343
typecast " ::"
336
344
345
+ /* these two token types are used by PL/pgsql, though not in core SQL */
346
+ dot_dot \.\.
347
+ colon_equals " :="
348
+
337
349
/*
338
350
* "self" is the set of chars that should be returned as single-character
339
351
* tokens. "op_chars" is the set of chars that can make up "Op" tokens,
@@ -511,6 +523,22 @@ other .
511
523
<xe>{xeinside} {
512
524
ECHO;
513
525
}
526
+ <xe>{xeunicode} {
	/*
	 * Convert the hex digits that follow the two-character escape
	 * prefix (hence yytext+2) into a code value.
	 */
527
+ uint32 c = strtoul (yytext+2 , NULL , 16 );
528
+
	/*
	 * If the value is the first half of a UTF-16 surrogate pair, move
	 * to <xeu>, where the second escape of the pair is handled.  The
	 * escape text itself is echoed unchanged in either case.
	 */
529
+ if (is_utf16_surrogate_first (c))
530
+ BEGIN (xeu);
531
+ ECHO;
532
+ }
533
+ <xeu>{xeunicode} {
	/* A complete escape seen while in <xeu>: return to <xe> and echo it. */
534
+ BEGIN (xe);
535
+ ECHO;
536
+ }
537
+ <xeu>. { /* any other single character: echoed, start state unchanged */ ECHO; }
538
+ <xeu>\n { /* newline: echoed, start state unchanged */ ECHO; }
539
+ <xe,xeu>{xeunicodefail} {
	/*
	 * Escape with too few hex digits: echoed as-is in both states.
	 *
	 * NOTE(review): among the rules visible here, only a complete
	 * escape leaves <xeu>; the catch-all rules above echo without a
	 * BEGIN.  Confirm that remaining in <xeu> after a lone first
	 * surrogate is the intended behavior.
	 */
540
+ ECHO;
541
+ }
514
542
<xe>{xeescape} {
515
543
ECHO;
516
544
}
@@ -605,6 +633,14 @@ other .
605
633
ECHO;
606
634
}
607
635
636
+ {dot_dot} {
	/* Matched as one unit and passed through to the output unchanged. */
637
+ ECHO;
638
+ }
639
+
640
+ {colon_equals} {
	/* Matched as one unit and passed through to the output unchanged. */
641
+ ECHO;
642
+ }
643
+
608
644
/*
609
645
* These rules are specific to psql --- they implement parenthesis
610
646
* counting and detection of command-ending semicolon. These must
@@ -1690,3 +1726,9 @@ emit(const char *txt, int len)
1690
1726
}
1691
1727
}
1692
1728
}
1729
+
1730
/*
 * is_utf16_surrogate_first
 *
 * Returns true when c lies in the inclusive range 0xD800..0xDBFF, i.e.
 * when it is the first (high) half of a UTF-16 surrogate pair, and false
 * for every other value.
 */
+ static bool
1731
+ is_utf16_surrogate_first (uint32 c)
1732
+ {
1733
+ return (c >= 0xD800 && c <= 0xDBFF );
1734
+ }
0 commit comments