@@ -32,6 +32,7 @@ static void addchar(bool init, char s);
32
32
static int checkSpecialVal (void ); /* examine scanstring for the special value */
33
33
34
34
static void parseUnicode (char *s, int l); /* decode a run of '\u' escapes of total length l */
35
+ static void parseHexChars (char *s, int l); /* decode a run of '\xXX' escapes of total length l */
35
36
36
37
/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
37
38
#undef fprintf
@@ -62,12 +63,16 @@ fprintf_to_ereport(const char *fmt, const char *msg)
62
63
%x xQUOTED
63
64
%x xNONQUOTED
64
65
%x xVARQUOTED
66
+ %x xSINGLEQUOTED
65
67
%x xCOMMENT
66
68
67
69
special [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
68
- any [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \t\n\r\f ]
70
+ any [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \' \t\n\r\f ]
69
71
blank [ \t\n\r\f ]
70
- unicode \\ u[0-9A-Fa-f]{4}
72
+ hex_dig [0-9A-Fa-f]
73
+ unicode \\ u({hex_dig}{4}|\{{hex_dig}{1,6}\})
74
+ hex_char \\ x{hex_dig}{2}
75
+
71
76
72
77
%%
73
78
@@ -152,6 +157,11 @@ unicode \\u[0-9A-Fa-f]{4}
152
157
BEGIN xQUOTED;
153
158
}
154
159
160
+ <INITIAL>\' {
161
+ addchar(true, '\0 ');
162
+ BEGIN xSINGLEQUOTED;
163
+ }
164
+
155
165
<INITIAL>\\ {
156
166
yyless(0);
157
167
addchar(true, '\0 ');
@@ -174,7 +184,7 @@ unicode \\u[0-9A-Fa-f]{4}
174
184
BEGIN xCOMMENT;
175
185
}
176
186
177
- <xNONQUOTED>({special}|\" ) {
187
+ <xNONQUOTED>({special}|\" | \' ) {
178
188
yylval->str = scanstring;
179
189
yyless(0);
180
190
BEGIN INITIAL;
@@ -187,41 +197,56 @@ unicode \\u[0-9A-Fa-f]{4}
187
197
return checkSpecialVal();
188
198
}
189
199
190
- <xNONQUOTED,xQUOTED,xVARQUOTED>\\ [\"\\ ] { addchar(false, yytext[1]); }
200
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\ [\"\'\\ ] { addchar(false, yytext[1]); }
201
+
202
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\ b { addchar(false, '\b '); }
203
+
204
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\ f { addchar(false, '\f '); }
191
205
192
- <xNONQUOTED,xQUOTED,xVARQUOTED>\\ b { addchar(false, '\b '); }
206
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED >\\ n { addchar(false, '\n '); }
193
207
194
- <xNONQUOTED,xQUOTED,xVARQUOTED>\\ f { addchar(false, '\f '); }
208
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED >\\ r { addchar(false, '\r '); }
195
209
196
- <xNONQUOTED,xQUOTED,xVARQUOTED>\\ n { addchar(false, '\n '); }
210
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED >\\ t { addchar(false, '\t '); }
197
211
198
- <xNONQUOTED,xQUOTED,xVARQUOTED>\\ r { addchar(false, '\r '); }
212
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED >\\ v { addchar(false, '\v '); }
199
213
200
- <xNONQUOTED,xQUOTED,xVARQUOTED> \\ t { addchar(false, ' \t ' ); }
214
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>{unicode}+ { parseUnicode(yytext, yyleng ); }
201
215
202
- <xNONQUOTED,xQUOTED,xVARQUOTED>{unicode }+ { parseUnicode (yytext, yyleng); }
216
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>{hex_char }+ { parseHexChars (yytext, yyleng); }
203
217
204
- <xNONQUOTED,xQUOTED,xVARQUOTED>\\ u { yyerror(NULL, " Unicode sequence is invalid" ); }
218
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED >\\ x { yyerror(NULL, " Hex character sequence is invalid" ); }
205
219
206
- <xNONQUOTED,xQUOTED,xVARQUOTED>\\ . { yyerror(NULL, " Escape sequence is invalid" ); }
220
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED >\\ u { yyerror(NULL, " Unicode sequence is invalid" ); }
207
221
208
- <xNONQUOTED,xQUOTED,xVARQUOTED>\\ { yyerror(NULL, " Unexpected end after backslash " ); }
222
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED >\\ . { yyerror(NULL, " Escape sequence is invalid " ); }
209
223
210
- <xQUOTED,xVARQUOTED><<EOF>> { yyerror(NULL, " Unexpected end of quoted string" ); }
224
+ <xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\ { yyerror(NULL, " Unexpected end after backslash" ); }
225
+
226
+ <xQUOTED,xVARQUOTED,xSINGLEQUOTED><<EOF>> { yyerror(NULL, " Unexpected end of quoted string" ); }
211
227
212
228
<xQUOTED>\" {
213
229
yylval->str = scanstring;
214
230
BEGIN INITIAL;
215
231
return STRING_P;
216
232
}
217
- <xVARQUOTED>\" {
233
+
234
+ <xVARQUOTED>\" {
218
235
yylval->str = scanstring;
219
236
BEGIN INITIAL;
220
237
return VARIABLE_P;
221
238
}
222
239
240
+ <xSINGLEQUOTED>\' {
241
+ yylval->str = scanstring;
242
+ BEGIN INITIAL;
243
+ return STRING_P;
244
+ }
245
+
223
246
<xQUOTED,xVARQUOTED>[^\\\" ]+ { addstring(false, yytext, yyleng); }
224
247
248
+ <xSINGLEQUOTED>[^\\\' ]+ { addstring(false, yytext, yyleng); }
249
+
225
250
<INITIAL><<EOF>> { yyterminate(); }
226
251
227
252
<xCOMMENT>\*\/ { BEGIN INITIAL; }
@@ -436,95 +461,136 @@ hexval(char c)
436
461
return 0 ; /* not reached */
437
462
}
438
463
464
+ static void
465
+ addUnicodeChar (int ch)
466
+ {
467
+ /*
468
+ * For UTF8, replace the escape sequence by the actual
469
+ * utf8 character in lex->strval. Do this also for other
470
+ * encodings if the escape designates an ASCII character,
471
+ * otherwise raise an error.
472
+ */
473
+
474
+ if (ch == 0 )
475
+ {
476
+ /* We can't allow this, since our TEXT type doesn't */
477
+ ereport (ERROR,
478
+ (errcode (ERRCODE_UNTRANSLATABLE_CHARACTER),
479
+ errmsg (" unsupported Unicode escape sequence" ),
480
+ errdetail (" \\ u0000 cannot be converted to text." )));
481
+ }
482
+ else if (GetDatabaseEncoding () == PG_UTF8)
483
+ {
484
+ char utf8str[5 ];
485
+ int utf8len;
486
+
487
+ unicode_to_utf8 (ch, (unsigned char *) utf8str);
488
+ utf8len = pg_utf_mblen ((unsigned char *) utf8str);
489
+ addstring (false , utf8str, utf8len);
490
+ }
491
+ else if (ch <= 0x007f )
492
+ {
493
+ /*
494
+ * This is the only way to designate things like a
495
+ * form feed character in JSON, so it's useful in all
496
+ * encodings.
497
+ */
498
+ addchar (false , (char ) ch);
499
+ }
500
+ else
501
+ {
502
+ ereport (ERROR,
503
+ (errcode (ERRCODE_INVALID_TEXT_REPRESENTATION),
504
+ errmsg (" invalid input syntax for type jsonpath" ),
505
+ errdetail (" Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8." )));
506
+ }
507
+ }
508
+
509
+ static void
510
+ addUnicode (int ch, int *hi_surrogate)
511
+ {
512
+ if (ch >= 0xd800 && ch <= 0xdbff )
513
+ {
514
+ if (*hi_surrogate != -1 )
515
+ ereport (ERROR,
516
+ (errcode (ERRCODE_INVALID_TEXT_REPRESENTATION),
517
+ errmsg (" invalid input syntax for type jsonpath" ),
518
+ errdetail (" Unicode high surrogate must not follow a high surrogate." )));
519
+ *hi_surrogate = (ch & 0x3ff ) << 10 ;
520
+ return ;
521
+ }
522
+ else if (ch >= 0xdc00 && ch <= 0xdfff )
523
+ {
524
+ if (*hi_surrogate == -1 )
525
+ ereport (ERROR,
526
+ (errcode (ERRCODE_INVALID_TEXT_REPRESENTATION),
527
+ errmsg (" invalid input syntax for type jsonpath" ),
528
+ errdetail (" Unicode low surrogate must follow a high surrogate." )));
529
+ ch = 0x10000 + *hi_surrogate + (ch & 0x3ff );
530
+ *hi_surrogate = -1 ;
531
+ }
532
+ else if (*hi_surrogate != -1 )
533
+ {
534
+ ereport (ERROR,
535
+ (errcode (ERRCODE_INVALID_TEXT_REPRESENTATION),
536
+ errmsg (" invalid input syntax for type jsonpath" ),
537
+ errdetail (" Unicode low surrogate must follow a high surrogate." )));
538
+ }
539
+
540
+ addUnicodeChar (ch);
541
+ }
542
+
439
543
/*
440
544
* parseUnicode was adopted from json_lex_string() in
441
545
* src/backend/utils/adt/json.c
442
546
*/
443
547
static void
444
548
parseUnicode (char *s, int l)
445
549
{
446
- int i, j;
447
- int ch = 0 ;
448
- int hi_surrogate = -1 ;
449
-
450
- Assert (l % 6 /* \uXXXX */ == 0 );
550
+ int i;
551
+ int hi_surrogate = -1 ;
451
552
452
- for (i = 0 ; i < l / 6 ; i++)
553
+ for (i = 2 ; i < l; i += 2 ) /* skip '\u' */
453
554
{
454
- ch = 0 ;
455
-
456
- for (j=0 ; j<4 ; j++)
457
- ch = (ch << 4 ) | hexval (s[ i*6 + 2 + j]);
555
+ int ch = 0 ;
556
+ int j;
458
557
459
- if (ch >= 0xd800 && ch <= 0xdbff )
558
+ if (s[i] == ' { ' ) /* parse '\u{XX...}' */
460
559
{
461
- if (hi_surrogate != -1 )
462
- ereport (ERROR,
463
- (errcode (ERRCODE_INVALID_TEXT_REPRESENTATION),
464
- errmsg (" invalid input syntax for type jsonpath" ),
465
- errdetail (" Unicode high surrogate must not follow a high surrogate." )));
466
- hi_surrogate = (ch & 0x3ff ) << 10 ;
467
- continue ;
560
+ while (s[++i] != ' }' && i < l)
561
+ ch = (ch << 4 ) | hexval (s[i]);
562
+ i++; /* ski p '}' */
468
563
}
469
- else if (ch >= 0xdc00 && ch <= 0xdfff )
564
+ else /* parse '\uXXXX' */
470
565
{
471
- if (hi_surrogate == -1 )
472
- ereport (ERROR,
473
- (errcode (ERRCODE_INVALID_TEXT_REPRESENTATION),
474
- errmsg (" invalid input syntax for type jsonpath" ),
475
- errdetail (" Unicode low surrogate must follow a high surrogate." )));
476
- ch = 0x10000 + hi_surrogate + (ch & 0x3ff );
477
- hi_surrogate = -1 ;
566
+ for (j = 0 ; j < 4 && i < l; j++)
567
+ ch = (ch << 4 ) | hexval (s[i++]);
478
568
}
479
569
480
- if (hi_surrogate != -1 )
481
- ereport (ERROR,
482
- (errcode (ERRCODE_INVALID_TEXT_REPRESENTATION),
483
- errmsg (" invalid input syntax for type jsonpath" ),
484
- errdetail (" Unicode low surrogate must follow a high surrogate." )));
570
+ addUnicode (ch, &hi_surrogate);
571
+ }
485
572
486
- /*
487
- * For UTF8, replace the escape sequence by the actual
488
- * utf8 character in lex->strval. Do this also for other
489
- * encodings if the escape designates an ASCII character,
490
- * otherwise raise an error.
491
- */
573
+ if (hi_surrogate != -1 )
574
+ {
575
+ ereport (ERROR,
576
+ (errcode (ERRCODE_INVALID_TEXT_REPRESENTATION),
577
+ errmsg (" invalid input syntax for type jsonpath" ),
578
+ errdetail (" Unicode low surrogate must follow a high surrogate." )));
579
+ }
580
+ }
492
581
493
- if (ch == 0 )
494
- {
495
- /* We can't allow this, since our TEXT type doesn't */
496
- ereport (ERROR,
497
- (errcode (ERRCODE_UNTRANSLATABLE_CHARACTER),
498
- errmsg (" unsupported Unicode escape sequence" ),
499
- errdetail (" \\ u0000 cannot be converted to text." )));
500
- }
501
- else if (GetDatabaseEncoding () == PG_UTF8)
502
- {
503
- char utf8str[5 ];
504
- int utf8len;
582
/*
 * parseHexChars
 *		Decode a run of consecutive '\xXX' escapes.
 *
 * "s" points at the first backslash and "l" is the length of the run.
 * Every escape occupies exactly four bytes (backslash, 'x', two hex
 * digits), so l is expected to be a multiple of four.  Each decoded value
 * is handed to addUnicodeChar().
 */
static void
parseHexChars(char *s, int l)
{
	const char *esc = s;
	int			remaining = l / 4;

	Assert(l % 4 /* \xXX */ == 0);

	while (remaining-- > 0)
	{
		/* esc[0] is the backslash and esc[1] the 'x'; the digits follow. */
		int			hi = hexval(esc[2]);
		int			lo = hexval(esc[3]);

		addUnicodeChar((hi << 4) | lo);
		esc += 4;
	}
}
530
596
0 commit comments