Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 08294cb

Browse files
author
Nikita Glukhov
committed
Fix jsonpath escaping
1 parent d680b24 commit 08294cb

File tree

3 files changed

+187
-85
lines changed

3 files changed

+187
-85
lines changed

src/backend/utils/adt/jsonpath_scan.l

Lines changed: 151 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ static void addchar(bool init, char s);
3232
static int checkSpecialVal(void); /* examine scanstring for the special value */
3333

3434
static void parseUnicode(char *s, int l);
35+
static void parseHexChars(char *s, int l);
3536

3637
/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
3738
#undef fprintf
@@ -62,12 +63,16 @@ fprintf_to_ereport(const char *fmt, const char *msg)
6263
%x xQUOTED
6364
%x xNONQUOTED
6465
%x xVARQUOTED
66+
%x xSINGLEQUOTED
6567
%x xCOMMENT
6668

6769
special [\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/]
68-
any [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\" \t\n\r\f]
70+
any [^\?\%\$\.\[\]\{\}\(\)\|\&\!\=\<\>\@\#\,\*:\-\+\/\\\"\' \t\n\r\f]
6971
blank [ \t\n\r\f]
70-
unicode \\u[0-9A-Fa-f]{4}
72+
hex_dig [0-9A-Fa-f]
73+
unicode \\u({hex_dig}{4}|\{{hex_dig}{1,6}\})
74+
hex_char \\x{hex_dig}{2}
75+
7176
7277
%%
7378
@@ -152,6 +157,11 @@ unicode \\u[0-9A-Fa-f]{4}
152157
BEGIN xQUOTED;
153158
}
154159
160+
<INITIAL>\' {
161+
addchar(true, '\0');
162+
BEGIN xSINGLEQUOTED;
163+
}
164+
155165
<INITIAL>\\ {
156166
yyless(0);
157167
addchar(true, '\0');
@@ -174,7 +184,7 @@ unicode \\u[0-9A-Fa-f]{4}
174184
BEGIN xCOMMENT;
175185
}
176186
177-
<xNONQUOTED>({special}|\") {
187+
<xNONQUOTED>({special}|\"|\') {
178188
yylval->str = scanstring;
179189
yyless(0);
180190
BEGIN INITIAL;
@@ -187,41 +197,56 @@ unicode \\u[0-9A-Fa-f]{4}
187197
return checkSpecialVal();
188198
}
189199
190-
<xNONQUOTED,xQUOTED,xVARQUOTED>\\[\"\\] { addchar(false, yytext[1]); }
200+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\[\"\'\\] { addchar(false, yytext[1]); }
201+
202+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\b { addchar(false, '\b'); }
203+
204+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\f { addchar(false, '\f'); }
191205
192-
<xNONQUOTED,xQUOTED,xVARQUOTED>\\b { addchar(false, '\b'); }
206+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\n { addchar(false, '\n'); }
193207
194-
<xNONQUOTED,xQUOTED,xVARQUOTED>\\f { addchar(false, '\f'); }
208+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\r { addchar(false, '\r'); }
195209
196-
<xNONQUOTED,xQUOTED,xVARQUOTED>\\n { addchar(false, '\n'); }
210+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\t { addchar(false, '\t'); }
197211
198-
<xNONQUOTED,xQUOTED,xVARQUOTED>\\r { addchar(false, '\r'); }
212+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\v { addchar(false, '\v'); }
199213
200-
<xNONQUOTED,xQUOTED,xVARQUOTED>\\t { addchar(false, '\t'); }
214+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>{unicode}+ { parseUnicode(yytext, yyleng); }
201215
202-
<xNONQUOTED,xQUOTED,xVARQUOTED>{unicode}+ { parseUnicode(yytext, yyleng); }
216+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>{hex_char}+ { parseHexChars(yytext, yyleng); }
203217
204-
<xNONQUOTED,xQUOTED,xVARQUOTED>\\u { yyerror(NULL, "Unicode sequence is invalid"); }
218+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\x { yyerror(NULL, "Hex character sequence is invalid"); }
205219
206-
<xNONQUOTED,xQUOTED,xVARQUOTED>\\. { yyerror(NULL, "Escape sequence is invalid"); }
220+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\u { yyerror(NULL, "Unicode sequence is invalid"); }
207221
208-
<xNONQUOTED,xQUOTED,xVARQUOTED>\\ { yyerror(NULL, "Unexpected end after backslash"); }
222+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\. { yyerror(NULL, "Escape sequence is invalid"); }
209223
210-
<xQUOTED,xVARQUOTED><<EOF>> { yyerror(NULL, "Unexpected end of quoted string"); }
224+
<xNONQUOTED,xQUOTED,xVARQUOTED,xSINGLEQUOTED>\\ { yyerror(NULL, "Unexpected end after backslash"); }
225+
226+
<xQUOTED,xVARQUOTED,xSINGLEQUOTED><<EOF>> { yyerror(NULL, "Unexpected end of quoted string"); }
211227
212228
<xQUOTED>\" {
213229
yylval->str = scanstring;
214230
BEGIN INITIAL;
215231
return STRING_P;
216232
}
217-
<xVARQUOTED>\" {
233+
234+
<xVARQUOTED>\" {
218235
yylval->str = scanstring;
219236
BEGIN INITIAL;
220237
return VARIABLE_P;
221238
}
222239
240+
<xSINGLEQUOTED>\' {
241+
yylval->str = scanstring;
242+
BEGIN INITIAL;
243+
return STRING_P;
244+
}
245+
223246
<xQUOTED,xVARQUOTED>[^\\\"]+ { addstring(false, yytext, yyleng); }
224247
248+
<xSINGLEQUOTED>[^\\\']+ { addstring(false, yytext, yyleng); }
249+
225250
<INITIAL><<EOF>> { yyterminate(); }
226251
227252
<xCOMMENT>\*\/ { BEGIN INITIAL; }
@@ -436,95 +461,136 @@ hexval(char c)
436461
return 0; /* not reached */
437462
}
438463

464+
static void
465+
addUnicodeChar(int ch)
466+
{
467+
/*
468+
* For UTF8, replace the escape sequence by the actual
469+
* utf8 character in lex->strval. Do this also for other
470+
* encodings if the escape designates an ASCII character,
471+
* otherwise raise an error.
472+
*/
473+
474+
if (ch == 0)
475+
{
476+
/* We can't allow this, since our TEXT type doesn't */
477+
ereport(ERROR,
478+
(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
479+
errmsg("unsupported Unicode escape sequence"),
480+
errdetail("\\u0000 cannot be converted to text.")));
481+
}
482+
else if (GetDatabaseEncoding() == PG_UTF8)
483+
{
484+
char utf8str[5];
485+
int utf8len;
486+
487+
unicode_to_utf8(ch, (unsigned char *) utf8str);
488+
utf8len = pg_utf_mblen((unsigned char *) utf8str);
489+
addstring(false, utf8str, utf8len);
490+
}
491+
else if (ch <= 0x007f)
492+
{
493+
/*
494+
* This is the only way to designate things like a
495+
* form feed character in JSON, so it's useful in all
496+
* encodings.
497+
*/
498+
addchar(false, (char) ch);
499+
}
500+
else
501+
{
502+
ereport(ERROR,
503+
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
504+
errmsg("invalid input syntax for type jsonpath"),
505+
errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.")));
506+
}
507+
}
508+
509+
static void
510+
addUnicode(int ch, int *hi_surrogate)
511+
{
512+
if (ch >= 0xd800 && ch <= 0xdbff)
513+
{
514+
if (*hi_surrogate != -1)
515+
ereport(ERROR,
516+
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
517+
errmsg("invalid input syntax for type jsonpath"),
518+
errdetail("Unicode high surrogate must not follow a high surrogate.")));
519+
*hi_surrogate = (ch & 0x3ff) << 10;
520+
return;
521+
}
522+
else if (ch >= 0xdc00 && ch <= 0xdfff)
523+
{
524+
if (*hi_surrogate == -1)
525+
ereport(ERROR,
526+
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
527+
errmsg("invalid input syntax for type jsonpath"),
528+
errdetail("Unicode low surrogate must follow a high surrogate.")));
529+
ch = 0x10000 + *hi_surrogate + (ch & 0x3ff);
530+
*hi_surrogate = -1;
531+
}
532+
else if (*hi_surrogate != -1)
533+
{
534+
ereport(ERROR,
535+
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
536+
errmsg("invalid input syntax for type jsonpath"),
537+
errdetail("Unicode low surrogate must follow a high surrogate.")));
538+
}
539+
540+
addUnicodeChar(ch);
541+
}
542+
439543
/*
440544
* parseUnicode was adopted from json_lex_string() in
441545
* src/backend/utils/adt/json.c
442546
*/
443547
static void
444548
parseUnicode(char *s, int l)
445549
{
446-
int i, j;
447-
int ch = 0;
448-
int hi_surrogate = -1;
449-
450-
Assert(l % 6 /* \uXXXX */ == 0);
550+
int i;
551+
int hi_surrogate = -1;
451552

452-
for(i = 0; i < l / 6; i++)
553+
for (i = 2; i < l; i += 2) /* skip '\u' */
453554
{
454-
ch = 0;
455-
456-
for(j=0; j<4; j++)
457-
ch = (ch << 4) | hexval(s[ i*6 + 2 + j]);
555+
int ch = 0;
556+
int j;
458557

459-
if (ch >= 0xd800 && ch <= 0xdbff)
558+
if (s[i] == '{') /* parse '\u{XX...}' */
460559
{
461-
if (hi_surrogate != -1)
462-
ereport(ERROR,
463-
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
464-
errmsg("invalid input syntax for type jsonpath"),
465-
errdetail("Unicode high surrogate must not follow a high surrogate.")));
466-
hi_surrogate = (ch & 0x3ff) << 10;
467-
continue;
560+
while (s[++i] != '}' && i < l)
561+
ch = (ch << 4) | hexval(s[i]);
562+
i++; /* skip '}' */
468563
}
469-
else if (ch >= 0xdc00 && ch <= 0xdfff)
564+
else /* parse '\uXXXX' */
470565
{
471-
if (hi_surrogate == -1)
472-
ereport(ERROR,
473-
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
474-
errmsg("invalid input syntax for type jsonpath"),
475-
errdetail("Unicode low surrogate must follow a high surrogate.")));
476-
ch = 0x10000 + hi_surrogate + (ch & 0x3ff);
477-
hi_surrogate = -1;
566+
for (j = 0; j < 4 && i < l; j++)
567+
ch = (ch << 4) | hexval(s[i++]);
478568
}
479569

480-
if (hi_surrogate != -1)
481-
ereport(ERROR,
482-
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
483-
errmsg("invalid input syntax for type jsonpath"),
484-
errdetail("Unicode low surrogate must follow a high surrogate.")));
570+
addUnicode(ch, &hi_surrogate);
571+
}
485572

486-
/*
487-
* For UTF8, replace the escape sequence by the actual
488-
* utf8 character in lex->strval. Do this also for other
489-
* encodings if the escape designates an ASCII character,
490-
* otherwise raise an error.
491-
*/
573+
if (hi_surrogate != -1)
574+
{
575+
ereport(ERROR,
576+
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
577+
errmsg("invalid input syntax for type jsonpath"),
578+
errdetail("Unicode low surrogate must follow a high surrogate.")));
579+
}
580+
}
492581

493-
if (ch == 0)
494-
{
495-
/* We can't allow this, since our TEXT type doesn't */
496-
ereport(ERROR,
497-
(errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
498-
errmsg("unsupported Unicode escape sequence"),
499-
errdetail("\\u0000 cannot be converted to text.")));
500-
}
501-
else if (GetDatabaseEncoding() == PG_UTF8)
502-
{
503-
char utf8str[5];
504-
int utf8len;
582+
static void
583+
parseHexChars(char *s, int l)
584+
{
585+
int i;
505586

506-
unicode_to_utf8(ch, (unsigned char *) utf8str);
507-
utf8len = pg_utf_mblen((unsigned char *) utf8str);
508-
addstring(false, utf8str, utf8len);
509-
}
510-
else if (ch <= 0x007f)
511-
{
512-
/*
513-
* This is the only way to designate things like a
514-
* form feed character in JSON, so it's useful in all
515-
* encodings.
516-
*/
517-
addchar(false, (char) ch);
518-
}
519-
else
520-
{
521-
ereport(ERROR,
522-
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
523-
errmsg("invalid input syntax for type jsonpath"),
524-
errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8.")));
525-
}
587+
Assert(l % 4 /* \xXX */ == 0);
588+
589+
for (i = 0; i < l / 4; i++)
590+
{
591+
int ch = (hexval(s[i * 4 + 2]) << 4) | hexval(s[i * 4 + 3]);
526592

527-
hi_surrogate = -1;
593+
addUnicodeChar(ch);
528594
}
529595
}
530596

src/test/regress/expected/jsonpath.out

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,36 @@ select '$.a/+-1'::jsonpath;
147147
($."a" / -1)
148148
(1 row)
149149

150+
select '"\b\f\r\n\t\v\"\''\\"'::jsonpath;
151+
jsonpath
152+
-------------------------
153+
"\b\f\r\n\t\u000b\"'\\"
154+
(1 row)
155+
156+
select '''\b\f\r\n\t\v\"\''\\'''::jsonpath;
157+
jsonpath
158+
-------------------------
159+
"\b\f\r\n\t\u000b\"'\\"
160+
(1 row)
161+
162+
select '"\x50\u0067\u{53}\u{051}\u{00004C}"'::jsonpath;
163+
jsonpath
164+
----------
165+
"PgSQL"
166+
(1 row)
167+
168+
select '''\x50\u0067\u{53}\u{051}\u{00004C}'''::jsonpath;
169+
jsonpath
170+
----------
171+
"PgSQL"
172+
(1 row)
173+
174+
select '$.foo\x50\u0067\u{53}\u{051}\u{00004C}\t\"bar'::jsonpath;
175+
jsonpath
176+
---------------------
177+
$."fooPgSQL\t\"bar"
178+
(1 row)
179+
150180
select '$.g ? ($.a == 1)'::jsonpath;
151181
jsonpath
152182
--------------------

src/test/regress/sql/jsonpath.sql

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@ select '$-1'::jsonpath;
2626
select '$--+1'::jsonpath;
2727
select '$.a/+-1'::jsonpath;
2828

29+
select '"\b\f\r\n\t\v\"\''\\"'::jsonpath;
30+
select '''\b\f\r\n\t\v\"\''\\'''::jsonpath;
31+
select '"\x50\u0067\u{53}\u{051}\u{00004C}"'::jsonpath;
32+
select '''\x50\u0067\u{53}\u{051}\u{00004C}'''::jsonpath;
33+
select '$.foo\x50\u0067\u{53}\u{051}\u{00004C}\t\"bar'::jsonpath;
34+
2935
select '$.g ? ($.a == 1)'::jsonpath;
3036
select '$.g ? (@ == 1)'::jsonpath;
3137
select '$.g ? (.a == 1)'::jsonpath;

0 commit comments

Comments
 (0)