Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 7f380c5

Browse files
committed
Reduce size of backend scanner's tables.
Previously, the core scanner's yy_transition[] array had 37045 elements. Since that number is larger than INT16_MAX, Flex generated the array to contain 32-bit integers. By reimplementing some of the bulkier scanner rules, this patch reduces the array to 20495 elements. The much smaller total length, combined with the consequent use of 16-bit integers for the array elements, reduces the binary size by over 200kB. This was accomplished in two ways: 1. Consolidate handling of quote continuations into a new start condition, rather than duplicating that logic for five different string types. 2. Treat Unicode strings and identifiers followed by a UESCAPE sequence as three separate tokens, rather than one. The logic to de-escape Unicode strings is moved to the filter code in parser.c, which already had the ability to provide special processing for token sequences. While we could have implemented the conversion in the grammar, that approach was rejected for performance and maintainability reasons. Performance in microbenchmarks of raw parsing seems equal or slightly faster in most cases, and it's reasonable to expect that in real-world usage (with more competition for the CPU cache) there will be a larger win. The exception is UESCAPE sequences; lexing those is about 10% slower, primarily because the scanner now has to be called three times rather than one. This seems acceptable since that feature is very rarely used. The psql and ecpg lexers are likewise modified, primarily because we want to keep them all in sync. Since those lexers don't use the space-hogging -CF option, the space savings is much less, but it's still good for perhaps 10kB apiece. While at it, merge the ecpg lexer's handling of C-style comments used in SQL and in C. Those have different rules regarding nested comments, but since we already have the ability to keep track of the previous start condition, we can use that to handle both cases within a single start condition. 
This matches the core scanner more closely. John Naylor Discussion: https://postgr.es/m/CACPNZCvaoa3EgVWm5yZhcSTX6RAtaLgniCPcBVOCwm8h3xpWkw@mail.gmail.com
1 parent 259bbe1 commit 7f380c5

File tree

19 files changed

+671
-619
lines changed

19 files changed

+671
-619
lines changed

src/backend/parser/gram.y

+7-3
Original file line numberDiff line numberDiff line change
@@ -598,10 +598,13 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
598598
* the set of keywords. PL/pgSQL depends on this so that it can share the
599599
* same lexer. If you add/change tokens here, fix PL/pgSQL to match!
600600
*
601+
* UIDENT and USCONST are reduced to IDENT and SCONST in parser.c, so that
602+
* they need no productions here; but we must assign token codes to them.
603+
*
601604
* DOT_DOT is unused in the core SQL grammar, and so will always provoke
602605
* parse errors. It is needed by PL/pgSQL.
603606
*/
604-
%token <str> IDENT FCONST SCONST BCONST XCONST Op
607+
%token <str> IDENT UIDENT FCONST SCONST USCONST BCONST XCONST Op
605608
%token <ival> ICONST PARAM
606609
%token TYPECAST DOT_DOT COLON_EQUALS EQUALS_GREATER
607610
%token LESS_EQUALS GREATER_EQUALS NOT_EQUALS
@@ -691,8 +694,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
691694
TREAT TRIGGER TRIM TRUE_P
692695
TRUNCATE TRUSTED TYPE_P TYPES_P
693696

694-
UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNLOGGED
695-
UNTIL UPDATE USER USING
697+
UESCAPE UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN
698+
UNLISTEN UNLOGGED UNTIL UPDATE USER USING
696699

697700
VACUUM VALID VALIDATE VALIDATOR VALUE_P VALUES VARCHAR VARIADIC VARYING
698701
VERBOSE VERSION_P VIEW VIEWS VOLATILE
@@ -15374,6 +15377,7 @@ unreserved_keyword:
1537415377
| TRUSTED
1537515378
| TYPE_P
1537615379
| TYPES_P
15380+
| UESCAPE
1537715381
| UNBOUNDED
1537815382
| UNCOMMITTED
1537915383
| UNENCRYPTED

src/backend/parser/parser.c

+281-1
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,14 @@
2121

2222
#include "postgres.h"
2323

24+
#include "mb/pg_wchar.h"
2425
#include "parser/gramparse.h"
2526
#include "parser/parser.h"
27+
#include "parser/scansup.h"
28+
29+
static bool check_uescapechar(unsigned char escape);
30+
static char *str_udeescape(const char *str, char escape,
31+
int position, core_yyscan_t yyscanner);
2632

2733

2834
/*
@@ -75,6 +81,10 @@ raw_parser(const char *str)
7581
* scanner backtrack, which would cost more performance than this filter
7682
* layer does.
7783
*
84+
* We also use this filter to convert UIDENT and USCONST sequences into
85+
* plain IDENT and SCONST tokens. While that could be handled by additional
86+
* productions in the main grammar, it's more efficient to do it like this.
87+
*
7888
* The filter also provides a convenient place to translate between
7989
* the core_YYSTYPE and YYSTYPE representations (which are really the
8090
* same thing anyway, but notationally they're different).
@@ -104,7 +114,7 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
104114
* If this token isn't one that requires lookahead, just return it. If it
105115
* does, determine the token length. (We could get that via strlen(), but
106116
* since we have such a small set of possibilities, hardwiring seems
107-
* feasible and more efficient.)
117+
* feasible and more efficient --- at least for the fixed-length cases.)
108118
*/
109119
switch (cur_token)
110120
{
@@ -117,6 +127,10 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
117127
case WITH:
118128
cur_token_length = 4;
119129
break;
130+
case UIDENT:
131+
case USCONST:
132+
cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
133+
break;
120134
default:
121135
return cur_token;
122136
}
@@ -190,7 +204,273 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
190204
break;
191205
}
192206
break;
207+
208+
case UIDENT:
209+
case USCONST:
210+
/* Look ahead for UESCAPE */
211+
if (next_token == UESCAPE)
212+
{
213+
/* Yup, so get third token, which had better be SCONST */
214+
const char *escstr;
215+
216+
/* Again save and restore *llocp */
217+
cur_yylloc = *llocp;
218+
219+
/* Un-truncate current token so errors point to third token */
220+
*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
221+
222+
/* Get third token */
223+
next_token = core_yylex(&(yyextra->lookahead_yylval),
224+
llocp, yyscanner);
225+
226+
/* If we throw error here, it will point to third token */
227+
if (next_token != SCONST)
228+
scanner_yyerror("UESCAPE must be followed by a simple string literal",
229+
yyscanner);
230+
231+
escstr = yyextra->lookahead_yylval.str;
232+
if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
233+
scanner_yyerror("invalid Unicode escape character",
234+
yyscanner);
235+
236+
/* Now restore *llocp; errors will point to first token */
237+
*llocp = cur_yylloc;
238+
239+
/* Apply Unicode conversion */
240+
lvalp->core_yystype.str =
241+
str_udeescape(lvalp->core_yystype.str,
242+
escstr[0],
243+
*llocp,
244+
yyscanner);
245+
246+
/*
247+
* We don't need to revert the un-truncation of UESCAPE. What
248+
* we do want to do is clear have_lookahead, thereby consuming
249+
* all three tokens.
250+
*/
251+
yyextra->have_lookahead = false;
252+
}
253+
else
254+
{
255+
/* No UESCAPE, so convert using default escape character */
256+
lvalp->core_yystype.str =
257+
str_udeescape(lvalp->core_yystype.str,
258+
'\\',
259+
*llocp,
260+
yyscanner);
261+
}
262+
263+
if (cur_token == UIDENT)
264+
{
265+
/* It's an identifier, so truncate as appropriate */
266+
truncate_identifier(lvalp->core_yystype.str,
267+
strlen(lvalp->core_yystype.str),
268+
true);
269+
cur_token = IDENT;
270+
}
271+
else if (cur_token == USCONST)
272+
{
273+
cur_token = SCONST;
274+
}
275+
break;
193276
}
194277

195278
return cur_token;
196279
}
280+
281+
/*
 * Convert a single hexadecimal digit character to its numeric value (0..15).
 *
 * Accepts '0'-'9', 'a'-'f' and 'A'-'F'.  The caller should already have
 * verified that "c" is a hex digit; any other character is reported with
 * elog(ERROR, ...).
 */
282+
static unsigned int
283+
hexval(unsigned char c)
284+
{
285+
if (c >= '0' && c <= '9')
286+
return c - '0';
287+
if (c >= 'a' && c <= 'f')
288+
return c - 'a' + 0xA;
289+
if (c >= 'A' && c <= 'F')
290+
return c - 'A' + 0xA;
291+
elog(ERROR, "invalid hexadecimal digit");
292+
return 0; /* not reached */
293+
}
294+
295+
/* is Unicode code point acceptable in database's encoding? */
296+
static void
297+
check_unicode_value(pg_wchar c, int pos, core_yyscan_t yyscanner)
298+
{
299+
/* See also addunicode() in scan.l */
300+
if (c == 0 || c > 0x10FFFF)
301+
ereport(ERROR,
302+
(errcode(ERRCODE_SYNTAX_ERROR),
303+
errmsg("invalid Unicode escape value"),
304+
scanner_errposition(pos, yyscanner)));
305+
306+
if (c > 0x7F && GetDatabaseEncoding() != PG_UTF8)
307+
ereport(ERROR,
308+
(errcode(ERRCODE_SYNTAX_ERROR),
309+
errmsg("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"),
310+
scanner_errposition(pos, yyscanner)));
311+
}
312+
313+
/* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
314+
static bool
315+
check_uescapechar(unsigned char escape)
316+
{
317+
if (isxdigit(escape)
318+
|| escape == '+'
319+
|| escape == '\''
320+
|| escape == '"'
321+
|| scanner_isspace(escape))
322+
return false;
323+
else
324+
return true;
325+
}
326+
327+
/*
328+
* Process Unicode escapes in "str", producing a palloc'd plain string
329+
*
330+
* escape: the escape character to use
331+
* position: start position of U&'' or U&"" string token
332+
* yyscanner: context information needed for error reports
*
* Escape forms handled below (E stands for the escape character):
*   EE        - a literal escape character
*   EXXXX     - code point given as four hexadecimal digits
*   E+XXXXXX  - code point given as six hexadecimal digits
* An escape character followed by anything else is reported as
* "invalid Unicode escape value".
*
* A first (leading) UTF-16 surrogate is held in pair_first and must be
* followed immediately by an escape yielding a second (trailing) surrogate;
* any other sequence is reported as "invalid Unicode surrogate pair".
*
* Returns the newly allocated, NUL-terminated result string.  All failures
* are reported via ereport(ERROR) with ERRCODE_SYNTAX_ERROR.
333+
*/
334+
static char *
335+
str_udeescape(const char *str, char escape,
336+
int position, core_yyscan_t yyscanner)
337+
{
338+
const char *in;
339+
char *new,
340+
*out;
341+
/* nonzero while a first surrogate is waiting for its second half */
pg_wchar pair_first = 0;
342+
343+
/*
344+
* This relies on the subtle assumption that a UTF-8 expansion cannot be
345+
* longer than its escaped representation.
346+
*/
347+
new = palloc(strlen(str) + 1);
348+
349+
in = str;
350+
out = new;
351+
while (*in)
352+
{
353+
if (in[0] == escape)
354+
{
355+
/* doubled escape character: emit one literal escape character */
if (in[1] == escape)
356+
{
357+
if (pair_first)
358+
goto invalid_pair;
359+
*out++ = escape;
360+
in += 2;
361+
}
362+
/* four-hex-digit form; && short-circuits at the terminating NUL */
else if (isxdigit((unsigned char) in[1]) &&
363+
isxdigit((unsigned char) in[2]) &&
364+
isxdigit((unsigned char) in[3]) &&
365+
isxdigit((unsigned char) in[4]))
366+
{
367+
pg_wchar unicode;
368+
369+
unicode = (hexval(in[1]) << 12) +
370+
(hexval(in[2]) << 8) +
371+
(hexval(in[3]) << 4) +
372+
hexval(in[4]);
373+
check_unicode_value(unicode,
374+
in - str + position + 3, /* 3 for U&" */
375+
yyscanner);
376+
if (pair_first)
377+
{
378+
/* a pending first surrogate must be completed right now */
if (is_utf16_surrogate_second(unicode))
379+
{
380+
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
381+
pair_first = 0;
382+
}
383+
else
384+
goto invalid_pair;
385+
}
386+
/* a second surrogate with no preceding first surrogate is invalid */
else if (is_utf16_surrogate_second(unicode))
387+
goto invalid_pair;
388+
389+
/* hold back a first surrogate; otherwise emit the code point */
if (is_utf16_surrogate_first(unicode))
390+
pair_first = unicode;
391+
else
392+
{
393+
unicode_to_utf8(unicode, (unsigned char *) out);
394+
/* presumably pg_mblen() gives the byte length of the character just written -- confirm */
out += pg_mblen(out);
395+
}
396+
in += 5;
397+
}
398+
/* '+' followed by six hex digits; same processing as the four-digit form */
else if (in[1] == '+' &&
399+
isxdigit((unsigned char) in[2]) &&
400+
isxdigit((unsigned char) in[3]) &&
401+
isxdigit((unsigned char) in[4]) &&
402+
isxdigit((unsigned char) in[5]) &&
403+
isxdigit((unsigned char) in[6]) &&
404+
isxdigit((unsigned char) in[7]))
405+
{
406+
pg_wchar unicode;
407+
408+
unicode = (hexval(in[2]) << 20) +
409+
(hexval(in[3]) << 16) +
410+
(hexval(in[4]) << 12) +
411+
(hexval(in[5]) << 8) +
412+
(hexval(in[6]) << 4) +
413+
hexval(in[7]);
414+
check_unicode_value(unicode,
415+
in - str + position + 3, /* 3 for U&" */
416+
yyscanner);
417+
if (pair_first)
418+
{
419+
if (is_utf16_surrogate_second(unicode))
420+
{
421+
unicode = surrogate_pair_to_codepoint(pair_first, unicode);
422+
pair_first = 0;
423+
}
424+
else
425+
goto invalid_pair;
426+
}
427+
else if (is_utf16_surrogate_second(unicode))
428+
goto invalid_pair;
429+
430+
if (is_utf16_surrogate_first(unicode))
431+
pair_first = unicode;
432+
else
433+
{
434+
unicode_to_utf8(unicode, (unsigned char *) out);
435+
out += pg_mblen(out);
436+
}
437+
in += 8;
438+
}
439+
/* escape character not followed by a recognized sequence */
else
440+
ereport(ERROR,
441+
(errcode(ERRCODE_SYNTAX_ERROR),
442+
errmsg("invalid Unicode escape value"),
443+
scanner_errposition(in - str + position + 3, /* 3 for U&" */
444+
yyscanner)));
445+
}
446+
else
447+
{
448+
/* ordinary character: not allowed while a first surrogate is pending */
if (pair_first)
449+
goto invalid_pair;
450+
451+
*out++ = *in++;
452+
}
453+
}
454+
455+
/* unfinished surrogate pair? */
456+
if (pair_first)
457+
goto invalid_pair;
458+
459+
*out = '\0';
460+
461+
/*
462+
* We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
463+
* codes; but it's probably not worth the trouble, since this isn't likely
464+
* to be a performance-critical path.
465+
*/
466+
pg_verifymbstr(new, out - new, false);
467+
return new;
468+
469+
/* common error exit for every surrogate-pairing violation detected above */
invalid_pair:
470+
ereport(ERROR,
471+
(errcode(ERRCODE_SYNTAX_ERROR),
472+
errmsg("invalid Unicode surrogate pair"),
473+
scanner_errposition(in - str + position + 3, /* 3 for U&" */
474+
yyscanner)));
475+
return NULL; /* keep compiler quiet */
476+
}

0 commit comments

Comments
 (0)