Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit d809fd0

Browse files
committed
Improve parser's one-extra-token lookahead mechanism.
There are a couple of places in our grammar that fail to be strict LALR(1), by requiring more than a single token of lookahead to decide what to do. Up to now we've dealt with that by using a filter between the lexer and parser that merges adjacent tokens into one in the places where two tokens of lookahead are necessary. But that creates a number of user-visible anomalies, for instance that you can't name a CTE "ordinality" because "WITH ordinality AS ..." triggers folding of WITH and ORDINALITY into one token. I realized that there's a better way. In this patch, we still do the lookahead basically as before, but we never merge the second token into the first; we replace just the first token by a special lookahead symbol when one of the lookahead pairs is seen. This requires a couple extra productions in the grammar, but it involves fewer special tokens, so that the grammar tables come out a bit smaller than before. The filter logic is no slower than before, perhaps a bit faster. I also fixed the filter logic so that when backing up after a lookahead, the current token's terminator is correctly restored; this eliminates some weird behavior in error message issuance, as is shown by the one change in existing regression test outputs. I believe that this patch entirely eliminates odd behaviors caused by lookahead for WITH. It doesn't really improve the situation for NULLS followed by FIRST/LAST unfortunately: those sequences still act like a reserved word, even though there are cases where they should be seen as two ordinary identifiers, eg "SELECT nulls first FROM ...". I experimented with additional grammar hacks but couldn't find any simple solution for that. Still, this is better than before, and it seems much more likely that we *could* somehow solve the NULLS case on the basis of this filter behavior than the previous one.
1 parent 23a7835 commit d809fd0

File tree

8 files changed

+173
-113
lines changed

8 files changed

+173
-113
lines changed

src/backend/parser/gram.y

+25-10
Original file line numberDiff line numberDiff line change
@@ -633,9 +633,9 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
633633
/*
634634
* The grammar thinks these are keywords, but they are not in the kwlist.h
635635
* list and so can never be entered directly. The filter in parser.c
636-
* creates these tokens when required.
636+
* creates these tokens when required (based on looking one token ahead).
637637
*/
638-
%token NULLS_FIRST NULLS_LAST WITH_ORDINALITY WITH_TIME
638+
%token NULLS_LA WITH_LA
639639

640640

641641
/* Precedence: lowest to highest */
@@ -873,6 +873,7 @@ CreateRoleStmt:
873873

874874

875875
opt_with: WITH {}
876+
| WITH_LA {}
876877
| /*EMPTY*/ {}
877878
;
878879

@@ -6673,8 +6674,8 @@ opt_asc_desc: ASC { $$ = SORTBY_ASC; }
66736674
| /*EMPTY*/ { $$ = SORTBY_DEFAULT; }
66746675
;
66756676

6676-
opt_nulls_order: NULLS_FIRST { $$ = SORTBY_NULLS_FIRST; }
6677-
| NULLS_LAST { $$ = SORTBY_NULLS_LAST; }
6677+
opt_nulls_order: NULLS_LA FIRST_P { $$ = SORTBY_NULLS_FIRST; }
6678+
| NULLS_LA LAST_P { $$ = SORTBY_NULLS_LAST; }
66786679
| /*EMPTY*/ { $$ = SORTBY_NULLS_DEFAULT; }
66796680
;
66806681

@@ -8923,7 +8924,7 @@ AlterTSDictionaryStmt:
89238924
;
89248925

89258926
AlterTSConfigurationStmt:
8926-
ALTER TEXT_P SEARCH CONFIGURATION any_name ADD_P MAPPING FOR name_list WITH any_name_list
8927+
ALTER TEXT_P SEARCH CONFIGURATION any_name ADD_P MAPPING FOR name_list any_with any_name_list
89278928
{
89288929
AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);
89298930
n->cfgname = $5;
@@ -8933,7 +8934,7 @@ AlterTSConfigurationStmt:
89338934
n->replace = false;
89348935
$$ = (Node*)n;
89358936
}
8936-
| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list WITH any_name_list
8937+
| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list any_with any_name_list
89378938
{
89388939
AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);
89398940
n->cfgname = $5;
@@ -8943,7 +8944,7 @@ AlterTSConfigurationStmt:
89438944
n->replace = false;
89448945
$$ = (Node*)n;
89458946
}
8946-
| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING REPLACE any_name WITH any_name
8947+
| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING REPLACE any_name any_with any_name
89478948
{
89488949
AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);
89498950
n->cfgname = $5;
@@ -8953,7 +8954,7 @@ AlterTSConfigurationStmt:
89538954
n->replace = true;
89548955
$$ = (Node*)n;
89558956
}
8956-
| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list REPLACE any_name WITH any_name
8957+
| ALTER TEXT_P SEARCH CONFIGURATION any_name ALTER MAPPING FOR name_list REPLACE any_name any_with any_name
89578958
{
89588959
AlterTSConfigurationStmt *n = makeNode(AlterTSConfigurationStmt);
89598960
n->cfgname = $5;
@@ -8981,6 +8982,11 @@ AlterTSConfigurationStmt:
89818982
}
89828983
;
89838984

8985+
/* Use this if TIME or ORDINALITY after WITH should be taken as an identifier */
8986+
any_with: WITH {}
8987+
| WITH_LA {}
8988+
;
8989+
89848990

89858991
/*****************************************************************************
89868992
*
@@ -9891,6 +9897,8 @@ simple_select:
98919897
* AS (query) [ SEARCH or CYCLE clause ]
98929898
*
98939899
* We don't currently support the SEARCH or CYCLE clause.
9900+
*
9901+
* Recognizing WITH_LA here allows a CTE to be named TIME or ORDINALITY.
98949902
*/
98959903
with_clause:
98969904
WITH cte_list
@@ -9900,6 +9908,13 @@ with_clause:
99009908
$$->recursive = false;
99019909
$$->location = @1;
99029910
}
9911+
| WITH_LA cte_list
9912+
{
9913+
$$ = makeNode(WithClause);
9914+
$$->ctes = $2;
9915+
$$->recursive = false;
9916+
$$->location = @1;
9917+
}
99039918
| WITH RECURSIVE cte_list
99049919
{
99059920
$$ = makeNode(WithClause);
@@ -10601,7 +10616,7 @@ opt_col_def_list: AS '(' TableFuncElementList ')' { $$ = $3; }
1060110616
| /*EMPTY*/ { $$ = NIL; }
1060210617
;
1060310618

10604-
opt_ordinality: WITH_ORDINALITY { $$ = true; }
10619+
opt_ordinality: WITH_LA ORDINALITY { $$ = true; }
1060510620
| /*EMPTY*/ { $$ = false; }
1060610621
;
1060710622

@@ -11057,7 +11072,7 @@ ConstInterval:
1105711072
;
1105811073

1105911074
opt_timezone:
11060-
WITH_TIME ZONE { $$ = TRUE; }
11075+
WITH_LA TIME ZONE { $$ = TRUE; }
1106111076
| WITHOUT TIME ZONE { $$ = FALSE; }
1106211077
| /*EMPTY*/ { $$ = FALSE; }
1106311078
;

src/backend/parser/parser.c

+58-47
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,13 @@ raw_parser(const char *str)
6464
/*
6565
* Intermediate filter between parser and core lexer (core_yylex in scan.l).
6666
*
67-
* The filter is needed because in some cases the standard SQL grammar
67+
* This filter is needed because in some cases the standard SQL grammar
6868
* requires more than one token lookahead. We reduce these cases to one-token
69-
* lookahead by combining tokens here, in order to keep the grammar LALR(1).
69+
* lookahead by replacing tokens here, in order to keep the grammar LALR(1).
7070
*
7171
* Using a filter is simpler than trying to recognize multiword tokens
7272
* directly in scan.l, because we'd have to allow for comments between the
73-
* words. Furthermore it's not clear how to do it without re-introducing
73+
* words. Furthermore it's not clear how to do that without re-introducing
7474
* scanner backtrack, which would cost more performance than this filter
7575
* layer does.
7676
*
@@ -84,7 +84,7 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
8484
base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
8585
int cur_token;
8686
int next_token;
87-
core_YYSTYPE cur_yylval;
87+
int cur_token_length;
8888
YYLTYPE cur_yylloc;
8989

9090
/* Get next token --- we might already have it */
@@ -93,74 +93,85 @@ base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
9393
cur_token = yyextra->lookahead_token;
9494
lvalp->core_yystype = yyextra->lookahead_yylval;
9595
*llocp = yyextra->lookahead_yylloc;
96+
*(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
9697
yyextra->have_lookahead = false;
9798
}
9899
else
99100
cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
100101

101-
/* Do we need to look ahead for a possible multiword token? */
102+
/*
103+
* If this token isn't one that requires lookahead, just return it. If it
104+
* does, determine the token length. (We could get that via strlen(), but
105+
* since we have such a small set of possibilities, hardwiring seems
106+
* feasible and more efficient.)
107+
*/
102108
switch (cur_token)
103109
{
104110
case NULLS_P:
111+
cur_token_length = 5;
112+
break;
113+
case WITH:
114+
cur_token_length = 4;
115+
break;
116+
default:
117+
return cur_token;
118+
}
105119

106-
/*
107-
* NULLS FIRST and NULLS LAST must be reduced to one token
108-
*/
109-
cur_yylval = lvalp->core_yystype;
110-
cur_yylloc = *llocp;
111-
next_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
120+
/*
121+
* Identify end+1 of current token. core_yylex() has temporarily stored a
122+
* '\0' here, and will undo that when we call it again. We need to redo
123+
* it to fully revert the lookahead call for error reporting purposes.
124+
*/
125+
yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
126+
*llocp + cur_token_length;
127+
Assert(*(yyextra->lookahead_end) == '\0');
128+
129+
/*
130+
* Save and restore *llocp around the call. It might look like we could
131+
* avoid this by just passing &lookahead_yylloc to core_yylex(), but that
132+
* does not work because flex actually holds onto the last-passed pointer
133+
* internally, and will use that for error reporting. We need any error
134+
* reports to point to the current token, not the next one.
135+
*/
136+
cur_yylloc = *llocp;
137+
138+
/* Get next token, saving outputs into lookahead variables */
139+
next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
140+
yyextra->lookahead_token = next_token;
141+
yyextra->lookahead_yylloc = *llocp;
142+
143+
*llocp = cur_yylloc;
144+
145+
/* Now revert the un-truncation of the current token */
146+
yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
147+
*(yyextra->lookahead_end) = '\0';
148+
149+
yyextra->have_lookahead = true;
150+
151+
/* Replace cur_token if needed, based on lookahead */
152+
switch (cur_token)
153+
{
154+
case NULLS_P:
155+
/* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
112156
switch (next_token)
113157
{
114158
case FIRST_P:
115-
cur_token = NULLS_FIRST;
116-
break;
117159
case LAST_P:
118-
cur_token = NULLS_LAST;
119-
break;
120-
default:
121-
/* save the lookahead token for next time */
122-
yyextra->lookahead_token = next_token;
123-
yyextra->lookahead_yylval = lvalp->core_yystype;
124-
yyextra->lookahead_yylloc = *llocp;
125-
yyextra->have_lookahead = true;
126-
/* and back up the output info to cur_token */
127-
lvalp->core_yystype = cur_yylval;
128-
*llocp = cur_yylloc;
160+
cur_token = NULLS_LA;
129161
break;
130162
}
131163
break;
132164

133165
case WITH:
134-
135-
/*
136-
* WITH TIME and WITH ORDINALITY must each be reduced to one token
137-
*/
138-
cur_yylval = lvalp->core_yystype;
139-
cur_yylloc = *llocp;
140-
next_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
166+
/* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
141167
switch (next_token)
142168
{
143169
case TIME:
144-
cur_token = WITH_TIME;
145-
break;
146170
case ORDINALITY:
147-
cur_token = WITH_ORDINALITY;
148-
break;
149-
default:
150-
/* save the lookahead token for next time */
151-
yyextra->lookahead_token = next_token;
152-
yyextra->lookahead_yylval = lvalp->core_yystype;
153-
yyextra->lookahead_yylloc = *llocp;
154-
yyextra->have_lookahead = true;
155-
/* and back up the output info to cur_token */
156-
lvalp->core_yystype = cur_yylval;
157-
*llocp = cur_yylloc;
171+
cur_token = WITH_LA;
158172
break;
159173
}
160174
break;
161-
162-
default:
163-
break;
164175
}
165176

166177
return cur_token;

src/include/parser/gramparse.h

+2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ typedef struct base_yy_extra_type
4646
int lookahead_token; /* one-token lookahead */
4747
core_YYSTYPE lookahead_yylval; /* yylval for lookahead token */
4848
YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */
49+
char *lookahead_end; /* end of current token */
50+
char lookahead_hold_char; /* to be put back at *lookahead_end */
4951

5052
/*
5153
* State variables that belong to the grammar.

src/interfaces/ecpg/preproc/parse.pl

+2-4
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,8 @@
4242

4343
# or in the block
4444
my %replace_string = (
45-
'WITH_TIME' => 'with time',
46-
'WITH_ORDINALITY' => 'with ordinality',
47-
'NULLS_FIRST' => 'nulls first',
48-
'NULLS_LAST' => 'nulls last',
45+
'NULLS_LA' => 'nulls',
46+
'WITH_LA' => 'with',
4947
'TYPECAST' => '::',
5048
'DOT_DOT' => '..',
5149
'COLON_EQUALS' => ':=',);

0 commit comments

Comments
 (0)