Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit c8c74ad

Browse files
committed
Get rid of O(N^2) script-parsing overhead in pgbench.
pgbench wants to record the starting line number of each command in its scripts. It was computing that by scanning from the script start and counting newlines, so that O(N^2) work had to be done for an N-command script. In a script with 50K lines, this adds up to about 10 seconds on my machine. To add insult to injury, the results were subtly wrong, because expr_scanner_offset() scanned to find the NUL that flex inserts at the end of the current token --- and before the first yylex call, no such NUL has been inserted. So we ended by computing the script's last line number not its first one. This was visible only in case of \gset at the start of a script, which perhaps accounts for the lack of complaints. To fix, steal an idea from plpgsql and track the current lexer ending position and line count as we advance through the script. (It's a bit simpler than plpgsql since we can't need to back up.) Also adjust a couple of other places that were invoking scans from script start when they didn't really need to. I made a new psqlscan function psql_scan_get_location() that replaces both expr_scanner_offset() and expr_scanner_get_lineno(), since in practice expr_scanner_get_lineno() was only being invoked to find the line number of the current lexer end position. Reported-by: Daniel Vérité <daniel@manitou-mail.org> Author: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://postgr.es/m/84a8a89e-adb8-47a9-9d34-c13f7150ee45@manitou-mail.org
1 parent e167191 commit c8c74ad

File tree

6 files changed

+89
-55
lines changed

6 files changed

+89
-55
lines changed

src/bin/pgbench/exprscan.l

+21-44
Original file line numberDiff line numberDiff line change
@@ -271,10 +271,14 @@ void
271271
expr_yyerror_more(yyscan_t yyscanner, const char *message, const char *more)
272272
{
273273
PsqlScanState state = yyget_extra(yyscanner);
274-
int error_detection_offset = expr_scanner_offset(state) - 1;
274+
int lineno;
275+
int error_detection_offset;
275276
YYSTYPE lval;
276277
char *full_line;
277278

279+
psql_scan_get_location(state, &lineno, &error_detection_offset);
280+
error_detection_offset--;
281+
278282
/*
279283
* While parsing an expression, we may not have collected the whole line
280284
* yet from the input source. Lex till EOL so we can report whole line.
@@ -289,7 +293,6 @@ expr_yyerror_more(yyscan_t yyscanner, const char *message, const char *more)
289293
/* Extract the line, trimming trailing newline if any */
290294
full_line = expr_scanner_get_substring(state,
291295
expr_start_offset,
292-
expr_scanner_offset(state),
293296
true);
294297

295298
syntax_error(expr_source, expr_lineno, full_line, expr_command,
@@ -336,12 +339,15 @@ expr_lex_one_word(PsqlScanState state, PQExpBuffer word_buf, int *offset)
336339
/* And lex. */
337340
lexresult = yylex(&lval, state->scanner);
338341

339-
/*
340-
* Save start offset of word, if any. We could do this more efficiently,
341-
* but for now this seems fine.
342-
*/
342+
/* Save start offset of word, if any. */
343343
if (lexresult)
344-
*offset = expr_scanner_offset(state) - word_buf->len;
344+
{
345+
int lineno;
346+
int end_offset;
347+
348+
psql_scan_get_location(state, &lineno, &end_offset);
349+
*offset = end_offset - word_buf->len;
350+
}
345351
else
346352
*offset = -1;
347353

@@ -404,35 +410,25 @@ expr_scanner_finish(yyscan_t yyscanner)
404410
}
405411

406412
/*
407-
* Get offset from start of string to end of current lexer token.
413+
* Get a malloc'd copy of the lexer input string from start_offset
414+
* to end of current lexer token. If chomp is true, drop any trailing
415+
* newline(s).
408416
*
409417
* We rely on the knowledge that flex modifies the scan buffer by storing
410418
* a NUL at the end of the current token (yytext). Note that this might
411419
* not work quite right if we were parsing a sub-buffer, but since pgbench
412-
* never invokes that functionality, it doesn't matter.
413-
*/
414-
int
415-
expr_scanner_offset(PsqlScanState state)
416-
{
417-
return strlen(state->scanbuf);
418-
}
419-
420-
/*
421-
* Get a malloc'd copy of the lexer input string from start_offset
422-
* to just before end_offset. If chomp is true, drop any trailing
423-
* newline(s).
420+
* never invokes that functionality, it doesn't matter. Also, this will
421+
* give the wrong answer (the whole remainder of the input) if called
422+
* before any yylex() call has been done.
424423
*/
425424
char *
426425
expr_scanner_get_substring(PsqlScanState state,
427-
int start_offset, int end_offset,
426+
int start_offset,
428427
bool chomp)
429428
{
430429
char *result;
431430
const char *scanptr = state->scanbuf + start_offset;
432-
int slen = end_offset - start_offset;
433-
434-
Assert(slen >= 0);
435-
Assert(end_offset <= strlen(state->scanbuf));
431+
size_t slen = strlen(scanptr);
436432

437433
if (chomp)
438434
{
@@ -447,22 +443,3 @@ expr_scanner_get_substring(PsqlScanState state,
447443

448444
return result;
449445
}
450-
451-
/*
452-
* Get the line number associated with the given string offset
453-
* (which must not be past the end of where we've lexed to).
454-
*/
455-
int
456-
expr_scanner_get_lineno(PsqlScanState state, int offset)
457-
{
458-
int lineno = 1;
459-
const char *p = state->scanbuf;
460-
461-
while (*p && offset > 0)
462-
{
463-
if (*p == '\n')
464-
lineno++;
465-
p++, offset--;
466-
}
467-
return lineno;
468-
}

src/bin/pgbench/pgbench.c

+6-8
Original file line numberDiff line numberDiff line change
@@ -5690,8 +5690,8 @@ process_backslash_command(PsqlScanState sstate, const char *source)
56905690
initPQExpBuffer(&word_buf);
56915691

56925692
/* Remember location of the backslash */
5693-
start_offset = expr_scanner_offset(sstate) - 1;
5694-
lineno = expr_scanner_get_lineno(sstate, start_offset);
5693+
psql_scan_get_location(sstate, &lineno, &start_offset);
5694+
start_offset--;
56955695

56965696
/* Collect first word of command */
56975697
if (!expr_lex_one_word(sstate, &word_buf, &word_offset))
@@ -5747,7 +5747,6 @@ process_backslash_command(PsqlScanState sstate, const char *source)
57475747
my_command->first_line =
57485748
expr_scanner_get_substring(sstate,
57495749
start_offset,
5750-
expr_scanner_offset(sstate),
57515750
true);
57525751

57535752
expr_scanner_finish(yyscanner);
@@ -5777,7 +5776,6 @@ process_backslash_command(PsqlScanState sstate, const char *source)
57775776
my_command->first_line =
57785777
expr_scanner_get_substring(sstate,
57795778
start_offset,
5780-
expr_scanner_offset(sstate),
57815779
true);
57825780

57835781
if (my_command->meta == META_SLEEP)
@@ -5952,8 +5950,6 @@ ParseScript(const char *script, const char *desc, int weight)
59525950
PQExpBufferData line_buf;
59535951
int alloc_num;
59545952
int index;
5955-
int lineno;
5956-
int start_offset;
59575953

59585954
#define COMMANDS_ALLOC_NUM 128
59595955
alloc_num = COMMANDS_ALLOC_NUM;
@@ -5977,20 +5973,22 @@ ParseScript(const char *script, const char *desc, int weight)
59775973
* stdstrings should be true, which is a bit riskier.
59785974
*/
59795975
psql_scan_setup(sstate, script, strlen(script), 0, true);
5980-
start_offset = expr_scanner_offset(sstate) - 1;
59815976

59825977
initPQExpBuffer(&line_buf);
59835978

59845979
index = 0;
59855980

59865981
for (;;)
59875982
{
5983+
int lineno;
5984+
int start_offset;
59885985
PsqlScanResult sr;
59895986
promptStatus_t prompt;
59905987
Command *command = NULL;
59915988

59925989
resetPQExpBuffer(&line_buf);
5993-
lineno = expr_scanner_get_lineno(sstate, start_offset);
5990+
5991+
psql_scan_get_location(sstate, &lineno, &start_offset);
59945992

59955993
sr = psql_scan(sstate, &line_buf, &prompt);
59965994

src/bin/pgbench/pgbench.h

+1-3
Original file line numberDiff line numberDiff line change
@@ -149,11 +149,9 @@ extern yyscan_t expr_scanner_init(PsqlScanState state,
149149
const char *source, int lineno, int start_offset,
150150
const char *command);
151151
extern void expr_scanner_finish(yyscan_t yyscanner);
152-
extern int expr_scanner_offset(PsqlScanState state);
153152
extern char *expr_scanner_get_substring(PsqlScanState state,
154-
int start_offset, int end_offset,
153+
int start_offset,
155154
bool chomp);
156-
extern int expr_scanner_get_lineno(PsqlScanState state, int offset);
157155

158156
extern void syntax_error(const char *source, int lineno, const char *line,
159157
const char *command, const char *msg,

src/fe_utils/psqlscan.l

+54
Original file line numberDiff line numberDiff line change
@@ -1079,6 +1079,10 @@ psql_scan_setup(PsqlScanState state,
10791079
/* Set lookaside data in case we have to map unsafe encoding */
10801080
state->curline = state->scanbuf;
10811081
state->refline = state->scanline;
1082+
1083+
/* Initialize state for psql_scan_get_location() */
1084+
state->cur_line_no = 0; /* yylex not called yet */
1085+
state->cur_line_ptr = state->scanbuf;
10821086
}
10831087

10841088
/*
@@ -1136,6 +1140,10 @@ psql_scan(PsqlScanState state,
11361140
/* And lex. */
11371141
lexresult = yylex(NULL, state->scanner);
11381142

1143+
/* Notify psql_scan_get_location() that a yylex call has been made. */
1144+
if (state->cur_line_no == 0)
1145+
state->cur_line_no = 1;
1146+
11391147
/*
11401148
* Check termination state and return appropriate result info.
11411149
*/
@@ -1311,6 +1319,52 @@ psql_scan_in_quote(PsqlScanState state)
13111319
state->start_state != xqs;
13121320
}
13131321

1322+
/*
1323+
* Return the current scanning location (end+1 of last scanned token),
1324+
* as a line number counted from 1 and an offset from string start.
1325+
*
1326+
* This considers only the outermost input string, and therefore is of
1327+
* limited use for programs that use psqlscan_push_new_buffer().
1328+
*
1329+
* It would be a bit easier probably to use "%option yylineno" to count
1330+
* lines, but the flex manual says that has a performance cost, and only
1331+
* a minority of programs using psqlscan have need for this functionality.
1332+
* So we implement it ourselves without adding overhead to the lexer itself.
1333+
*/
1334+
void
1335+
psql_scan_get_location(PsqlScanState state,
1336+
int *lineno, int *offset)
1337+
{
1338+
const char *line_end;
1339+
1340+
/*
1341+
* We rely on flex's having stored a NUL after the current token in
1342+
* scanbuf. Therefore we must specially handle the state before yylex()
1343+
* has been called, when obviously that won't have happened yet.
1344+
*/
1345+
if (state->cur_line_no == 0)
1346+
{
1347+
*lineno = 1;
1348+
*offset = 0;
1349+
return;
1350+
}
1351+
1352+
/*
1353+
* Advance cur_line_no/cur_line_ptr past whatever has been lexed so far.
1354+
* Doing this prevents repeated calls from being O(N^2) for long inputs.
1355+
*/
1356+
while ((line_end = strchr(state->cur_line_ptr, '\n')) != NULL)
1357+
{
1358+
state->cur_line_no++;
1359+
state->cur_line_ptr = line_end + 1;
1360+
}
1361+
state->cur_line_ptr += strlen(state->cur_line_ptr);
1362+
1363+
/* Report current location. */
1364+
*lineno = state->cur_line_no;
1365+
*offset = state->cur_line_ptr - state->scanbuf;
1366+
}
1367+
13141368
/*
13151369
* Push the given string onto the stack of stuff to scan.
13161370
*

src/include/fe_utils/psqlscan.h

+3
Original file line numberDiff line numberDiff line change
@@ -87,4 +87,7 @@ extern void psql_scan_reselect_sql_lexer(PsqlScanState state);
8787

8888
extern bool psql_scan_in_quote(PsqlScanState state);
8989

90+
extern void psql_scan_get_location(PsqlScanState state,
91+
int *lineno, int *offset);
92+
9093
#endif /* PSQLSCAN_H */

src/include/fe_utils/psqlscan_int.h

+4
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,10 @@ typedef struct PsqlScanStateData
104104
const char *curline; /* actual flex input string for cur buf */
105105
const char *refline; /* original data for cur buffer */
106106

107+
/* status for psql_scan_get_location() */
108+
int cur_line_no; /* current line#, or 0 if no yylex done */
109+
const char *cur_line_ptr; /* points into cur_line_no'th line in scanbuf */
110+
107111
/*
108112
* All this state lives across successive input lines, until explicitly
109113
* reset by psql_scan_reset. start_state is adopted by yylex() on entry,

0 commit comments

Comments
 (0)