Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 3311ea8

Browse files
committed
Introduce a non-recursive JSON parser
This parser uses an explicit prediction stack, unlike the present recursive descent parser where the parser state is represented on the call stack. This difference makes the new parser suitable for use in incremental parsing of huge JSON documents that cannot be conveniently handled piece-wise by the recursive descent parser. One potential use for this will be in parsing large backup manifests associated with incremental backups. Because this parser is somewhat slower than the recursive descent parser, it is not replacing that parser, but is an additional parser available to callers. For testing purposes, if the build is done with -DFORCE_JSON_PSTACK, all JSON parsing is done with the non-recursive parser, in which case only trivial regression differences in error messages should be observed. Author: Andrew Dunstan Reviewed-By: Jacob Champion Discussion: https://postgr.es/m/7b0a51d6-0d9d-7366-3a1a-f74397a02f55@dunslane.net
1 parent 585df02 commit 3311ea8

16 files changed

+21563
-9
lines changed

src/common/jsonapi.c

+945-9
Large diffs are not rendered by default.

src/include/common/jsonapi.h

+30
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ typedef enum JsonTokenType
3636
typedef enum JsonParseErrorType
3737
{
3838
JSON_SUCCESS,
39+
JSON_INCOMPLETE,
40+
JSON_INVALID_LEXER_TYPE,
41+
JSON_NESTING_TOO_DEEP,
3942
JSON_ESCAPING_INVALID,
4043
JSON_ESCAPING_REQUIRED,
4144
JSON_EXPECTED_ARRAY_FIRST,
@@ -57,6 +60,9 @@ typedef enum JsonParseErrorType
5760
JSON_SEM_ACTION_FAILED, /* error should already be reported */
5861
} JsonParseErrorType;
5962

63+
/* Parser state private to jsonapi.c */
64+
typedef struct JsonParserStack JsonParserStack;
65+
typedef struct JsonIncrementalState JsonIncrementalState;
6066

6167
/*
6268
* All the fields in this structure should be treated as read-only.
@@ -71,6 +77,11 @@ typedef enum JsonParseErrorType
7177
* AFTER the end of the token, i.e. where there would be a nul byte
7278
* if we were using nul-terminated strings.
7379
*
80+
* The prev_token_terminator field should not be used when incremental is
81+
* true, as the previous token might have started in a previous piece of input,
82+
* and thus it can't be used in any pointer arithmetic or other operations in
83+
* conjunction with token_start.
84+
*
7485
* JSONLEX_FREE_STRUCT/STRVAL are used to drive freeJsonLexContext.
7586
*/
7687
#define JSONLEX_FREE_STRUCT (1 << 0)
@@ -83,11 +94,14 @@ typedef struct JsonLexContext
8394
char *token_start;
8495
char *token_terminator;
8596
char *prev_token_terminator;
97+
bool incremental;
8698
JsonTokenType token_type;
8799
int lex_level;
88100
bits32 flags;
89101
int line_number; /* line number, starting from 1 */
90102
char *line_start; /* where that line starts within input */
103+
JsonParserStack *pstack;
104+
JsonIncrementalState *inc_state;
91105
StringInfo strval;
92106
StringInfo errormsg;
93107
} JsonLexContext;
@@ -141,6 +155,12 @@ typedef struct JsonSemAction
141155
extern JsonParseErrorType pg_parse_json(JsonLexContext *lex,
142156
JsonSemAction *sem);
143157

158+
extern JsonParseErrorType pg_parse_json_incremental(JsonLexContext *lex,
159+
JsonSemAction *sem,
160+
char *json,
161+
int len,
162+
bool is_last);
163+
144164
/* the null action object used for pure validation */
145165
extern PGDLLIMPORT JsonSemAction nullSemAction;
146166

@@ -176,6 +196,16 @@ extern JsonLexContext *makeJsonLexContextCstringLen(JsonLexContext *lex,
176196
int len,
177197
int encoding,
178198
bool need_escapes);
199+
200+
/*
201+
* make a JsonLexContext suitable for incremental parsing.
202+
* the string chunks will be handed to pg_parse_json_incremental,
203+
* so there's no need for them here.
204+
*/
205+
extern JsonLexContext *makeJsonLexContextIncremental(JsonLexContext *lex,
206+
int encoding,
207+
bool need_escapes);
208+
179209
extern void freeJsonLexContext(JsonLexContext *lex);
180210

181211
/* lex one token */

src/include/pg_config_manual.h

+7
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,13 @@
240240
*------------------------------------------------------------------------
241241
*/
242242

243+
/*
244+
* Force use of the non-recursive JSON parser in all cases. This is useful
245+
* to validate the working of the parser, and the regression tests should
246+
* pass except for some different error messages about the stack limit.
247+
*/
248+
/* #define FORCE_JSON_PSTACK */
249+
243250
/*
244251
* Include Valgrind "client requests", mostly in the memory allocator, so
245252
* Valgrind understands PostgreSQL memory contexts. This permits detecting

src/test/modules/Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ SUBDIRS = \
2222
test_extensions \
2323
test_ginpostinglist \
2424
test_integerset \
25+
test_json_parser \
2526
test_lfind \
2627
test_misc \
2728
test_oat_hooks \

src/test/modules/meson.build

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ subdir('test_dsm_registry')
2121
subdir('test_extensions')
2222
subdir('test_ginpostinglist')
2323
subdir('test_integerset')
24+
subdir('test_json_parser')
2425
subdir('test_lfind')
2526
subdir('test_misc')
2627
subdir('test_oat_hooks')
+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
2+
PGFILEDESC = "standalone json parser tester"
3+
PGAPPICON = win32
4+
5+
TAP_TESTS = 1
6+
7+
OBJS = test_json_parser_incremental.o test_json_parser_perf.o
8+
9+
ifdef USE_PGXS
10+
PG_CONFIG = pg_config
11+
PGXS := $(shell $(PG_CONFIG) --pgxs)
12+
include $(PGXS)
13+
else
14+
subdir = src/test/modules/test_json_parser
15+
top_builddir = ../../../..
16+
include $(top_builddir)/src/Makefile.global
17+
include $(top_srcdir)/contrib/contrib-global.mk
18+
endif
19+
20+
all: test_json_parser_incremental$(X) test_json_parser_perf$(X)
21+
22+
%.o: $(top_srcdir)/$(subdir)/%.c
23+
24+
PARSER_LIBS = $(top_builddir)/src/common/libpgcommon.a $(top_builddir)/src/port/libpgport.a
25+
26+
test_json_parser_incremental$(X): test_json_parser_incremental.o $(PARSER_LIBS)
27+
$(CC) $(CFLAGS) $^ -o $@
28+
29+
test_json_parser_perf$(X): test_json_parser_perf.o $(PARSER_LIBS)
30+
$(CC) $(CFLAGS) $^ -o $@
31+
32+
speed-check: test_json_parser_perf$(X)
33+
@echo Standard parser:
34+
time ./test_json_parser_perf 10000 $(top_srcdir)/$(subdir)/tiny.json
35+
@echo Incremental parser:
36+
time ./test_json_parser_perf -i 10000 $(top_srcdir)/$(subdir)/tiny.json
+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
Module `test_json_parser`
2+
=========================
3+
4+
This module contains two programs for testing the json parsers.
5+
6+
- `test_json_parser_incremental` is for testing the incremental parser. It
7+
reads in a file and passes it in very small chunks (60 bytes at a time) to
8+
the incremental parser. It's not meant to be a speed test but to test the
9+
accuracy of the incremental parser. It takes one argument: the name of the
10+
input file.
11+
- `test_json_parser_perf` is for speed testing both the standard
12+
recursive descent parser and the non-recursive incremental
13+
parser. If given the `-i` flag it uses the non-recursive parser,
14+
otherwise the standard parser. The remaining flags are the number of
15+
parsing iterations and the file containing the input. Even when
16+
using the non-recursive parser, the input is passed to the parser in a
17+
single chunk. The results are thus comparable to those of the
18+
standard parser.
19+
20+
The easiest way to use these is to run `make check` and `make speed-check`.
21+
22+
The sample input file is a small extract from a list of `delicious`
23+
bookmarks taken some years ago, all wrapped in a single json
24+
array. 10,000 iterations of parsing this file gives a reasonable
25+
benchmark, and that is what the `speed-check` target does.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Copyright (c) 2024, PostgreSQL Global Development Group
2+
3+
test_json_parser_incremental_sources = files(
4+
'test_json_parser_incremental.c',
5+
)
6+
7+
if host_system == 'windows'
8+
test_json_parser_incremental_sources += rc_bin_gen.process(win32ver_rc, extra_args: [
9+
'--NAME', 'test_json_parser_incremental',
10+
'--FILEDESC', 'standalone json parser tester',
11+
])
12+
endif
13+
14+
test_json_parser_incremental = executable('test_json_parser_incremental',
15+
test_json_parser_incremental_sources,
16+
dependencies: [frontend_code],
17+
kwargs: default_bin_args + {
18+
'install': false,
19+
},
20+
)
21+
22+
test_json_parser_perf_sources = files(
23+
'test_json_parser_perf.c',
24+
)
25+
26+
if host_system == 'windows'
27+
test_json_parser_perf_sources += rc_bin_gen.process(win32ver_rc, extra_args: [
28+
'--NAME', 'test_json_parser_perf',
29+
'--FILEDESC', 'standalone json parser tester',
30+
])
31+
endif
32+
33+
test_json_parser_perf = executable('test_json_parser_perf',
34+
test_json_parser_perf_sources,
35+
dependencies: [frontend_code],
36+
kwargs: default_bin_args + {
37+
'install': false,
38+
},
39+
)
40+
41+
tests += {
42+
'name': 'test_json_parser',
43+
'sd': meson.current_source_dir(),
44+
'bd': meson.current_build_dir(),
45+
'tap': {
46+
'tests': [
47+
't/001_test_json_parser_incremental.pl',
48+
't/002_inline.pl',
49+
't/003_test_semantic.pl'
50+
],
51+
},
52+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
2+
use strict;
3+
use warnings;
4+
5+
use PostgreSQL::Test::Utils;
6+
use Test::More;
7+
use FindBin;
8+
9+
use File::Temp qw(tempfile);
10+
11+
my $test_file = "$FindBin::RealBin/../tiny.json";
12+
13+
my $exe = "test_json_parser_incremental";
14+
15+
for (my $size = 64; $size > 0; $size--)
16+
{
17+
my ($stdout, $stderr) = run_command( [$exe, "-c", $size, $test_file] );
18+
19+
like($stdout, qr/SUCCESS/, "chunk size $size: test succeeds");
20+
is($stderr, "", "chunk size $size: no error output");
21+
}
22+
23+
done_testing();
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
use strict;
2+
use warnings;
3+
4+
use PostgreSQL::Test::Utils;
5+
use Test::More;
6+
7+
use File::Temp qw(tempfile);
8+
9+
sub test
10+
{
11+
local $Test::Builder::Level = $Test::Builder::Level + 1;
12+
13+
my ($name, $json, %params) = @_;
14+
my $exe = "test_json_parser_incremental";
15+
my $chunk = length($json);
16+
17+
if ($chunk > 64)
18+
{
19+
$chunk = 64;
20+
}
21+
22+
my ($fh, $fname) = tempfile(UNLINK => 1);
23+
print $fh "$json";
24+
close($fh);
25+
26+
foreach my $size (reverse(1..$chunk))
27+
{
28+
my ($stdout, $stderr) = run_command( [$exe, "-c", $size, $fname] );
29+
30+
if (defined($params{error}))
31+
{
32+
unlike($stdout, qr/SUCCESS/, "$name, chunk size $size: test fails");
33+
like($stderr, $params{error}, "$name, chunk size $size: correct error output");
34+
}
35+
else
36+
{
37+
like($stdout, qr/SUCCESS/, "$name, chunk size $size: test succeeds");
38+
is($stderr, "", "$name, chunk size $size: no error output");
39+
}
40+
}
41+
}
42+
43+
test("number", "12345");
44+
test("string", '"hello"');
45+
test("false", "false");
46+
test("true", "true");
47+
test("null", "null");
48+
test("empty object", "{}");
49+
test("empty array", "[]");
50+
test("array with number", "[12345]");
51+
test("array with numbers", "[12345,67890]");
52+
test("array with null", "[null]");
53+
test("array with string", '["hello"]');
54+
test("array with boolean", '[false]');
55+
test("single pair", '{"key": "value"}');
56+
test("heavily nested array", "[" x 3200 . "]" x 3200);
57+
test("serial escapes", '"\\\\\\\\\\\\\\\\"');
58+
test("interrupted escapes", '"\\\\\\"\\\\\\\\\\"\\\\"');
59+
test("whitespace", ' "" ');
60+
61+
test("unclosed empty object", "{", error => qr/input string ended unexpectedly/);
62+
test("bad key", "{{", error => qr/Expected string or "}", but found "\{"/);
63+
test("bad key", "{{}", error => qr/Expected string or "}", but found "\{"/);
64+
test("numeric key", "{1234: 2}", error => qr/Expected string or "}", but found "1234"/);
65+
test("second numeric key", '{"a": "a", 1234: 2}', error => qr/Expected string, but found "1234"/);
66+
test("unclosed object with pair", '{"key": "value"', error => qr/input string ended unexpectedly/);
67+
test("missing key value", '{"key": }', error => qr/Expected JSON value, but found "}"/);
68+
test("missing colon", '{"key" 12345}', error => qr/Expected ":", but found "12345"/);
69+
test("missing comma", '{"key": 12345 12345}', error => qr/Expected "," or "}", but found "12345"/);
70+
test("overnested array", "[" x 6401, error => qr/maximum permitted depth is 6400/);
71+
test("overclosed array", "[]]", error => qr/Expected end of input, but found "]"/);
72+
test("unexpected token in array", "[ }}} ]", error => qr/Expected array element or "]", but found "}"/);
73+
# The "|" must be escaped: unescaped it is regex alternation, which would make
# the pattern match either 'Token "' or '" is invalid' on their own.
test("junk punctuation", "[ ||| ]", error => qr/Token "\|" is invalid/);
74+
test("missing comma in array", "[123 123]", error => qr/Expected "," or "]", but found "123"/);
75+
test("misspelled boolean", "tru", error => qr/Token "tru" is invalid/);
76+
test("misspelled boolean in array", "[tru]", error => qr/Token "tru" is invalid/);
77+
test("smashed top-level scalar", "12zz", error => qr/Token "12zz" is invalid/);
78+
test("smashed scalar in array", "[12zz]", error => qr/Token "12zz" is invalid/);
79+
test("unknown escape sequence", '"hello\vworld"', error => qr/Escape sequence "\\v" is invalid/);
80+
test("unescaped control", "\"hello\tworld\"", error => qr/Character with value 0x09 must be escaped/);
81+
test("incorrect escape count", '"\\\\\\\\\\\\\\"', error => qr/Token ""\\\\\\\\\\\\\\"" is invalid/);
82+
83+
done_testing();
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
use strict;
2+
use warnings;
3+
4+
use PostgreSQL::Test::Utils;
5+
use Test::More;
6+
use FindBin;
7+
8+
use File::Temp qw(tempfile);
9+
10+
my $test_file = "$FindBin::RealBin/../tiny.json";
11+
my $test_out = "$FindBin::RealBin/../tiny.out";
12+
13+
my $exe = "test_json_parser_incremental";
14+
15+
my ($stdout, $stderr) = run_command( [$exe, "-s", $test_file] );
16+
17+
is($stderr, "", "no error output");
18+
19+
# In list context File::Temp keeps the file unless UNLINK is set; request
# removal at exit so the test does not leave stray temporary files behind.
my ($fh, $fname) = tempfile(UNLINK => 1);
20+
21+
print $fh $stdout,"\n";
22+
23+
close($fh);
24+
25+
($stdout, $stderr) = run_command(["diff", "-u", $fname, $test_out]);
26+
27+
is($stdout, "", "no output diff");
28+
is($stderr, "", "no diff error");
29+
30+
done_testing();
31+
32+
33+
34+
35+
36+

0 commit comments

Comments
 (0)