Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 03e0bc1

Browse files
committed
Repair bug in regexp split performance improvements.
Commit c8ea87e introduced a temporary conversion buffer for substrings extracted during regexp splits. Unfortunately, the code that sized the buffer did not account for degenerate (zero-length) regexp matches that are ignored when splitting, so for regexp_split_* calls it could under-size the buffer in such cases. Fix this, and add some regression test cases (though those will only catch the bug if run in a multibyte encoding). Backpatch to 9.3, as the faulty code was. Thanks to the PostGIS project, Regina Obe and Paul Ramsey for the report (via IRC) and assistance in analysis. Patch by me.
1 parent 84a3a1e commit 03e0bc1

File tree

3 files changed

+31
-6
lines changed

3 files changed

+31
-6
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -936,6 +936,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
936936
int array_len;
937937
int array_idx;
938938
int prev_match_end;
939+
int prev_valid_match_end;
939940
int start_search;
940941
int maxlen = 0; /* largest fetch length in characters */
941942

@@ -991,6 +992,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
991992

992993
/* search for the pattern, perhaps repeatedly */
993994
prev_match_end = 0;
995+
prev_valid_match_end = 0;
994996
start_search = 0;
995997
while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
996998
pmatch_len, pmatch))
@@ -1043,13 +1045,15 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
10431045
matchctx->nmatches++;
10441046

10451047
/*
1046-
* check length of unmatched portion between end of previous match
1047-
* and start of current one
1048+
* check length of unmatched portion between end of previous valid
1049+
* (nondegenerate, or degenerate but not ignored) match and start
1050+
* of current one
10481051
*/
10491052
if (fetching_unmatched &&
10501053
pmatch[0].rm_so >= 0 &&
1051-
(pmatch[0].rm_so - prev_match_end) > maxlen)
1052-
maxlen = (pmatch[0].rm_so - prev_match_end);
1054+
(pmatch[0].rm_so - prev_valid_match_end) > maxlen)
1055+
maxlen = (pmatch[0].rm_so - prev_valid_match_end);
1056+
prev_valid_match_end = pmatch[0].rm_eo;
10531057
}
10541058
prev_match_end = pmatch[0].rm_eo;
10551059

@@ -1075,8 +1079,8 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
10751079
* input string
10761080
*/
10771081
if (fetching_unmatched &&
1078-
(wide_len - prev_match_end) > maxlen)
1079-
maxlen = (wide_len - prev_match_end);
1082+
(wide_len - prev_valid_match_end) > maxlen)
1083+
maxlen = (wide_len - prev_valid_match_end);
10801084

10811085
/*
10821086
* Keep a note of the end position of the string for the benefit of

src/test/regress/expected/strings.out

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,24 @@ SELECT regexp_split_to_array('123456','.');
674674
{"","","","","","",""}
675675
(1 row)
676676

677+
SELECT regexp_split_to_array('123456','');
678+
regexp_split_to_array
679+
-----------------------
680+
{1,2,3,4,5,6}
681+
(1 row)
682+
683+
SELECT regexp_split_to_array('123456','(?:)');
684+
regexp_split_to_array
685+
-----------------------
686+
{1,2,3,4,5,6}
687+
(1 row)
688+
689+
SELECT regexp_split_to_array('1','');
690+
regexp_split_to_array
691+
-----------------------
692+
{1}
693+
(1 row)
694+
677695
-- errors
678696
SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo;
679697
ERROR: invalid regexp option: "z"

src/test/regress/sql/strings.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,9 @@ SELECT regexp_split_to_array('the quick brown fox jumps over the lazy dog', 'nom
188188
SELECT regexp_split_to_array('123456','1');
189189
SELECT regexp_split_to_array('123456','6');
190190
SELECT regexp_split_to_array('123456','.');
191+
SELECT regexp_split_to_array('123456','');
192+
SELECT regexp_split_to_array('123456','(?:)');
193+
SELECT regexp_split_to_array('1','');
191194
-- errors
192195
SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'zippy') AS foo;
193196
SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPs ovEr The lazy dOG', 'e', 'iz');

0 commit comments

Comments
 (0)