Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit c8ea87e

Browse files
committed
Avoid quadratic slowdown in regexp match/split functions.
regexp_matches, regexp_split_to_table and regexp_split_to_array all work by compiling a list of match positions as character offsets (NOT byte positions) in the source string. Formerly, they then used text_substr to extract the matched text; but in a multi-byte encoding, that counts the characters in the string, and the characters needed to reach the starting byte position, on every call. Accordingly, the performance degraded as the product of the input string length and the number of match positions, such that splitting a string of a few hundred kbytes could take many minutes. Repair by keeping the wide-character copy of the input string available (only in the case where encoding_max_length is not 1) after performing the match operation, and extracting substrings from that instead. This reduces the complexity to being linear in the number of result bytes, discounting the actual regexp match itself (which is not affected by this patch). In passing, remove cleanup using retail pfree() which was obsoleted by commit ff428cd (Feb 2008) which made cleanup of SRF multi-call contexts automatic. Also increase (to ~134 million) the maximum number of matches and provide an error message when it is reached. Backpatch all the way because this has been wrong forever. Analysis and patch by me; review by Kaiting Chen. Discussion: https://postgr.es/m/87pnyn55qh.fsf@news-spur.riddles.org.uk see also https://postgr.es/m/87lg996g4r.fsf@news-spur.riddles.org.uk
1 parent 3e2ceb2 commit c8ea87e

File tree

1 file changed

+129
-53
lines changed

1 file changed

+129
-53
lines changed

src/backend/utils/adt/regexp.c

+129-53
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#include "regex/regex.h"
3636
#include "utils/array.h"
3737
#include "utils/builtins.h"
38+
#include "utils/memutils.h"
3839
#include "utils/varlena.h"
3940

4041
#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
@@ -61,6 +62,9 @@ typedef struct regexp_matches_ctx
6162
/* workspace for build_regexp_match_result() */
6263
Datum *elems; /* has npatterns elements */
6364
bool *nulls; /* has npatterns elements */
65+
pg_wchar *wide_str; /* wide-char version of original string */
66+
char *conv_buf; /* conversion buffer */
67+
int conv_bufsiz; /* size thereof */
6468
} regexp_matches_ctx;
6569

6670
/*
@@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
111115
pg_re_flags *flags,
112116
Oid collation,
113117
bool use_subpatterns,
114-
bool ignore_degenerate);
115-
static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
118+
bool ignore_degenerate,
119+
bool fetching_unmatched);
116120
static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
117121
static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
118122

@@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)
863867
errhint("Use the regexp_matches function instead.")));
864868

865869
matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
866-
PG_GET_COLLATION(), true, false);
870+
PG_GET_COLLATION(), true, false, false);
867871

868872
if (matchctx->nmatches == 0)
869873
PG_RETURN_NULL();
@@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)
911915
matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
912916
&re_flags,
913917
PG_GET_COLLATION(),
914-
true, false);
918+
true, false, false);
915919

916920
/* Pre-create workspace that build_regexp_match_result needs */
917921
matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -933,9 +937,6 @@ regexp_matches(PG_FUNCTION_ARGS)
933937
SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
934938
}
935939

936-
/* release space in multi-call ctx to avoid intraquery memory leak */
937-
cleanup_regexp_matches(matchctx);
938-
939940
SRF_RETURN_DONE(funcctx);
940941
}
941942

@@ -954,17 +955,24 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
954955
* all the matching in one swoop. The returned regexp_matches_ctx contains
955956
* the locations of all the substrings matching the pattern.
956957
*
957-
* The two bool parameters have only two patterns (one for matching, one for
958+
* The three bool parameters have only two patterns (one for matching, one for
958959
* splitting) but it seems clearer to distinguish the functionality this way
959-
* than to key it all off one "is_split" flag.
960+
* than to key it all off one "is_split" flag. We don't currently assume that
961+
* fetching_unmatched is exclusive of fetching the matched text too; if it's
962+
* set, the conversion buffer is large enough to fetch any single matched or
963+
* unmatched string, but not any larger substring. (In practice, when splitting
964+
* the matches are usually small anyway, and it didn't seem worth complicating
965+
* the code further.)
960966
*/
961967
static regexp_matches_ctx *
962968
setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
963969
Oid collation,
964970
bool use_subpatterns,
965-
bool ignore_degenerate)
971+
bool ignore_degenerate,
972+
bool fetching_unmatched)
966973
{
967974
regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
975+
int eml = pg_database_encoding_max_length();
968976
int orig_len;
969977
pg_wchar *wide_str;
970978
int wide_len;
@@ -975,6 +983,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
975983
int array_idx;
976984
int prev_match_end;
977985
int start_search;
986+
int maxlen = 0; /* largest fetch length in characters */
978987

979988
/* save original string --- we'll extract result substrings from it */
980989
matchctx->orig_str = orig_str;
@@ -1003,8 +1012,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10031012
/* temporary output space for RE package */
10041013
pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
10051014

1006-
/* the real output space (grown dynamically if needed) */
1007-
array_len = re_flags->glob ? 256 : 32;
1015+
/*
1016+
* the real output space (grown dynamically if needed)
1017+
*
1018+
* use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
1019+
* than at 2^27
1020+
*/
1021+
array_len = re_flags->glob ? 255 : 31;
10081022
matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
10091023
array_idx = 0;
10101024

@@ -1024,9 +1038,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10241038
pmatch[0].rm_eo > prev_match_end))
10251039
{
10261040
/* enlarge output space if needed */
1027-
while (array_idx + matchctx->npatterns * 2 > array_len)
1041+
while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
10281042
{
1029-
array_len *= 2;
1043+
array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
1044+
if (array_len > MaxAllocSize/sizeof(int))
1045+
ereport(ERROR,
1046+
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1047+
errmsg("too many regular expression matches")));
10301048
matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
10311049
sizeof(int) * array_len);
10321050
}
@@ -1038,16 +1056,33 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10381056

10391057
for (i = 1; i <= matchctx->npatterns; i++)
10401058
{
1041-
matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
1042-
matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
1059+
int so = pmatch[i].rm_so;
1060+
int eo = pmatch[i].rm_eo;
1061+
matchctx->match_locs[array_idx++] = so;
1062+
matchctx->match_locs[array_idx++] = eo;
1063+
if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
1064+
maxlen = (eo - so);
10431065
}
10441066
}
10451067
else
10461068
{
1047-
matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
1048-
matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
1069+
int so = pmatch[0].rm_so;
1070+
int eo = pmatch[0].rm_eo;
1071+
matchctx->match_locs[array_idx++] = so;
1072+
matchctx->match_locs[array_idx++] = eo;
1073+
if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
1074+
maxlen = (eo - so);
10491075
}
10501076
matchctx->nmatches++;
1077+
1078+
/*
1079+
* check length of unmatched portion between end of previous match
1080+
* and start of current one
1081+
*/
1082+
if (fetching_unmatched &&
1083+
pmatch[0].rm_so >= 0 &&
1084+
(pmatch[0].rm_so - prev_match_end) > maxlen)
1085+
maxlen = (pmatch[0].rm_so - prev_match_end);
10511086
}
10521087
prev_match_end = pmatch[0].rm_eo;
10531088

@@ -1068,34 +1103,67 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
10681103
break;
10691104
}
10701105

1106+
/*
1107+
* check length of unmatched portion between end of last match and end of
1108+
* input string
1109+
*/
1110+
if (fetching_unmatched &&
1111+
(wide_len - prev_match_end) > maxlen)
1112+
maxlen = (wide_len - prev_match_end);
1113+
1114+
/*
1115+
* Keep a note of the end position of the string for the benefit of
1116+
* splitting code.
1117+
*/
1118+
matchctx->match_locs[array_idx] = wide_len;
1119+
1120+
if (eml > 1)
1121+
{
1122+
int64 maxsiz = eml * (int64) maxlen;
1123+
int conv_bufsiz;
1124+
1125+
/*
1126+
* Make the conversion buffer large enough for any substring of
1127+
* interest.
1128+
*
1129+
* Worst case: assume we need the maximum size (maxlen*eml), but take
1130+
* advantage of the fact that the original string length in bytes is an
1131+
* upper bound on the byte length of any fetched substring (and we know
1132+
* that len+1 is safe to allocate because the varlena header is longer
1133+
* than 1 byte).
1134+
*/
1135+
if (maxsiz > orig_len)
1136+
conv_bufsiz = orig_len + 1;
1137+
else
1138+
conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */
1139+
1140+
matchctx->conv_buf = palloc(conv_bufsiz);
1141+
matchctx->conv_bufsiz = conv_bufsiz;
1142+
matchctx->wide_str = wide_str;
1143+
}
1144+
else
1145+
{
1146+
/* No need to keep the wide string if we're in a single-byte charset. */
1147+
pfree(wide_str);
1148+
matchctx->wide_str = NULL;
1149+
matchctx->conv_buf = NULL;
1150+
matchctx->conv_bufsiz = 0;
1151+
}
1152+
10711153
/* Clean up temp storage */
1072-
pfree(wide_str);
10731154
pfree(pmatch);
10741155

10751156
return matchctx;
10761157
}
10771158

1078-
/*
1079-
* cleanup_regexp_matches - release memory of a regexp_matches_ctx
1080-
*/
1081-
static void
1082-
cleanup_regexp_matches(regexp_matches_ctx *matchctx)
1083-
{
1084-
pfree(matchctx->orig_str);
1085-
pfree(matchctx->match_locs);
1086-
if (matchctx->elems)
1087-
pfree(matchctx->elems);
1088-
if (matchctx->nulls)
1089-
pfree(matchctx->nulls);
1090-
pfree(matchctx);
1091-
}
1092-
10931159
/*
10941160
* build_regexp_match_result - build output array for current match
10951161
*/
10961162
static ArrayType *
10971163
build_regexp_match_result(regexp_matches_ctx *matchctx)
10981164
{
1165+
char *buf = matchctx->conv_buf;
1166+
int bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz;
10991167
Datum *elems = matchctx->elems;
11001168
bool *nulls = matchctx->nulls;
11011169
int dims[1];
@@ -1115,6 +1183,15 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
11151183
elems[i] = (Datum) 0;
11161184
nulls[i] = true;
11171185
}
1186+
else if (buf)
1187+
{
1188+
int len = pg_wchar2mb_with_len(matchctx->wide_str + so,
1189+
buf,
1190+
eo - so);
1191+
Assert(len < bufsiz);
1192+
elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
1193+
nulls[i] = false;
1194+
}
11181195
else
11191196
{
11201197
elems[i] = DirectFunctionCall3(text_substr,
@@ -1168,7 +1245,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
11681245
splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
11691246
&re_flags,
11701247
PG_GET_COLLATION(),
1171-
false, true);
1248+
false, true, true);
11721249

11731250
MemoryContextSwitchTo(oldcontext);
11741251
funcctx->user_fctx = (void *) splitctx;
@@ -1185,9 +1262,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
11851262
SRF_RETURN_NEXT(funcctx, result);
11861263
}
11871264

1188-
/* release space in multi-call ctx to avoid intraquery memory leak */
1189-
cleanup_regexp_matches(splitctx);
1190-
11911265
SRF_RETURN_DONE(funcctx);
11921266
}
11931267

@@ -1224,7 +1298,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
12241298
PG_GETARG_TEXT_PP(1),
12251299
&re_flags,
12261300
PG_GET_COLLATION(),
1227-
false, true);
1301+
false, true, true);
12281302

12291303
while (splitctx->next_match <= splitctx->nmatches)
12301304
{
@@ -1236,12 +1310,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
12361310
splitctx->next_match++;
12371311
}
12381312

1239-
/*
1240-
* We don't call cleanup_regexp_matches here; it would try to pfree the
1241-
* input string, which we didn't copy. The space is not in a long-lived
1242-
* memory context anyway.
1243-
*/
1244-
12451313
PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
12461314
}
12471315

@@ -1261,6 +1329,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
12611329
static Datum
12621330
build_regexp_split_result(regexp_matches_ctx *splitctx)
12631331
{
1332+
char *buf = splitctx->conv_buf;
12641333
int startpos;
12651334
int endpos;
12661335

@@ -1271,22 +1340,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
12711340
if (startpos < 0)
12721341
elog(ERROR, "invalid match ending position");
12731342

1274-
if (splitctx->next_match < splitctx->nmatches)
1343+
if (buf)
12751344
{
1345+
int bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz;
1346+
int len;
1347+
12761348
endpos = splitctx->match_locs[splitctx->next_match * 2];
12771349
if (endpos < startpos)
12781350
elog(ERROR, "invalid match starting position");
1279-
return DirectFunctionCall3(text_substr,
1280-
PointerGetDatum(splitctx->orig_str),
1281-
Int32GetDatum(startpos + 1),
1282-
Int32GetDatum(endpos - startpos));
1351+
len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
1352+
buf,
1353+
endpos-startpos);
1354+
Assert(len < bufsiz);
1355+
return PointerGetDatum(cstring_to_text_with_len(buf, len));
12831356
}
12841357
else
12851358
{
1286-
/* no more matches, return rest of string */
1287-
return DirectFunctionCall2(text_substr_no_len,
1359+
endpos = splitctx->match_locs[splitctx->next_match * 2];
1360+
if (endpos < startpos)
1361+
elog(ERROR, "invalid match starting position");
1362+
return DirectFunctionCall3(text_substr,
12881363
PointerGetDatum(splitctx->orig_str),
1289-
Int32GetDatum(startpos + 1));
1364+
Int32GetDatum(startpos + 1),
1365+
Int32GetDatum(endpos - startpos));
12901366
}
12911367
}
12921368

0 commit comments

Comments
 (0)