Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 450b247

Browse files
committed
Avoid quadratic slowdown in regexp match/split functions.
regexp_matches, regexp_split_to_table and regexp_split_to_array all work by compiling a list of match positions as character offsets (NOT byte positions) in the source string.

Formerly, they then used text_substr to extract the matched text; but in a multi-byte encoding, that counts the characters in the string, and the characters needed to reach the starting byte position, on every call. Accordingly, the performance degraded as the product of the input string length and the number of match positions, such that splitting a string of a few hundred kbytes could take many minutes.

Repair by keeping the wide-character copy of the input string available (only in the case where encoding_max_length is not 1) after performing the match operation, and extracting substrings from that instead. This reduces the complexity to being linear in the number of result bytes, discounting the actual regexp match itself (which is not affected by this patch).

In passing, remove cleanup using retail pfree() which was obsoleted by commit ff428cd (Feb 2008), which made cleanup of SRF multi-call contexts automatic. Also increase (to ~134 million) the maximum number of matches and provide an error message when it is reached.

Backpatch all the way because this has been wrong forever.

Analysis and patch by me; review by Kaiting Chen.

Discussion: https://postgr.es/m/87pnyn55qh.fsf@news-spur.riddles.org.uk

see also https://postgr.es/m/87lg996g4r.fsf@news-spur.riddles.org.uk
1 parent 173df4c commit 450b247

File tree

1 file changed

+131
-54
lines changed

1 file changed

+131
-54
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 131 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@
3535
#include "regex/regex.h"
3636
#include "utils/array.h"
3737
#include "utils/builtins.h"
38+
#include "utils/memutils.h"
3839

3940
#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
4041
(PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
@@ -60,6 +61,9 @@ typedef struct regexp_matches_ctx
6061
/* workspace for build_regexp_matches_result() */
6162
Datum *elems; /* has npatterns elements */
6263
bool *nulls; /* has npatterns elements */
64+
pg_wchar *wide_str; /* wide-char version of original string */
65+
char *conv_buf; /* conversion buffer */
66+
int conv_bufsiz; /* size thereof */
6367
} regexp_matches_ctx;
6468

6569
/*
@@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
111115
Oid collation,
112116
bool force_glob,
113117
bool use_subpatterns,
114-
bool ignore_degenerate);
115-
static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
118+
bool ignore_degenerate,
119+
bool fetching_unmatched);
116120
static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
117121
static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
118122

@@ -863,7 +867,7 @@ regexp_matches(PG_FUNCTION_ARGS)
863867
matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
864868
flags,
865869
PG_GET_COLLATION(),
866-
false, true, false);
870+
false, true, false, false);
867871

868872
/* Pre-create workspace that build_regexp_matches_result needs */
869873
matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
@@ -885,9 +889,6 @@ regexp_matches(PG_FUNCTION_ARGS)
885889
SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
886890
}
887891

888-
/* release space in multi-call ctx to avoid intraquery memory leak */
889-
cleanup_regexp_matches(matchctx);
890-
891892
SRF_RETURN_DONE(funcctx);
892893
}
893894

@@ -906,17 +907,25 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
906907
* all the matching in one swoop. The returned regexp_matches_ctx contains
907908
* the locations of all the substrings matching the pattern.
908909
*
909-
* The three bool parameters have only two patterns (one for each caller)
910-
* but it seems clearer to distinguish the functionality this way than to
911-
* key it all off one "is_split" flag.
910+
* The four bool parameters have only two patterns (one for matching, one for
911+
* splitting) but it seems clearer to distinguish the functionality this way
912+
* than to key it all off one "is_split" flag. We don't currently assume that
913+
* fetching_unmatched is exclusive of fetching the matched text too; if it's
914+
* set, the conversion buffer is large enough to fetch any single matched or
915+
* unmatched string, but not any larger substring. (In practice, when splitting
916+
* the matches are usually small anyway, and it didn't seem worth complicating
917+
* the code further.)
912918
*/
913919
static regexp_matches_ctx *
914920
setup_regexp_matches(text *orig_str, text *pattern, text *flags,
915921
Oid collation,
916-
bool force_glob, bool use_subpatterns,
917-
bool ignore_degenerate)
922+
bool force_glob,
923+
bool use_subpatterns,
924+
bool ignore_degenerate,
925+
bool fetching_unmatched)
918926
{
919927
regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
928+
int eml = pg_database_encoding_max_length();
920929
int orig_len;
921930
pg_wchar *wide_str;
922931
int wide_len;
@@ -928,6 +937,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
928937
int array_idx;
929938
int prev_match_end;
930939
int start_search;
940+
int maxlen = 0; /* largest fetch length in characters */
931941

932942
/* save original string --- we'll extract result substrings from it */
933943
matchctx->orig_str = orig_str;
@@ -969,8 +979,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
969979
/* temporary output space for RE package */
970980
pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
971981

972-
/* the real output space (grown dynamically if needed) */
973-
array_len = re_flags.glob ? 256 : 32;
982+
/*
983+
* the real output space (grown dynamically if needed)
984+
*
985+
* use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
986+
* than at 2^27
987+
*/
988+
array_len = re_flags.glob ? 255 : 31;
974989
matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
975990
array_idx = 0;
976991

@@ -990,9 +1005,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
9901005
pmatch[0].rm_eo > prev_match_end))
9911006
{
9921007
/* enlarge output space if needed */
993-
while (array_idx + matchctx->npatterns * 2 > array_len)
1008+
while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
9941009
{
995-
array_len *= 2;
1010+
array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
1011+
if (array_len > MaxAllocSize/sizeof(int))
1012+
ereport(ERROR,
1013+
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1014+
errmsg("too many regular expression matches")));
9961015
matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
9971016
sizeof(int) * array_len);
9981017
}
@@ -1004,16 +1023,33 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
10041023

10051024
for (i = 1; i <= matchctx->npatterns; i++)
10061025
{
1007-
matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
1008-
matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
1026+
int so = pmatch[i].rm_so;
1027+
int eo = pmatch[i].rm_eo;
1028+
matchctx->match_locs[array_idx++] = so;
1029+
matchctx->match_locs[array_idx++] = eo;
1030+
if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
1031+
maxlen = (eo - so);
10091032
}
10101033
}
10111034
else
10121035
{
1013-
matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
1014-
matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
1036+
int so = pmatch[0].rm_so;
1037+
int eo = pmatch[0].rm_eo;
1038+
matchctx->match_locs[array_idx++] = so;
1039+
matchctx->match_locs[array_idx++] = eo;
1040+
if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
1041+
maxlen = (eo - so);
10151042
}
10161043
matchctx->nmatches++;
1044+
1045+
/*
1046+
* check length of unmatched portion between end of previous match
1047+
* and start of current one
1048+
*/
1049+
if (fetching_unmatched &&
1050+
pmatch[0].rm_so >= 0 &&
1051+
(pmatch[0].rm_so - prev_match_end) > maxlen)
1052+
maxlen = (pmatch[0].rm_so - prev_match_end);
10171053
}
10181054
prev_match_end = pmatch[0].rm_eo;
10191055

@@ -1034,34 +1070,67 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
10341070
break;
10351071
}
10361072

1073+
/*
1074+
* check length of unmatched portion between end of last match and end of
1075+
* input string
1076+
*/
1077+
if (fetching_unmatched &&
1078+
(wide_len - prev_match_end) > maxlen)
1079+
maxlen = (wide_len - prev_match_end);
1080+
1081+
/*
1082+
* Keep a note of the end position of the string for the benefit of
1083+
* splitting code.
1084+
*/
1085+
matchctx->match_locs[array_idx] = wide_len;
1086+
1087+
if (eml > 1)
1088+
{
1089+
int64 maxsiz = eml * (int64) maxlen;
1090+
int conv_bufsiz;
1091+
1092+
/*
1093+
* Make the conversion buffer large enough for any substring of
1094+
* interest.
1095+
*
1096+
* Worst case: assume we need the maximum size (maxlen*eml), but take
1097+
* advantage of the fact that the original string length in bytes is an
1098+
* upper bound on the byte length of any fetched substring (and we know
1099+
* that len+1 is safe to allocate because the varlena header is longer
1100+
* than 1 byte).
1101+
*/
1102+
if (maxsiz > orig_len)
1103+
conv_bufsiz = orig_len + 1;
1104+
else
1105+
conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */
1106+
1107+
matchctx->conv_buf = palloc(conv_bufsiz);
1108+
matchctx->conv_bufsiz = conv_bufsiz;
1109+
matchctx->wide_str = wide_str;
1110+
}
1111+
else
1112+
{
1113+
/* No need to keep the wide string if we're in a single-byte charset. */
1114+
pfree(wide_str);
1115+
matchctx->wide_str = NULL;
1116+
matchctx->conv_buf = NULL;
1117+
matchctx->conv_bufsiz = 0;
1118+
}
1119+
10371120
/* Clean up temp storage */
1038-
pfree(wide_str);
10391121
pfree(pmatch);
10401122

10411123
return matchctx;
10421124
}
10431125

1044-
/*
1045-
* cleanup_regexp_matches - release memory of a regexp_matches_ctx
1046-
*/
1047-
static void
1048-
cleanup_regexp_matches(regexp_matches_ctx *matchctx)
1049-
{
1050-
pfree(matchctx->orig_str);
1051-
pfree(matchctx->match_locs);
1052-
if (matchctx->elems)
1053-
pfree(matchctx->elems);
1054-
if (matchctx->nulls)
1055-
pfree(matchctx->nulls);
1056-
pfree(matchctx);
1057-
}
1058-
10591126
/*
10601127
* build_regexp_matches_result - build output array for current match
10611128
*/
10621129
static ArrayType *
10631130
build_regexp_matches_result(regexp_matches_ctx *matchctx)
10641131
{
1132+
char *buf = matchctx->conv_buf;
1133+
int bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz;
10651134
Datum *elems = matchctx->elems;
10661135
bool *nulls = matchctx->nulls;
10671136
int dims[1];
@@ -1081,6 +1150,15 @@ build_regexp_matches_result(regexp_matches_ctx *matchctx)
10811150
elems[i] = (Datum) 0;
10821151
nulls[i] = true;
10831152
}
1153+
else if (buf)
1154+
{
1155+
int len = pg_wchar2mb_with_len(matchctx->wide_str + so,
1156+
buf,
1157+
eo - so);
1158+
Assert(len < bufsiz);
1159+
elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
1160+
nulls[i] = false;
1161+
}
10841162
else
10851163
{
10861164
elems[i] = DirectFunctionCall3(text_substr,
@@ -1123,7 +1201,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
11231201
splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
11241202
flags,
11251203
PG_GET_COLLATION(),
1126-
true, false, true);
1204+
true, false, true, true);
11271205

11281206
MemoryContextSwitchTo(oldcontext);
11291207
funcctx->user_fctx = (void *) splitctx;
@@ -1140,9 +1218,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
11401218
SRF_RETURN_NEXT(funcctx, result);
11411219
}
11421220

1143-
/* release space in multi-call ctx to avoid intraquery memory leak */
1144-
cleanup_regexp_matches(splitctx);
1145-
11461221
SRF_RETURN_DONE(funcctx);
11471222
}
11481223

@@ -1168,7 +1243,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
11681243
PG_GETARG_TEXT_PP(1),
11691244
PG_GETARG_TEXT_PP_IF_EXISTS(2),
11701245
PG_GET_COLLATION(),
1171-
true, false, true);
1246+
true, false, true, true);
11721247

11731248
while (splitctx->next_match <= splitctx->nmatches)
11741249
{
@@ -1180,12 +1255,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
11801255
splitctx->next_match++;
11811256
}
11821257

1183-
/*
1184-
* We don't call cleanup_regexp_matches here; it would try to pfree the
1185-
* input string, which we didn't copy. The space is not in a long-lived
1186-
* memory context anyway.
1187-
*/
1188-
11891258
PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
11901259
}
11911260

@@ -1205,6 +1274,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
12051274
static Datum
12061275
build_regexp_split_result(regexp_matches_ctx *splitctx)
12071276
{
1277+
char *buf = splitctx->conv_buf;
12081278
int startpos;
12091279
int endpos;
12101280

@@ -1215,22 +1285,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
12151285
if (startpos < 0)
12161286
elog(ERROR, "invalid match ending position");
12171287

1218-
if (splitctx->next_match < splitctx->nmatches)
1288+
if (buf)
12191289
{
1290+
int bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz;
1291+
int len;
1292+
12201293
endpos = splitctx->match_locs[splitctx->next_match * 2];
12211294
if (endpos < startpos)
12221295
elog(ERROR, "invalid match starting position");
1223-
return DirectFunctionCall3(text_substr,
1224-
PointerGetDatum(splitctx->orig_str),
1225-
Int32GetDatum(startpos + 1),
1226-
Int32GetDatum(endpos - startpos));
1296+
len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
1297+
buf,
1298+
endpos-startpos);
1299+
Assert(len < bufsiz);
1300+
return PointerGetDatum(cstring_to_text_with_len(buf, len));
12271301
}
12281302
else
12291303
{
1230-
/* no more matches, return rest of string */
1231-
return DirectFunctionCall2(text_substr_no_len,
1304+
endpos = splitctx->match_locs[splitctx->next_match * 2];
1305+
if (endpos < startpos)
1306+
elog(ERROR, "invalid match starting position");
1307+
return DirectFunctionCall3(text_substr,
12321308
PointerGetDatum(splitctx->orig_str),
1233-
Int32GetDatum(startpos + 1));
1309+
Int32GetDatum(startpos + 1),
1310+
Int32GetDatum(endpos - startpos));
12341311
}
12351312
}
12361313

0 commit comments

Comments
 (0)