35
35
#include "regex/regex.h"
36
36
#include "utils/array.h"
37
37
#include "utils/builtins.h"
38
+ #include "utils/memutils.h"
38
39
#include "utils/varlena.h"
39
40
40
41
#define PG_GETARG_TEXT_PP_IF_EXISTS (_n ) \
@@ -61,6 +62,9 @@ typedef struct regexp_matches_ctx
61
62
/* workspace for build_regexp_match_result() */
62
63
Datum * elems ; /* has npatterns elements */
63
64
bool * nulls ; /* has npatterns elements */
65
+ pg_wchar * wide_str ; /* wide-char version of original string */
66
+ char * conv_buf ; /* conversion buffer */
67
+ int conv_bufsiz ; /* size thereof */
64
68
} regexp_matches_ctx ;
65
69
66
70
/*
@@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
111
115
pg_re_flags * flags ,
112
116
Oid collation ,
113
117
bool use_subpatterns ,
114
- bool ignore_degenerate );
115
- static void cleanup_regexp_matches ( regexp_matches_ctx * matchctx );
118
+ bool ignore_degenerate ,
119
+ bool fetching_unmatched );
116
120
static ArrayType * build_regexp_match_result (regexp_matches_ctx * matchctx );
117
121
static Datum build_regexp_split_result (regexp_matches_ctx * splitctx );
118
122
@@ -863,7 +867,7 @@ regexp_match(PG_FUNCTION_ARGS)
863
867
errhint ("Use the regexp_matches function instead." )));
864
868
865
869
matchctx = setup_regexp_matches (orig_str , pattern , & re_flags ,
866
- PG_GET_COLLATION (), true, false);
870
+ PG_GET_COLLATION (), true, false, false );
867
871
868
872
if (matchctx -> nmatches == 0 )
869
873
PG_RETURN_NULL ();
@@ -911,7 +915,7 @@ regexp_matches(PG_FUNCTION_ARGS)
911
915
matchctx = setup_regexp_matches (PG_GETARG_TEXT_P_COPY (0 ), pattern ,
912
916
& re_flags ,
913
917
PG_GET_COLLATION (),
914
- true, false);
918
+ true, false, false );
915
919
916
920
/* Pre-create workspace that build_regexp_match_result needs */
917
921
matchctx -> elems = (Datum * ) palloc (sizeof (Datum ) * matchctx -> npatterns );
@@ -933,9 +937,6 @@ regexp_matches(PG_FUNCTION_ARGS)
933
937
SRF_RETURN_NEXT (funcctx , PointerGetDatum (result_ary ));
934
938
}
935
939
936
- /* release space in multi-call ctx to avoid intraquery memory leak */
937
- cleanup_regexp_matches (matchctx );
938
-
939
940
SRF_RETURN_DONE (funcctx );
940
941
}
941
942
@@ -954,17 +955,24 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
954
955
* all the matching in one swoop. The returned regexp_matches_ctx contains
955
956
* the locations of all the substrings matching the pattern.
956
957
*
957
- * The two bool parameters have only two patterns (one for matching, one for
958
+ * The three bool parameters have only two patterns (one for matching, one for
958
959
* splitting) but it seems clearer to distinguish the functionality this way
959
- * than to key it all off one "is_split" flag.
960
+ * than to key it all off one "is_split" flag. We don't currently assume that
961
+ * fetching_unmatched is exclusive of fetching the matched text too; if it's
962
+ * set, the conversion buffer is large enough to fetch any single matched or
963
+ * unmatched string, but not any larger substring. (In practice, when splitting
964
+ * the matches are usually small anyway, and it didn't seem worth complicating
965
+ * the code further.)
960
966
*/
961
967
static regexp_matches_ctx *
962
968
setup_regexp_matches (text * orig_str , text * pattern , pg_re_flags * re_flags ,
963
969
Oid collation ,
964
970
bool use_subpatterns ,
965
- bool ignore_degenerate )
971
+ bool ignore_degenerate ,
972
+ bool fetching_unmatched )
966
973
{
967
974
regexp_matches_ctx * matchctx = palloc0 (sizeof (regexp_matches_ctx ));
975
+ int eml = pg_database_encoding_max_length ();
968
976
int orig_len ;
969
977
pg_wchar * wide_str ;
970
978
int wide_len ;
@@ -975,6 +983,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
975
983
int array_idx ;
976
984
int prev_match_end ;
977
985
int start_search ;
986
+ int maxlen = 0 ; /* largest fetch length in characters */
978
987
979
988
/* save original string --- we'll extract result substrings from it */
980
989
matchctx -> orig_str = orig_str ;
@@ -1003,8 +1012,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
1003
1012
/* temporary output space for RE package */
1004
1013
pmatch = palloc (sizeof (regmatch_t ) * pmatch_len );
1005
1014
1006
- /* the real output space (grown dynamically if needed) */
1007
- array_len = re_flags -> glob ? 256 : 32 ;
1015
+ /*
1016
+ * the real output space (grown dynamically if needed)
1017
+ *
1018
+ * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
1019
+ * than at 2^27
1020
+ */
1021
+ array_len = re_flags -> glob ? 255 : 31 ;
1008
1022
matchctx -> match_locs = (int * ) palloc (sizeof (int ) * array_len );
1009
1023
array_idx = 0 ;
1010
1024
@@ -1024,9 +1038,13 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
1024
1038
pmatch [0 ].rm_eo > prev_match_end ))
1025
1039
{
1026
1040
/* enlarge output space if needed */
1027
- while (array_idx + matchctx -> npatterns * 2 > array_len )
1041
+ while (array_idx + matchctx -> npatterns * 2 + 1 > array_len )
1028
1042
{
1029
- array_len *= 2 ;
1043
+ array_len += array_len + 1 ; /* 2^n-1 => 2^(n+1)-1 */
1044
+ if (array_len > MaxAllocSize /sizeof (int ))
1045
+ ereport (ERROR ,
1046
+ (errcode (ERRCODE_PROGRAM_LIMIT_EXCEEDED ),
1047
+ errmsg ("too many regular expression matches" )));
1030
1048
matchctx -> match_locs = (int * ) repalloc (matchctx -> match_locs ,
1031
1049
sizeof (int ) * array_len );
1032
1050
}
@@ -1038,16 +1056,33 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
1038
1056
1039
1057
for (i = 1 ; i <= matchctx -> npatterns ; i ++ )
1040
1058
{
1041
- matchctx -> match_locs [array_idx ++ ] = pmatch [i ].rm_so ;
1042
- matchctx -> match_locs [array_idx ++ ] = pmatch [i ].rm_eo ;
1059
+ int so = pmatch [i ].rm_so ;
1060
+ int eo = pmatch [i ].rm_eo ;
1061
+ matchctx -> match_locs [array_idx ++ ] = so ;
1062
+ matchctx -> match_locs [array_idx ++ ] = eo ;
1063
+ if (so >= 0 && eo >= 0 && (eo - so ) > maxlen )
1064
+ maxlen = (eo - so );
1043
1065
}
1044
1066
}
1045
1067
else
1046
1068
{
1047
- matchctx -> match_locs [array_idx ++ ] = pmatch [0 ].rm_so ;
1048
- matchctx -> match_locs [array_idx ++ ] = pmatch [0 ].rm_eo ;
1069
+ int so = pmatch [0 ].rm_so ;
1070
+ int eo = pmatch [0 ].rm_eo ;
1071
+ matchctx -> match_locs [array_idx ++ ] = so ;
1072
+ matchctx -> match_locs [array_idx ++ ] = eo ;
1073
+ if (so >= 0 && eo >= 0 && (eo - so ) > maxlen )
1074
+ maxlen = (eo - so );
1049
1075
}
1050
1076
matchctx -> nmatches ++ ;
1077
+
1078
+ /*
1079
+ * check length of unmatched portion between end of previous match
1080
+ * and start of current one
1081
+ */
1082
+ if (fetching_unmatched &&
1083
+ pmatch [0 ].rm_so >= 0 &&
1084
+ (pmatch [0 ].rm_so - prev_match_end ) > maxlen )
1085
+ maxlen = (pmatch [0 ].rm_so - prev_match_end );
1051
1086
}
1052
1087
prev_match_end = pmatch [0 ].rm_eo ;
1053
1088
@@ -1068,34 +1103,67 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
1068
1103
break ;
1069
1104
}
1070
1105
1106
+ /*
1107
+ * check length of unmatched portion between end of last match and end of
1108
+ * input string
1109
+ */
1110
+ if (fetching_unmatched &&
1111
+ (wide_len - prev_match_end ) > maxlen )
1112
+ maxlen = (wide_len - prev_match_end );
1113
+
1114
+ /*
1115
+ * Keep a note of the end position of the string for the benefit of
1116
+ * splitting code.
1117
+ */
1118
+ matchctx -> match_locs [array_idx ] = wide_len ;
1119
+
1120
+ if (eml > 1 )
1121
+ {
1122
+ int64 maxsiz = eml * (int64 ) maxlen ;
1123
+ int conv_bufsiz ;
1124
+
1125
+ /*
1126
+ * Make the conversion buffer large enough for any substring of
1127
+ * interest.
1128
+ *
1129
+ * Worst case: assume we need the maximum size (maxlen*eml), but take
1130
+ * advantage of the fact that the original string length in bytes is an
1131
+ * upper bound on the byte length of any fetched substring (and we know
1132
+ * that len+1 is safe to allocate because the varlena header is longer
1133
+ * than 1 byte).
1134
+ */
1135
+ if (maxsiz > orig_len )
1136
+ conv_bufsiz = orig_len + 1 ;
1137
+ else
1138
+ conv_bufsiz = maxsiz + 1 ; /* safe since maxsiz < 2^30 */
1139
+
1140
+ matchctx -> conv_buf = palloc (conv_bufsiz );
1141
+ matchctx -> conv_bufsiz = conv_bufsiz ;
1142
+ matchctx -> wide_str = wide_str ;
1143
+ }
1144
+ else
1145
+ {
1146
+ /* No need to keep the wide string if we're in a single-byte charset. */
1147
+ pfree (wide_str );
1148
+ matchctx -> wide_str = NULL ;
1149
+ matchctx -> conv_buf = NULL ;
1150
+ matchctx -> conv_bufsiz = 0 ;
1151
+ }
1152
+
1071
1153
/* Clean up temp storage */
1072
- pfree (wide_str );
1073
1154
pfree (pmatch );
1074
1155
1075
1156
return matchctx ;
1076
1157
}
1077
1158
1078
- /*
1079
- * cleanup_regexp_matches - release memory of a regexp_matches_ctx
1080
- */
1081
- static void
1082
- cleanup_regexp_matches (regexp_matches_ctx * matchctx )
1083
- {
1084
- pfree (matchctx -> orig_str );
1085
- pfree (matchctx -> match_locs );
1086
- if (matchctx -> elems )
1087
- pfree (matchctx -> elems );
1088
- if (matchctx -> nulls )
1089
- pfree (matchctx -> nulls );
1090
- pfree (matchctx );
1091
- }
1092
-
1093
1159
/*
1094
1160
* build_regexp_match_result - build output array for current match
1095
1161
*/
1096
1162
static ArrayType *
1097
1163
build_regexp_match_result (regexp_matches_ctx * matchctx )
1098
1164
{
1165
+ char * buf = matchctx -> conv_buf ;
1166
+ int bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx -> conv_bufsiz ;
1099
1167
Datum * elems = matchctx -> elems ;
1100
1168
bool * nulls = matchctx -> nulls ;
1101
1169
int dims [1 ];
@@ -1115,6 +1183,15 @@ build_regexp_match_result(regexp_matches_ctx *matchctx)
1115
1183
elems [i ] = (Datum ) 0 ;
1116
1184
nulls [i ] = true;
1117
1185
}
1186
+ else if (buf )
1187
+ {
1188
+ int len = pg_wchar2mb_with_len (matchctx -> wide_str + so ,
1189
+ buf ,
1190
+ eo - so );
1191
+ Assert (len < bufsiz );
1192
+ elems [i ] = PointerGetDatum (cstring_to_text_with_len (buf , len ));
1193
+ nulls [i ] = false;
1194
+ }
1118
1195
else
1119
1196
{
1120
1197
elems [i ] = DirectFunctionCall3 (text_substr ,
@@ -1168,7 +1245,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
1168
1245
splitctx = setup_regexp_matches (PG_GETARG_TEXT_P_COPY (0 ), pattern ,
1169
1246
& re_flags ,
1170
1247
PG_GET_COLLATION (),
1171
- false, true);
1248
+ false, true, true );
1172
1249
1173
1250
MemoryContextSwitchTo (oldcontext );
1174
1251
funcctx -> user_fctx = (void * ) splitctx ;
@@ -1185,9 +1262,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
1185
1262
SRF_RETURN_NEXT (funcctx , result );
1186
1263
}
1187
1264
1188
- /* release space in multi-call ctx to avoid intraquery memory leak */
1189
- cleanup_regexp_matches (splitctx );
1190
-
1191
1265
SRF_RETURN_DONE (funcctx );
1192
1266
}
1193
1267
@@ -1224,7 +1298,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
1224
1298
PG_GETARG_TEXT_PP (1 ),
1225
1299
& re_flags ,
1226
1300
PG_GET_COLLATION (),
1227
- false, true);
1301
+ false, true, true );
1228
1302
1229
1303
while (splitctx -> next_match <= splitctx -> nmatches )
1230
1304
{
@@ -1236,12 +1310,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
1236
1310
splitctx -> next_match ++ ;
1237
1311
}
1238
1312
1239
- /*
1240
- * We don't call cleanup_regexp_matches here; it would try to pfree the
1241
- * input string, which we didn't copy. The space is not in a long-lived
1242
- * memory context anyway.
1243
- */
1244
-
1245
1313
PG_RETURN_ARRAYTYPE_P (makeArrayResult (astate , CurrentMemoryContext ));
1246
1314
}
1247
1315
@@ -1261,6 +1329,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
1261
1329
static Datum
1262
1330
build_regexp_split_result (regexp_matches_ctx * splitctx )
1263
1331
{
1332
+ char * buf = splitctx -> conv_buf ;
1264
1333
int startpos ;
1265
1334
int endpos ;
1266
1335
@@ -1271,22 +1340,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
1271
1340
if (startpos < 0 )
1272
1341
elog (ERROR , "invalid match ending position" );
1273
1342
1274
- if (splitctx -> next_match < splitctx -> nmatches )
1343
+ if (buf )
1275
1344
{
1345
+ int bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx -> conv_bufsiz ;
1346
+ int len ;
1347
+
1276
1348
endpos = splitctx -> match_locs [splitctx -> next_match * 2 ];
1277
1349
if (endpos < startpos )
1278
1350
elog (ERROR , "invalid match starting position" );
1279
- return DirectFunctionCall3 (text_substr ,
1280
- PointerGetDatum (splitctx -> orig_str ),
1281
- Int32GetDatum (startpos + 1 ),
1282
- Int32GetDatum (endpos - startpos ));
1351
+ len = pg_wchar2mb_with_len (splitctx -> wide_str + startpos ,
1352
+ buf ,
1353
+ endpos - startpos );
1354
+ Assert (len < bufsiz );
1355
+ return PointerGetDatum (cstring_to_text_with_len (buf , len ));
1283
1356
}
1284
1357
else
1285
1358
{
1286
- /* no more matches, return rest of string */
1287
- return DirectFunctionCall2 (text_substr_no_len ,
1359
+ endpos = splitctx -> match_locs [splitctx -> next_match * 2 ];
1360
+ if (endpos < startpos )
1361
+ elog (ERROR , "invalid match starting position" );
1362
+ return DirectFunctionCall3 (text_substr ,
1288
1363
PointerGetDatum (splitctx -> orig_str ),
1289
- Int32GetDatum (startpos + 1 ));
1364
+ Int32GetDatum (startpos + 1 ),
1365
+ Int32GetDatum (endpos - startpos ));
1290
1366
}
1291
1367
}
1292
1368
0 commit comments