35
35
#include "regex/regex.h"
36
36
#include "utils/array.h"
37
37
#include "utils/builtins.h"
38
+ #include "utils/memutils.h"
38
39
39
40
#define PG_GETARG_TEXT_PP_IF_EXISTS (_n ) \
40
41
(PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
@@ -60,6 +61,9 @@ typedef struct regexp_matches_ctx
60
61
/* workspace for build_regexp_matches_result() */
61
62
Datum * elems ; /* has npatterns elements */
62
63
bool * nulls ; /* has npatterns elements */
64
+ pg_wchar * wide_str ; /* wide-char version of original string */
65
+ char * conv_buf ; /* conversion buffer */
66
+ int conv_bufsiz ; /* size thereof */
63
67
} regexp_matches_ctx ;
64
68
65
69
/*
@@ -111,8 +115,8 @@ static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
111
115
Oid collation ,
112
116
bool force_glob ,
113
117
bool use_subpatterns ,
114
- bool ignore_degenerate );
115
- static void cleanup_regexp_matches ( regexp_matches_ctx * matchctx );
118
+ bool ignore_degenerate ,
119
+ bool fetching_unmatched );
116
120
static ArrayType * build_regexp_matches_result (regexp_matches_ctx * matchctx );
117
121
static Datum build_regexp_split_result (regexp_matches_ctx * splitctx );
118
122
@@ -863,7 +867,7 @@ regexp_matches(PG_FUNCTION_ARGS)
863
867
matchctx = setup_regexp_matches (PG_GETARG_TEXT_P_COPY (0 ), pattern ,
864
868
flags ,
865
869
PG_GET_COLLATION (),
866
- false, true, false);
870
+ false, true, false, false );
867
871
868
872
/* Pre-create workspace that build_regexp_matches_result needs */
869
873
matchctx -> elems = (Datum * ) palloc (sizeof (Datum ) * matchctx -> npatterns );
@@ -885,9 +889,6 @@ regexp_matches(PG_FUNCTION_ARGS)
885
889
SRF_RETURN_NEXT (funcctx , PointerGetDatum (result_ary ));
886
890
}
887
891
888
- /* release space in multi-call ctx to avoid intraquery memory leak */
889
- cleanup_regexp_matches (matchctx );
890
-
891
892
SRF_RETURN_DONE (funcctx );
892
893
}
893
894
@@ -906,17 +907,25 @@ regexp_matches_no_flags(PG_FUNCTION_ARGS)
906
907
* all the matching in one swoop. The returned regexp_matches_ctx contains
907
908
* the locations of all the substrings matching the pattern.
908
909
*
909
- * The three bool parameters have only two patterns (one for each caller)
910
- * but it seems clearer to distinguish the functionality this way than to
911
- * key it all off one "is_split" flag.
910
+ * The four bool parameters have only two patterns (one for matching, one for
911
+ * splitting) but it seems clearer to distinguish the functionality this way
912
+ * than to key it all off one "is_split" flag. We don't currently assume that
913
+ * fetching_unmatched is exclusive of fetching the matched text too; if it's
914
+ * set, the conversion buffer is large enough to fetch any single matched or
915
+ * unmatched string, but not any larger substring. (In practice, when splitting
916
+ * the matches are usually small anyway, and it didn't seem worth complicating
917
+ * the code further.)
912
918
*/
913
919
static regexp_matches_ctx *
914
920
setup_regexp_matches (text * orig_str , text * pattern , text * flags ,
915
921
Oid collation ,
916
- bool force_glob , bool use_subpatterns ,
917
- bool ignore_degenerate )
922
+ bool force_glob ,
923
+ bool use_subpatterns ,
924
+ bool ignore_degenerate ,
925
+ bool fetching_unmatched )
918
926
{
919
927
regexp_matches_ctx * matchctx = palloc0 (sizeof (regexp_matches_ctx ));
928
+ int eml = pg_database_encoding_max_length ();
920
929
int orig_len ;
921
930
pg_wchar * wide_str ;
922
931
int wide_len ;
@@ -928,6 +937,7 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
928
937
int array_idx ;
929
938
int prev_match_end ;
930
939
int start_search ;
940
+ int maxlen = 0 ; /* largest fetch length in characters */
931
941
932
942
/* save original string --- we'll extract result substrings from it */
933
943
matchctx -> orig_str = orig_str ;
@@ -969,8 +979,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
969
979
/* temporary output space for RE package */
970
980
pmatch = palloc (sizeof (regmatch_t ) * pmatch_len );
971
981
972
- /* the real output space (grown dynamically if needed) */
973
- array_len = re_flags .glob ? 256 : 32 ;
982
+ /*
983
+ * the real output space (grown dynamically if needed)
984
+ *
985
+ * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
986
+ * than at 2^27
987
+ */
988
+ array_len = re_flags .glob ? 255 : 31 ;
974
989
matchctx -> match_locs = (int * ) palloc (sizeof (int ) * array_len );
975
990
array_idx = 0 ;
976
991
@@ -990,9 +1005,13 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
990
1005
pmatch [0 ].rm_eo > prev_match_end ))
991
1006
{
992
1007
/* enlarge output space if needed */
993
- while (array_idx + matchctx -> npatterns * 2 > array_len )
1008
+ while (array_idx + matchctx -> npatterns * 2 + 1 > array_len )
994
1009
{
995
- array_len *= 2 ;
1010
+ array_len += array_len + 1 ; /* 2^n-1 => 2^(n+1)-1 */
1011
+ if (array_len > MaxAllocSize /sizeof (int ))
1012
+ ereport (ERROR ,
1013
+ (errcode (ERRCODE_PROGRAM_LIMIT_EXCEEDED ),
1014
+ errmsg ("too many regular expression matches" )));
996
1015
matchctx -> match_locs = (int * ) repalloc (matchctx -> match_locs ,
997
1016
sizeof (int ) * array_len );
998
1017
}
@@ -1004,16 +1023,33 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
1004
1023
1005
1024
for (i = 1 ; i <= matchctx -> npatterns ; i ++ )
1006
1025
{
1007
- matchctx -> match_locs [array_idx ++ ] = pmatch [i ].rm_so ;
1008
- matchctx -> match_locs [array_idx ++ ] = pmatch [i ].rm_eo ;
1026
+ int so = pmatch [i ].rm_so ;
1027
+ int eo = pmatch [i ].rm_eo ;
1028
+ matchctx -> match_locs [array_idx ++ ] = so ;
1029
+ matchctx -> match_locs [array_idx ++ ] = eo ;
1030
+ if (so >= 0 && eo >= 0 && (eo - so ) > maxlen )
1031
+ maxlen = (eo - so );
1009
1032
}
1010
1033
}
1011
1034
else
1012
1035
{
1013
- matchctx -> match_locs [array_idx ++ ] = pmatch [0 ].rm_so ;
1014
- matchctx -> match_locs [array_idx ++ ] = pmatch [0 ].rm_eo ;
1036
+ int so = pmatch [0 ].rm_so ;
1037
+ int eo = pmatch [0 ].rm_eo ;
1038
+ matchctx -> match_locs [array_idx ++ ] = so ;
1039
+ matchctx -> match_locs [array_idx ++ ] = eo ;
1040
+ if (so >= 0 && eo >= 0 && (eo - so ) > maxlen )
1041
+ maxlen = (eo - so );
1015
1042
}
1016
1043
matchctx -> nmatches ++ ;
1044
+
1045
+ /*
1046
+ * check length of unmatched portion between end of previous match
1047
+ * and start of current one
1048
+ */
1049
+ if (fetching_unmatched &&
1050
+ pmatch [0 ].rm_so >= 0 &&
1051
+ (pmatch [0 ].rm_so - prev_match_end ) > maxlen )
1052
+ maxlen = (pmatch [0 ].rm_so - prev_match_end );
1017
1053
}
1018
1054
prev_match_end = pmatch [0 ].rm_eo ;
1019
1055
@@ -1034,34 +1070,67 @@ setup_regexp_matches(text *orig_str, text *pattern, text *flags,
1034
1070
break ;
1035
1071
}
1036
1072
1073
+ /*
1074
+ * check length of unmatched portion between end of last match and end of
1075
+ * input string
1076
+ */
1077
+ if (fetching_unmatched &&
1078
+ (wide_len - prev_match_end ) > maxlen )
1079
+ maxlen = (wide_len - prev_match_end );
1080
+
1081
+ /*
1082
+ * Keep a note of the end position of the string for the benefit of
1083
+ * splitting code.
1084
+ */
1085
+ matchctx -> match_locs [array_idx ] = wide_len ;
1086
+
1087
+ if (eml > 1 )
1088
+ {
1089
+ int64 maxsiz = eml * (int64 ) maxlen ;
1090
+ int conv_bufsiz ;
1091
+
1092
+ /*
1093
+ * Make the conversion buffer large enough for any substring of
1094
+ * interest.
1095
+ *
1096
+ * Worst case: assume we need the maximum size (maxlen*eml), but take
1097
+ * advantage of the fact that the original string length in bytes is an
1098
+ * upper bound on the byte length of any fetched substring (and we know
1099
+ * that len+1 is safe to allocate because the varlena header is longer
1100
+ * than 1 byte).
1101
+ */
1102
+ if (maxsiz > orig_len )
1103
+ conv_bufsiz = orig_len + 1 ;
1104
+ else
1105
+ conv_bufsiz = maxsiz + 1 ; /* safe since maxsiz < 2^30 */
1106
+
1107
+ matchctx -> conv_buf = palloc (conv_bufsiz );
1108
+ matchctx -> conv_bufsiz = conv_bufsiz ;
1109
+ matchctx -> wide_str = wide_str ;
1110
+ }
1111
+ else
1112
+ {
1113
+ /* No need to keep the wide string if we're in a single-byte charset. */
1114
+ pfree (wide_str );
1115
+ matchctx -> wide_str = NULL ;
1116
+ matchctx -> conv_buf = NULL ;
1117
+ matchctx -> conv_bufsiz = 0 ;
1118
+ }
1119
+
1037
1120
/* Clean up temp storage */
1038
- pfree (wide_str );
1039
1121
pfree (pmatch );
1040
1122
1041
1123
return matchctx ;
1042
1124
}
1043
1125
1044
- /*
1045
- * cleanup_regexp_matches - release memory of a regexp_matches_ctx
1046
- */
1047
- static void
1048
- cleanup_regexp_matches (regexp_matches_ctx * matchctx )
1049
- {
1050
- pfree (matchctx -> orig_str );
1051
- pfree (matchctx -> match_locs );
1052
- if (matchctx -> elems )
1053
- pfree (matchctx -> elems );
1054
- if (matchctx -> nulls )
1055
- pfree (matchctx -> nulls );
1056
- pfree (matchctx );
1057
- }
1058
-
1059
1126
/*
1060
1127
* build_regexp_matches_result - build output array for current match
1061
1128
*/
1062
1129
static ArrayType *
1063
1130
build_regexp_matches_result (regexp_matches_ctx * matchctx )
1064
1131
{
1132
+ char * buf = matchctx -> conv_buf ;
1133
+ int bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx -> conv_bufsiz ;
1065
1134
Datum * elems = matchctx -> elems ;
1066
1135
bool * nulls = matchctx -> nulls ;
1067
1136
int dims [1 ];
@@ -1081,6 +1150,15 @@ build_regexp_matches_result(regexp_matches_ctx *matchctx)
1081
1150
elems [i ] = (Datum ) 0 ;
1082
1151
nulls [i ] = true;
1083
1152
}
1153
+ else if (buf )
1154
+ {
1155
+ int len = pg_wchar2mb_with_len (matchctx -> wide_str + so ,
1156
+ buf ,
1157
+ eo - so );
1158
+ Assert (len < bufsiz );
1159
+ elems [i ] = PointerGetDatum (cstring_to_text_with_len (buf , len ));
1160
+ nulls [i ] = false;
1161
+ }
1084
1162
else
1085
1163
{
1086
1164
elems [i ] = DirectFunctionCall3 (text_substr ,
@@ -1123,7 +1201,7 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
1123
1201
splitctx = setup_regexp_matches (PG_GETARG_TEXT_P_COPY (0 ), pattern ,
1124
1202
flags ,
1125
1203
PG_GET_COLLATION (),
1126
- true, false, true);
1204
+ true, false, true, true );
1127
1205
1128
1206
MemoryContextSwitchTo (oldcontext );
1129
1207
funcctx -> user_fctx = (void * ) splitctx ;
@@ -1140,9 +1218,6 @@ regexp_split_to_table(PG_FUNCTION_ARGS)
1140
1218
SRF_RETURN_NEXT (funcctx , result );
1141
1219
}
1142
1220
1143
- /* release space in multi-call ctx to avoid intraquery memory leak */
1144
- cleanup_regexp_matches (splitctx );
1145
-
1146
1221
SRF_RETURN_DONE (funcctx );
1147
1222
}
1148
1223
@@ -1168,7 +1243,7 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
1168
1243
PG_GETARG_TEXT_PP (1 ),
1169
1244
PG_GETARG_TEXT_PP_IF_EXISTS (2 ),
1170
1245
PG_GET_COLLATION (),
1171
- true, false, true);
1246
+ true, false, true, true );
1172
1247
1173
1248
while (splitctx -> next_match <= splitctx -> nmatches )
1174
1249
{
@@ -1180,12 +1255,6 @@ regexp_split_to_array(PG_FUNCTION_ARGS)
1180
1255
splitctx -> next_match ++ ;
1181
1256
}
1182
1257
1183
- /*
1184
- * We don't call cleanup_regexp_matches here; it would try to pfree the
1185
- * input string, which we didn't copy. The space is not in a long-lived
1186
- * memory context anyway.
1187
- */
1188
-
1189
1258
PG_RETURN_ARRAYTYPE_P (makeArrayResult (astate , CurrentMemoryContext ));
1190
1259
}
1191
1260
@@ -1205,6 +1274,7 @@ regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
1205
1274
static Datum
1206
1275
build_regexp_split_result (regexp_matches_ctx * splitctx )
1207
1276
{
1277
+ char * buf = splitctx -> conv_buf ;
1208
1278
int startpos ;
1209
1279
int endpos ;
1210
1280
@@ -1215,22 +1285,29 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
1215
1285
if (startpos < 0 )
1216
1286
elog (ERROR , "invalid match ending position" );
1217
1287
1218
- if (splitctx -> next_match < splitctx -> nmatches )
1288
+ if (buf )
1219
1289
{
1290
+ int bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx -> conv_bufsiz ;
1291
+ int len ;
1292
+
1220
1293
endpos = splitctx -> match_locs [splitctx -> next_match * 2 ];
1221
1294
if (endpos < startpos )
1222
1295
elog (ERROR , "invalid match starting position" );
1223
- return DirectFunctionCall3 (text_substr ,
1224
- PointerGetDatum (splitctx -> orig_str ),
1225
- Int32GetDatum (startpos + 1 ),
1226
- Int32GetDatum (endpos - startpos ));
1296
+ len = pg_wchar2mb_with_len (splitctx -> wide_str + startpos ,
1297
+ buf ,
1298
+ endpos - startpos );
1299
+ Assert (len < bufsiz );
1300
+ return PointerGetDatum (cstring_to_text_with_len (buf , len ));
1227
1301
}
1228
1302
else
1229
1303
{
1230
- /* no more matches, return rest of string */
1231
- return DirectFunctionCall2 (text_substr_no_len ,
1304
+ endpos = splitctx -> match_locs [splitctx -> next_match * 2 ];
1305
+ if (endpos < startpos )
1306
+ elog (ERROR , "invalid match starting position" );
1307
+ return DirectFunctionCall3 (text_substr ,
1232
1308
PointerGetDatum (splitctx -> orig_str ),
1233
- Int32GetDatum (startpos + 1 ));
1309
+ Int32GetDatum (startpos + 1 ),
1310
+ Int32GetDatum (endpos - startpos ));
1234
1311
}
1235
1312
}
1236
1313
0 commit comments