@@ -54,7 +54,9 @@ typedef struct varlena VarString;
54
54
*/
55
55
typedef struct
56
56
{
57
+ pg_locale_t locale ; /* collation used for substring matching */
57
58
bool is_multibyte_char_in_char ; /* need to check char boundaries? */
59
+ bool greedy ; /* find longest possible substring? */
58
60
59
61
char * str1 ; /* haystack string */
60
62
char * str2 ; /* needle string */
@@ -65,7 +67,13 @@ typedef struct
65
67
int skiptablemask ; /* mask for ANDing with skiptable subscripts */
66
68
int skiptable [256 ]; /* skip distance for given mismatched char */
67
69
70
+ /*
71
+ * Note that with nondeterministic collations, the length of the last
72
+ * match is not necessarily equal to the length of the "needle" passed in.
73
+ */
68
74
char * last_match ; /* pointer to last match in 'str1' */
75
+ int last_match_len ; /* length of last match */
76
+ int last_match_len_tmp ; /* same but for internal use */
69
77
70
78
/*
71
79
* Sometimes we need to convert the byte position of a match to a
@@ -1178,15 +1186,21 @@ text_position(text *t1, text *t2, Oid collid)
1178
1186
TextPositionState state ;
1179
1187
int result ;
1180
1188
1189
+ check_collation_set (collid );
1190
+
1181
1191
/* Empty needle always matches at position 1 */
1182
1192
if (VARSIZE_ANY_EXHDR (t2 ) < 1 )
1183
1193
return 1 ;
1184
1194
1185
1195
/* Otherwise, can't match if haystack is shorter than needle */
1186
- if (VARSIZE_ANY_EXHDR (t1 ) < VARSIZE_ANY_EXHDR (t2 ))
1196
+ if (VARSIZE_ANY_EXHDR (t1 ) < VARSIZE_ANY_EXHDR (t2 ) &&
1197
+ pg_newlocale_from_collation (collid )-> deterministic )
1187
1198
return 0 ;
1188
1199
1189
1200
text_position_setup (t1 , t2 , collid , & state );
1201
+ /* don't need greedy mode here */
1202
+ state .greedy = false;
1203
+
1190
1204
if (!text_position_next (& state ))
1191
1205
result = 0 ;
1192
1206
else
@@ -1217,18 +1231,17 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1217
1231
{
1218
1232
int len1 = VARSIZE_ANY_EXHDR (t1 );
1219
1233
int len2 = VARSIZE_ANY_EXHDR (t2 );
1220
- pg_locale_t mylocale ;
1221
1234
1222
1235
check_collation_set (collid );
1223
1236
1224
- mylocale = pg_newlocale_from_collation (collid );
1237
+ state -> locale = pg_newlocale_from_collation (collid );
1225
1238
1226
- if (!mylocale -> deterministic )
1227
- ereport (ERROR ,
1228
- (errcode (ERRCODE_FEATURE_NOT_SUPPORTED ),
1229
- errmsg ("nondeterministic collations are not supported for substring searches" )));
1239
+ /*
1240
+ * Most callers need greedy mode, but some might want to unset this to
1241
+ * optimize.
1242
+ */
1243
+ state -> greedy = true;
1230
1244
1231
- Assert (len1 > 0 );
1232
1245
Assert (len2 > 0 );
1233
1246
1234
1247
/*
@@ -1264,8 +1277,11 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
1264
1277
* point in wasting cycles initializing the table. We also choose not to
1265
1278
* use B-M-H for needles of length 1, since the skip table can't possibly
1266
1279
* save anything in that case.
1280
+ *
1281
+ * (With nondeterministic collations, the search is already
1282
+ * multibyte-aware, so we don't need this.)
1267
1283
*/
1268
- if (len1 >= len2 && len2 > 1 )
1284
+ if (len1 >= len2 && len2 > 1 && state -> locale -> deterministic )
1269
1285
{
1270
1286
int searchlength = len1 - len2 ;
1271
1287
int skiptablemask ;
@@ -1343,7 +1359,7 @@ text_position_next(TextPositionState *state)
1343
1359
1344
1360
/* Start from the point right after the previous match. */
1345
1361
if (state -> last_match )
1346
- start_ptr = state -> last_match + needle_len ;
1362
+ start_ptr = state -> last_match + state -> last_match_len ;
1347
1363
else
1348
1364
start_ptr = state -> str1 ;
1349
1365
@@ -1359,7 +1375,7 @@ text_position_next(TextPositionState *state)
1359
1375
* multi-byte character, we need to verify that the match was at a
1360
1376
* character boundary, not in the middle of a multi-byte character.
1361
1377
*/
1362
- if (state -> is_multibyte_char_in_char )
1378
+ if (state -> is_multibyte_char_in_char && state -> locale -> deterministic )
1363
1379
{
1364
1380
/* Walk one character at a time, until we reach the match. */
1365
1381
@@ -1387,6 +1403,7 @@ text_position_next(TextPositionState *state)
1387
1403
}
1388
1404
1389
1405
state -> last_match = matchptr ;
1406
+ state -> last_match_len = state -> last_match_len_tmp ;
1390
1407
return true;
1391
1408
}
1392
1409
@@ -1408,7 +1425,62 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
1408
1425
1409
1426
Assert (start_ptr >= haystack && start_ptr <= haystack_end );
1410
1427
1411
- if (needle_len == 1 )
1428
+ state -> last_match_len_tmp = needle_len ;
1429
+
1430
+ if (!state -> locale -> deterministic )
1431
+ {
1432
+ /*
1433
+ * With a nondeterministic collation, we have to use an unoptimized
1434
+ * route. We walk through the haystack and see if at each position
1435
+ * there is a substring of the remaining string that is equal to the
1436
+ * needle under the given collation.
1437
+ *
1438
+ * Note, the found substring could have a different length than the
1439
+ * needle, including being empty. Callers that want to skip over the
1440
+ * found string need to read the length of the found substring from
1441
+ * last_match_len rather than just using the length of their needle.
1442
+ *
1443
+ * Most callers will require "greedy" semantics, meaning that we need
1444
+ * to find the longest such substring, not the shortest. For callers
1445
+ * that don't need greedy semantics, we can finish on the first match.
1446
+ */
1447
+ const char * result_hptr = NULL ;
1448
+
1449
+ hptr = start_ptr ;
1450
+ while (hptr < haystack_end )
1451
+ {
1452
+ /*
1453
+ * In non-greedy mode, first check the common case that there is a
1454
+ * match in the haystack of exactly the length of the needle.
1455
+ */
1456
+ if (!state -> greedy &&
1457
+ haystack_end - hptr >= needle_len &&
1458
+ pg_strncoll (hptr , needle_len , needle , needle_len , state -> locale ) == 0 )
1459
+ return (char * ) hptr ;
1460
+
1461
+ /*
1462
+ * Else check if any of the possible substrings starting at hptr
1463
+ * are equal to the needle.
1464
+ */
1465
+ for (const char * test_end = hptr ; test_end < haystack_end ; test_end += pg_mblen (test_end ))
1466
+ {
1467
+ if (pg_strncoll (hptr , (test_end - hptr ), needle , needle_len , state -> locale ) == 0 )
1468
+ {
1469
+ state -> last_match_len_tmp = (test_end - hptr );
1470
+ result_hptr = hptr ;
1471
+ if (!state -> greedy )
1472
+ break ;
1473
+ }
1474
+ }
1475
+ if (result_hptr )
1476
+ break ;
1477
+
1478
+ hptr += pg_mblen (hptr );
1479
+ }
1480
+
1481
+ return (char * ) result_hptr ;
1482
+ }
1483
+ else if (needle_len == 1 )
1412
1484
{
1413
1485
/* No point in using B-M-H for a one-character needle */
1414
1486
char nchar = * needle ;
@@ -4055,7 +4127,7 @@ replace_text(PG_FUNCTION_ARGS)
4055
4127
4056
4128
appendStringInfoText (& str , to_sub_text );
4057
4129
4058
- start_ptr = curr_ptr + from_sub_text_len ;
4130
+ start_ptr = curr_ptr + state . last_match_len ;
4059
4131
4060
4132
found = text_position_next (& state );
4061
4133
if (found )
@@ -4445,7 +4517,7 @@ split_part(PG_FUNCTION_ARGS)
4445
4517
/* special case of last field does not require an extra pass */
4446
4518
if (fldnum == -1 )
4447
4519
{
4448
- start_ptr = text_position_get_match_ptr (& state ) + fldsep_len ;
4520
+ start_ptr = text_position_get_match_ptr (& state ) + state . last_match_len ;
4449
4521
end_ptr = VARDATA_ANY (inputstring ) + inputstring_len ;
4450
4522
text_position_cleanup (& state );
4451
4523
PG_RETURN_TEXT_P (cstring_to_text_with_len (start_ptr ,
@@ -4475,7 +4547,7 @@ split_part(PG_FUNCTION_ARGS)
4475
4547
while (found && -- fldnum > 0 )
4476
4548
{
4477
4549
/* identify bounds of next field */
4478
- start_ptr = end_ptr + fldsep_len ;
4550
+ start_ptr = end_ptr + state . last_match_len ;
4479
4551
found = text_position_next (& state );
4480
4552
if (found )
4481
4553
end_ptr = text_position_get_match_ptr (& state );
@@ -4691,7 +4763,7 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
4691
4763
if (!found )
4692
4764
break ;
4693
4765
4694
- start_ptr = end_ptr + fldsep_len ;
4766
+ start_ptr = end_ptr + state . last_match_len ;
4695
4767
}
4696
4768
4697
4769
text_position_cleanup (& state );
0 commit comments