Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 329304c

Browse files
committed
Support text position search functions with nondeterministic collations
This allows using text position search functions with nondeterministic
collations.  These functions are

- position, strpos
- replace
- split_part
- string_to_array
- string_to_table

which all use common internal infrastructure.

There was previously no internal implementation of this, so it was met
with a not-supported error.  This adds the internal implementation and
removes the error.

Unlike with deterministic collations, the search cannot use any
byte-by-byte optimized techniques but has to go substring by
substring.  We also need to consider that the found match could have a
different length than the needle and that there could be substrings of
different length matching at a position.  In most cases, we need to
find the longest such substring (greedy semantics), but this can be
configured by each caller.

Reviewed-by: Euler Taveira <euler@eulerto.com>
Discussion: https://www.postgresql.org/message-id/flat/582b2613-0900-48ca-8b0d-340c06f4d400@eisentraut.org
1 parent 41336bf commit 329304c

File tree

3 files changed

+246
-48
lines changed

3 files changed

+246
-48
lines changed

src/backend/utils/adt/varlena.c

Lines changed: 88 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,9 @@ typedef struct varlena VarString;
5454
*/
5555
typedef struct
5656
{
57+
pg_locale_t locale; /* collation used for substring matching */
5758
bool is_multibyte_char_in_char; /* need to check char boundaries? */
59+
bool greedy; /* find longest possible substring? */
5860

5961
char *str1; /* haystack string */
6062
char *str2; /* needle string */
@@ -65,7 +67,13 @@ typedef struct
6567
int skiptablemask; /* mask for ANDing with skiptable subscripts */
6668
int skiptable[256]; /* skip distance for given mismatched char */
6769

70+
/*
71+
* Note that with nondeterministic collations, the length of the last
72+
* match is not necessarily equal to the length of the "needle" passed in.
73+
*/
6874
char *last_match; /* pointer to last match in 'str1' */
75+
int last_match_len; /* length of last match */
76+
int last_match_len_tmp; /* same but for internal use */
6977

7078
/*
7179
* Sometimes we need to convert the byte position of a match to a
@@ -1178,15 +1186,21 @@ text_position(text *t1, text *t2, Oid collid)
11781186
TextPositionState state;
11791187
int result;
11801188

1189+
check_collation_set(collid);
1190+
11811191
/* Empty needle always matches at position 1 */
11821192
if (VARSIZE_ANY_EXHDR(t2) < 1)
11831193
return 1;
11841194

11851195
/* Otherwise, can't match if haystack is shorter than needle */
1186-
if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2))
1196+
if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2) &&
1197+
pg_newlocale_from_collation(collid)->deterministic)
11871198
return 0;
11881199

11891200
text_position_setup(t1, t2, collid, &state);
1201+
/* don't need greedy mode here */
1202+
state.greedy = false;
1203+
11901204
if (!text_position_next(&state))
11911205
result = 0;
11921206
else
@@ -1217,18 +1231,17 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
12171231
{
12181232
int len1 = VARSIZE_ANY_EXHDR(t1);
12191233
int len2 = VARSIZE_ANY_EXHDR(t2);
1220-
pg_locale_t mylocale;
12211234

12221235
check_collation_set(collid);
12231236

1224-
mylocale = pg_newlocale_from_collation(collid);
1237+
state->locale = pg_newlocale_from_collation(collid);
12251238

1226-
if (!mylocale->deterministic)
1227-
ereport(ERROR,
1228-
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1229-
errmsg("nondeterministic collations are not supported for substring searches")));
1239+
/*
1240+
* Most callers need greedy mode, but some might want to unset this to
1241+
* optimize.
1242+
*/
1243+
state->greedy = true;
12301244

1231-
Assert(len1 > 0);
12321245
Assert(len2 > 0);
12331246

12341247
/*
@@ -1264,8 +1277,11 @@ text_position_setup(text *t1, text *t2, Oid collid, TextPositionState *state)
12641277
* point in wasting cycles initializing the table. We also choose not to
12651278
* use B-M-H for needles of length 1, since the skip table can't possibly
12661279
* save anything in that case.
1280+
*
1281+
* (With nondeterministic collations, the search is already
1282+
* multibyte-aware, so we don't need this.)
12671283
*/
1268-
if (len1 >= len2 && len2 > 1)
1284+
if (len1 >= len2 && len2 > 1 && state->locale->deterministic)
12691285
{
12701286
int searchlength = len1 - len2;
12711287
int skiptablemask;
@@ -1343,7 +1359,7 @@ text_position_next(TextPositionState *state)
13431359

13441360
/* Start from the point right after the previous match. */
13451361
if (state->last_match)
1346-
start_ptr = state->last_match + needle_len;
1362+
start_ptr = state->last_match + state->last_match_len;
13471363
else
13481364
start_ptr = state->str1;
13491365

@@ -1359,7 +1375,7 @@ text_position_next(TextPositionState *state)
13591375
* multi-byte character, we need to verify that the match was at a
13601376
* character boundary, not in the middle of a multi-byte character.
13611377
*/
1362-
if (state->is_multibyte_char_in_char)
1378+
if (state->is_multibyte_char_in_char && state->locale->deterministic)
13631379
{
13641380
/* Walk one character at a time, until we reach the match. */
13651381

@@ -1387,6 +1403,7 @@ text_position_next(TextPositionState *state)
13871403
}
13881404

13891405
state->last_match = matchptr;
1406+
state->last_match_len = state->last_match_len_tmp;
13901407
return true;
13911408
}
13921409

@@ -1408,7 +1425,62 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
14081425

14091426
Assert(start_ptr >= haystack && start_ptr <= haystack_end);
14101427

1411-
if (needle_len == 1)
1428+
state->last_match_len_tmp = needle_len;
1429+
1430+
if (!state->locale->deterministic)
1431+
{
1432+
/*
1433+
* With a nondeterministic collation, we have to use an unoptimized
1434+
* route. We walk through the haystack and see if at each position
1435+
* there is a substring of the remaining string that is equal to the
1436+
* needle under the given collation.
1437+
*
1438+
* Note, the found substring could have a different length than the
1439+
* needle, including being empty. Callers that want to skip over the
1440+
* found string need to read the length of the found substring from
1441+
* last_match_len rather than just using the length of their needle.
1442+
*
1443+
* Most callers will require "greedy" semantics, meaning that we need
1444+
* to find the longest such substring, not the shortest. For callers
1445+
* that don't need greedy semantics, we can finish on the first match.
1446+
*/
1447+
const char *result_hptr = NULL;
1448+
1449+
hptr = start_ptr;
1450+
while (hptr < haystack_end)
1451+
{
1452+
/*
1453+
* First check the common case that there is a match in the
1454+
* haystack of exactly the length of the needle.
1455+
*/
1456+
if (!state->greedy &&
1457+
haystack_end - hptr >= needle_len &&
1458+
pg_strncoll(hptr, needle_len, needle, needle_len, state->locale) == 0)
1459+
return (char *) hptr;
1460+
1461+
/*
1462+
* Else check if any of the possible substrings starting at hptr
1463+
* are equal to the needle.
1464+
*/
1465+
for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end))
1466+
{
1467+
if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
1468+
{
1469+
state->last_match_len_tmp = (test_end - hptr);
1470+
result_hptr = hptr;
1471+
if (!state->greedy)
1472+
break;
1473+
}
1474+
}
1475+
if (result_hptr)
1476+
break;
1477+
1478+
hptr += pg_mblen(hptr);
1479+
}
1480+
1481+
return (char *) result_hptr;
1482+
}
1483+
else if (needle_len == 1)
14121484
{
14131485
/* No point in using B-M-H for a one-character needle */
14141486
char nchar = *needle;
@@ -4055,7 +4127,7 @@ replace_text(PG_FUNCTION_ARGS)
40554127

40564128
appendStringInfoText(&str, to_sub_text);
40574129

4058-
start_ptr = curr_ptr + from_sub_text_len;
4130+
start_ptr = curr_ptr + state.last_match_len;
40594131

40604132
found = text_position_next(&state);
40614133
if (found)
@@ -4445,7 +4517,7 @@ split_part(PG_FUNCTION_ARGS)
44454517
/* special case of last field does not require an extra pass */
44464518
if (fldnum == -1)
44474519
{
4448-
start_ptr = text_position_get_match_ptr(&state) + fldsep_len;
4520+
start_ptr = text_position_get_match_ptr(&state) + state.last_match_len;
44494521
end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
44504522
text_position_cleanup(&state);
44514523
PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
@@ -4475,7 +4547,7 @@ split_part(PG_FUNCTION_ARGS)
44754547
while (found && --fldnum > 0)
44764548
{
44774549
/* identify bounds of next field */
4478-
start_ptr = end_ptr + fldsep_len;
4550+
start_ptr = end_ptr + state.last_match_len;
44794551
found = text_position_next(&state);
44804552
if (found)
44814553
end_ptr = text_position_get_match_ptr(&state);
@@ -4691,7 +4763,7 @@ split_text(FunctionCallInfo fcinfo, SplitTextOutputData *tstate)
46914763
if (!found)
46924764
break;
46934765

4694-
start_ptr = end_ptr + fldsep_len;
4766+
start_ptr = end_ptr + state.last_match_len;
46954767
}
46964768

46974769
text_position_cleanup(&state);

0 commit comments

Comments
 (0)