Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 1b24887

Browse files
committed
Allow multi-character source strings in contrib/unaccent.
This could be useful in languages where diacritic signs are represented as separate characters; more generally it supports using unaccent dictionaries for substring substitutions beyond narrowly conceived "diacritic removal". In any case, since the rule-file parser doesn't complain about multi-character source strings, it behooves us to do something unsurprising with them.
1 parent 97c40ce commit 1b24887

File tree

2 files changed

+67
-32
lines changed

2 files changed

+67
-32
lines changed

contrib/unaccent/unaccent.c

+59-32
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,16 @@
2323
PG_MODULE_MAGIC;
2424

2525
/*
26-
* Unaccent dictionary uses a trie to find a character to replace. Each node of
27-
* the trie is an array of 256 TrieChar structs (n-th element of array
28-
* corresponds to byte)
26+
* An unaccent dictionary uses a trie to find a string to replace. Each node
27+
* of the trie is an array of 256 TrieChar structs; the N-th element of the
28+
* array corresponds to next byte value N. That element can contain both a
29+
* replacement string (to be used if the source string ends with this byte)
30+
* and a link to another trie node (to be followed if there are more bytes).
31+
*
32+
* Note that the trie search logic pays no attention to multibyte character
33+
* boundaries. This is OK as long as both the data entered into the trie and
34+
* the data we're trying to look up are validly encoded; no partial-character
35+
* matches will occur.
2936
*/
3037
typedef struct TrieChar
3138
{
@@ -36,34 +43,38 @@ typedef struct TrieChar
3643

3744
/*
3845
* placeChar - put str into trie's structure, byte by byte.
46+
*
47+
* If node is NULL, we need to make a new node, which will be returned;
48+
* otherwise the return value is the same as node.
3949
*/
4050
static TrieChar *
41-
placeChar(TrieChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
51+
placeChar(TrieChar *node, const unsigned char *str, int lenstr,
52+
const char *replaceTo, int replacelen)
4253
{
4354
TrieChar *curnode;
4455

4556
if (!node)
46-
{
47-
node = palloc(sizeof(TrieChar) * 256);
48-
memset(node, 0, sizeof(TrieChar) * 256);
49-
}
57+
node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);
58+
59+
Assert(lenstr > 0); /* else str[0] doesn't exist */
5060

5161
curnode = node + *str;
5262

53-
if (lenstr == 1)
63+
if (lenstr <= 1)
5464
{
5565
if (curnode->replaceTo)
56-
elog(WARNING, "duplicate TO argument, use first one");
66+
elog(WARNING, "duplicate source strings, first one will be used");
5767
else
5868
{
5969
curnode->replacelen = replacelen;
60-
curnode->replaceTo = palloc(replacelen);
70+
curnode->replaceTo = (char *) palloc(replacelen);
6171
memcpy(curnode->replaceTo, replaceTo, replacelen);
6272
}
6373
}
6474
else
6575
{
66-
curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
76+
curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1,
77+
replaceTo, replacelen);
6778
}
6879

6980
return node;
@@ -213,23 +224,35 @@ initTrie(char *filename)
213224
}
214225

215226
/*
216-
* findReplaceTo - find multibyte character in trie
227+
* findReplaceTo - find longest possible match in trie
228+
*
229+
* On success, returns pointer to ending subnode, plus length of matched
230+
* source string in *p_matchlen. On failure, returns NULL.
217231
*/
218232
static TrieChar *
219-
findReplaceTo(TrieChar *node, unsigned char *src, int srclen)
233+
findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,
234+
int *p_matchlen)
220235
{
221-
while (node)
236+
TrieChar *result = NULL;
237+
int matchlen = 0;
238+
239+
*p_matchlen = 0; /* prevent uninitialized-variable warnings */
240+
241+
while (node && matchlen < srclen)
222242
{
223-
node = node + *src;
224-
if (srclen == 1)
225-
return node;
243+
node = node + src[matchlen];
244+
matchlen++;
245+
246+
if (node->replaceTo)
247+
{
248+
result = node;
249+
*p_matchlen = matchlen;
250+
}
226251

227-
src++;
228-
srclen--;
229252
node = node->nextChar;
230253
}
231254

232-
return NULL;
255+
return result;
233256
}
234257

235258
PG_FUNCTION_INFO_V1(unaccent_init);
@@ -280,18 +303,17 @@ unaccent_lexize(PG_FUNCTION_ARGS)
280303
TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);
281304
char *srcchar = (char *) PG_GETARG_POINTER(1);
282305
int32 len = PG_GETARG_INT32(2);
283-
char *srcstart,
306+
char *srcstart = srcchar,
284307
*trgchar = NULL;
285-
int charlen;
286308
TSLexeme *res = NULL;
287-
TrieChar *node;
288309

289-
srcstart = srcchar;
290-
while (srcchar - srcstart < len)
310+
while (len > 0)
291311
{
292-
charlen = pg_mblen(srcchar);
312+
TrieChar *node;
313+
int matchlen;
293314

294-
node = findReplaceTo(rootTrie, (unsigned char *) srcchar, charlen);
315+
node = findReplaceTo(rootTrie, (unsigned char *) srcchar, len,
316+
&matchlen);
295317
if (node && node->replaceTo)
296318
{
297319
if (!res)
@@ -309,13 +331,18 @@ unaccent_lexize(PG_FUNCTION_ARGS)
309331
memcpy(trgchar, node->replaceTo, node->replacelen);
310332
trgchar += node->replacelen;
311333
}
312-
else if (res)
334+
else
313335
{
314-
memcpy(trgchar, srcchar, charlen);
315-
trgchar += charlen;
336+
matchlen = pg_mblen(srcchar);
337+
if (res)
338+
{
339+
memcpy(trgchar, srcchar, matchlen);
340+
trgchar += matchlen;
341+
}
316342
}
317343

318-
srcchar += charlen;
344+
srcchar += matchlen;
345+
len -= matchlen;
319346
}
320347

321348
if (res)

doc/src/sgml/unaccent.sgml

+8
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,14 @@
7070
</para>
7171
</listitem>
7272

73+
<listitem>
74+
<para>
75+
Actually, each <quote>character</> can be any string not containing
76+
whitespace, so <filename>unaccent</> dictionaries could be used for
77+
other sorts of substring substitutions besides diacritic removal.
78+
</para>
79+
</listitem>
80+
7381
<listitem>
7482
<para>
7583
As with other <productname>PostgreSQL</> text search configuration files,

0 commit comments

Comments
 (0)