23
23
PG_MODULE_MAGIC ;
24
24
25
25
/*
26
- * Unaccent dictionary uses a trie to find a character to replace. Each node of
27
- * the trie is an array of 256 TrieChar structs (n-th element of array
28
- * corresponds to byte)
26
+ * An unaccent dictionary uses a trie to find a string to replace. Each node
27
+ * of the trie is an array of 256 TrieChar structs; the N-th element of the
28
+ * array corresponds to next byte value N. That element can contain both a
29
+ * replacement string (to be used if the source string ends with this byte)
30
+ * and a link to another trie node (to be followed if there are more bytes).
31
+ *
32
+ * Note that the trie search logic pays no attention to multibyte character
33
+ * boundaries. This is OK as long as both the data entered into the trie and
34
+ * the data we're trying to look up are validly encoded; no partial-character
35
+ * matches will occur.
29
36
*/
30
37
typedef struct TrieChar
31
38
{
@@ -36,34 +43,38 @@ typedef struct TrieChar
36
43
37
44
/*
38
45
* placeChar - put str into trie's structure, byte by byte.
46
+ *
47
+ * If node is NULL, we need to make a new node, which will be returned;
48
+ * otherwise the return value is the same as node.
39
49
*/
40
50
static TrieChar *
41
- placeChar (TrieChar * node , unsigned char * str , int lenstr , char * replaceTo , int replacelen )
51
+ placeChar (TrieChar * node , const unsigned char * str , int lenstr ,
52
+ const char * replaceTo , int replacelen )
42
53
{
43
54
TrieChar * curnode ;
44
55
45
56
if (!node )
46
- {
47
- node = palloc (sizeof (TrieChar ) * 256 );
48
- memset (node , 0 , sizeof (TrieChar ) * 256 );
49
- }
57
+ node = (TrieChar * ) palloc0 (sizeof (TrieChar ) * 256 );
58
+
59
+ Assert (lenstr > 0 ); /* else str[0] doesn't exist */
50
60
51
61
curnode = node + * str ;
52
62
53
- if (lenstr == 1 )
63
+ if (lenstr <= 1 )
54
64
{
55
65
if (curnode -> replaceTo )
56
- elog (WARNING , "duplicate TO argument, use first one" );
66
+ elog (WARNING , "duplicate source strings, first one will be used " );
57
67
else
58
68
{
59
69
curnode -> replacelen = replacelen ;
60
- curnode -> replaceTo = palloc (replacelen );
70
+ curnode -> replaceTo = ( char * ) palloc (replacelen );
61
71
memcpy (curnode -> replaceTo , replaceTo , replacelen );
62
72
}
63
73
}
64
74
else
65
75
{
66
- curnode -> nextChar = placeChar (curnode -> nextChar , str + 1 , lenstr - 1 , replaceTo , replacelen );
76
+ curnode -> nextChar = placeChar (curnode -> nextChar , str + 1 , lenstr - 1 ,
77
+ replaceTo , replacelen );
67
78
}
68
79
69
80
return node ;
@@ -213,23 +224,35 @@ initTrie(char *filename)
213
224
}
214
225
215
226
/*
216
- * findReplaceTo - find multibyte character in trie
227
+ * findReplaceTo - find longest possible match in trie
228
+ *
229
+ * On success, returns pointer to ending subnode, plus length of matched
230
+ * source string in *p_matchlen. On failure, returns NULL.
217
231
*/
218
232
static TrieChar *
219
- findReplaceTo (TrieChar * node , unsigned char * src , int srclen )
233
+ findReplaceTo (TrieChar * node , const unsigned char * src , int srclen ,
234
+ int * p_matchlen )
220
235
{
221
- while (node )
236
+ TrieChar * result = NULL ;
237
+ int matchlen = 0 ;
238
+
239
+ * p_matchlen = 0 ; /* prevent uninitialized-variable warnings */
240
+
241
+ while (node && matchlen < srclen )
222
242
{
223
- node = node + * src ;
224
- if (srclen == 1 )
225
- return node ;
243
+ node = node + src [matchlen ];
244
+ matchlen ++ ;
245
+
246
+ if (node -> replaceTo )
247
+ {
248
+ result = node ;
249
+ * p_matchlen = matchlen ;
250
+ }
226
251
227
- src ++ ;
228
- srclen -- ;
229
252
node = node -> nextChar ;
230
253
}
231
254
232
- return NULL ;
255
+ return result ;
233
256
}
234
257
235
258
PG_FUNCTION_INFO_V1 (unaccent_init );
@@ -280,18 +303,17 @@ unaccent_lexize(PG_FUNCTION_ARGS)
280
303
TrieChar * rootTrie = (TrieChar * ) PG_GETARG_POINTER (0 );
281
304
char * srcchar = (char * ) PG_GETARG_POINTER (1 );
282
305
int32 len = PG_GETARG_INT32 (2 );
283
- char * srcstart ,
306
+ char * srcstart = srcchar ,
284
307
* trgchar = NULL ;
285
- int charlen ;
286
308
TSLexeme * res = NULL ;
287
- TrieChar * node ;
288
309
289
- srcstart = srcchar ;
290
- while (srcchar - srcstart < len )
310
+ while (len > 0 )
291
311
{
292
- charlen = pg_mblen (srcchar );
312
+ TrieChar * node ;
313
+ int matchlen ;
293
314
294
- node = findReplaceTo (rootTrie , (unsigned char * ) srcchar , charlen );
315
+ node = findReplaceTo (rootTrie , (unsigned char * ) srcchar , len ,
316
+ & matchlen );
295
317
if (node && node -> replaceTo )
296
318
{
297
319
if (!res )
@@ -309,13 +331,18 @@ unaccent_lexize(PG_FUNCTION_ARGS)
309
331
memcpy (trgchar , node -> replaceTo , node -> replacelen );
310
332
trgchar += node -> replacelen ;
311
333
}
312
- else if ( res )
334
+ else
313
335
{
314
- memcpy (trgchar , srcchar , charlen );
315
- trgchar += charlen ;
336
+ matchlen = pg_mblen (srcchar );
337
+ if (res )
338
+ {
339
+ memcpy (trgchar , srcchar , matchlen );
340
+ trgchar += matchlen ;
341
+ }
316
342
}
317
343
318
- srcchar += charlen ;
344
+ srcchar += matchlen ;
345
+ len -= matchlen ;
319
346
}
320
347
321
348
if (res )
0 commit comments