9
9
* Copyright (c) 2001-2010, PostgreSQL Global Development Group
10
10
* ALL RIGHTS RESERVED;
11
11
*
12
- * levenshtein()
13
- * -------------
14
- * Written based on a description of the algorithm by Michael Gilleland
15
- * found at http://www.merriampark.com/ld.htm
16
- * Also looked at levenshtein.c in the PHP 4.0.6 distribution for
17
- * inspiration.
18
- * Configurable penalty costs extension is introduced by Volkan
19
- * YAZICI <volkan.yazici@gmail.com>.
20
- *
21
12
* metaphone()
22
13
* -----------
23
14
* Modified for PostgreSQL by Joe Conway.
@@ -61,6 +52,8 @@ PG_MODULE_MAGIC;
61
52
*/
62
53
extern Datum levenshtein_with_costs (PG_FUNCTION_ARGS );
63
54
extern Datum levenshtein (PG_FUNCTION_ARGS );
55
+ extern Datum levenshtein_less_equal_with_costs (PG_FUNCTION_ARGS );
56
+ extern Datum levenshtein_less_equal (PG_FUNCTION_ARGS );
64
57
extern Datum metaphone (PG_FUNCTION_ARGS );
65
58
extern Datum soundex (PG_FUNCTION_ARGS );
66
59
extern Datum difference (PG_FUNCTION_ARGS );
@@ -85,16 +78,6 @@ soundex_code(char letter)
85
78
return letter ;
86
79
}
87
80
88
-
89
- /*
90
- * Levenshtein
91
- */
92
- #define MAX_LEVENSHTEIN_STRLEN 255
93
-
94
- static int levenshtein_internal (text * s , text * t ,
95
- int ins_c , int del_c , int sub_c );
96
-
97
-
98
81
/*
99
82
* Metaphone
100
83
*/
@@ -197,224 +180,59 @@ rest_of_char_same(const char *s1, const char *s2, int len)
197
180
return true;
198
181
}
199
182
200
- /*
201
- * levenshtein_internal - Calculates Levenshtein distance metric
202
- * between supplied strings. Generally
203
- * (1, 1, 1) penalty costs suffice for common
204
- * cases, but your mileage may vary.
205
- */
206
- static int
207
- levenshtein_internal (text * s , text * t ,
208
- int ins_c , int del_c , int sub_c )
209
- {
210
- int m ,
211
- n ,
212
- s_bytes ,
213
- t_bytes ;
214
- int * prev ;
215
- int * curr ;
216
- int * s_char_len = NULL ;
217
- int i ,
218
- j ;
219
- const char * s_data ;
220
- const char * t_data ;
221
- const char * y ;
222
-
223
- /* Extract a pointer to the actual character data. */
224
- s_data = VARDATA_ANY (s );
225
- t_data = VARDATA_ANY (t );
226
-
227
- /* Determine length of each string in bytes and characters. */
228
- s_bytes = VARSIZE_ANY_EXHDR (s );
229
- t_bytes = VARSIZE_ANY_EXHDR (t );
230
- m = pg_mbstrlen_with_len (s_data , s_bytes );
231
- n = pg_mbstrlen_with_len (t_data , t_bytes );
232
-
233
- /*
234
- * We can transform an empty s into t with n insertions, or a non-empty s
235
- * into an empty t with m deletions.
236
- */
237
- if (!m )
238
- return n * ins_c ;
239
- if (!n )
240
- return m * del_c ;
241
-
242
- /*
243
- * For security concerns, restrict excessive CPU+RAM usage. (This
244
- * implementation uses O(m) memory and has O(mn) complexity.)
245
- */
246
- if (m > MAX_LEVENSHTEIN_STRLEN ||
247
- n > MAX_LEVENSHTEIN_STRLEN )
248
- ereport (ERROR ,
249
- (errcode (ERRCODE_INVALID_PARAMETER_VALUE ),
250
- errmsg ("argument exceeds the maximum length of %d bytes" ,
251
- MAX_LEVENSHTEIN_STRLEN )));
252
-
253
- /*
254
- * In order to avoid calling pg_mblen() repeatedly on each character in s,
255
- * we cache all the lengths before starting the main loop -- but if all the
256
- * characters in both strings are single byte, then we skip this and use
257
- * a fast-path in the main loop. If only one string contains multi-byte
258
- * characters, we still build the array, so that the fast-path needn't
259
- * deal with the case where the array hasn't been initialized.
260
- */
261
- if (m != s_bytes || n != t_bytes )
262
- {
263
- int i ;
264
- const char * cp = s_data ;
265
-
266
- s_char_len = (int * ) palloc ((m + 1 ) * sizeof (int ));
267
- for (i = 0 ; i < m ; ++ i )
268
- {
269
- s_char_len [i ] = pg_mblen (cp );
270
- cp += s_char_len [i ];
271
- }
272
- s_char_len [i ] = 0 ;
273
- }
274
-
275
- /* One more cell for initialization column and row. */
276
- ++ m ;
277
- ++ n ;
278
-
279
- /*
280
- * One way to compute Levenshtein distance is to incrementally construct
281
- * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
282
- * of operations required to transform the first i characters of s into
283
- * the first j characters of t. The last column of the final row is the
284
- * answer.
285
- *
286
- * We use that algorithm here with some modification. In lieu of holding
287
- * the entire array in memory at once, we'll just use two arrays of size
288
- * m+1 for storing accumulated values. At each step one array represents
289
- * the "previous" row and one is the "current" row of the notional large
290
- * array.
291
- */
292
- prev = (int * ) palloc (2 * m * sizeof (int ));
293
- curr = prev + m ;
294
-
295
- /*
296
- * To transform the first i characters of s into the first 0 characters
297
- * of t, we must perform i deletions.
298
- */
299
- for (i = 0 ; i < m ; i ++ )
300
- prev [i ] = i * del_c ;
301
-
302
- /* Loop through rows of the notional array */
303
- for (y = t_data , j = 1 ; j < n ; j ++ )
304
- {
305
- int * temp ;
306
- const char * x = s_data ;
307
- int y_char_len = n != t_bytes + 1 ? pg_mblen (y ) : 1 ;
308
-
309
- /*
310
- * To transform the first 0 characters of s into the first j
311
- * characters of t, we must perform j insertions.
312
- */
313
- curr [0 ] = j * ins_c ;
314
-
315
- /*
316
- * This inner loop is critical to performance, so we include a
317
- * fast-path to handle the (fairly common) case where no multibyte
318
- * characters are in the mix. The fast-path is entitled to assume
319
- * that if s_char_len is not initialized then BOTH strings contain
320
- * only single-byte characters.
321
- */
322
- if (s_char_len != NULL )
323
- {
324
- for (i = 1 ; i < m ; i ++ )
325
- {
326
- int ins ;
327
- int del ;
328
- int sub ;
329
- int x_char_len = s_char_len [i - 1 ];
330
-
331
- /*
332
- * Calculate costs for insertion, deletion, and substitution.
333
- *
334
- * When calculating cost for substitution, we compare the last
335
- * character of each possibly-multibyte character first,
336
- * because that's enough to rule out most mis-matches. If we
337
- * get past that test, then we compare the lengths and the
338
- * remaining bytes.
339
- */
340
- ins = prev [i ] + ins_c ;
341
- del = curr [i - 1 ] + del_c ;
342
- if (x [x_char_len - 1 ] == y [y_char_len - 1 ]
343
- && x_char_len == y_char_len &&
344
- (x_char_len == 1 || rest_of_char_same (x , y , x_char_len )))
345
- sub = prev [i - 1 ];
346
- else
347
- sub = prev [i - 1 ] + sub_c ;
348
-
349
- /* Take the one with minimum cost. */
350
- curr [i ] = Min (ins , del );
351
- curr [i ] = Min (curr [i ], sub );
352
-
353
- /* Point to next character. */
354
- x += x_char_len ;
355
- }
356
- }
357
- else
358
- {
359
- for (i = 1 ; i < m ; i ++ )
360
- {
361
- int ins ;
362
- int del ;
363
- int sub ;
183
+ #include "levenshtein.c"
184
+ #define LEVENSHTEIN_LESS_EQUAL
185
+ #include "levenshtein.c"
364
186
365
- /* Calculate costs for insertion, deletion, and substitution. */
366
- ins = prev [i ] + ins_c ;
367
- del = curr [i - 1 ] + del_c ;
368
- sub = prev [i - 1 ] + ((* x == * y ) ? 0 : sub_c );
369
-
370
- /* Take the one with minimum cost. */
371
- curr [i ] = Min (ins , del );
372
- curr [i ] = Min (curr [i ], sub );
187
+ PG_FUNCTION_INFO_V1 (levenshtein_with_costs );
188
+ Datum
189
+ levenshtein_with_costs (PG_FUNCTION_ARGS )
190
+ {
191
+ text * src = PG_GETARG_TEXT_PP (0 );
192
+ text * dst = PG_GETARG_TEXT_PP (1 );
193
+ int ins_c = PG_GETARG_INT32 (2 );
194
+ int del_c = PG_GETARG_INT32 (3 );
195
+ int sub_c = PG_GETARG_INT32 (4 );
373
196
374
- /* Point to next character. */
375
- x ++ ;
376
- }
377
- }
197
+ PG_RETURN_INT32 (levenshtein_internal (src , dst , ins_c , del_c , sub_c ));
198
+ }
378
199
379
- /* Swap current row with previous row. */
380
- temp = curr ;
381
- curr = prev ;
382
- prev = temp ;
383
200
384
- /* Point to next character. */
385
- y += y_char_len ;
386
- }
201
+ PG_FUNCTION_INFO_V1 (levenshtein );
202
+ Datum
203
+ levenshtein (PG_FUNCTION_ARGS )
204
+ {
205
+ text * src = PG_GETARG_TEXT_PP (0 );
206
+ text * dst = PG_GETARG_TEXT_PP (1 );
387
207
388
- /*
389
- * Because the final value was swapped from the current row to the
390
- * previous row, that's where we'll find it.
391
- */
392
- return prev [m - 1 ];
208
+ PG_RETURN_INT32 (levenshtein_internal (src , dst , 1 , 1 , 1 ));
393
209
}
394
210
395
211
396
- PG_FUNCTION_INFO_V1 (levenshtein_with_costs );
212
+ PG_FUNCTION_INFO_V1 (levenshtein_less_equal_with_costs );
397
213
Datum
398
- levenshtein_with_costs (PG_FUNCTION_ARGS )
214
+ levenshtein_less_equal_with_costs (PG_FUNCTION_ARGS )
399
215
{
400
216
text * src = PG_GETARG_TEXT_PP (0 );
401
217
text * dst = PG_GETARG_TEXT_PP (1 );
402
218
int ins_c = PG_GETARG_INT32 (2 );
403
219
int del_c = PG_GETARG_INT32 (3 );
404
220
int sub_c = PG_GETARG_INT32 (4 );
221
+ int max_d = PG_GETARG_INT32 (5 );
405
222
406
- PG_RETURN_INT32 (levenshtein_internal (src , dst , ins_c , del_c , sub_c ));
223
+ PG_RETURN_INT32 (levenshtein_less_equal_internal (src , dst , ins_c , del_c , sub_c , max_d ));
407
224
}
408
225
409
226
410
- PG_FUNCTION_INFO_V1 (levenshtein );
227
+ PG_FUNCTION_INFO_V1 (levenshtein_less_equal );
411
228
Datum
412
- levenshtein (PG_FUNCTION_ARGS )
229
+ levenshtein_less_equal (PG_FUNCTION_ARGS )
413
230
{
414
231
text * src = PG_GETARG_TEXT_PP (0 );
415
232
text * dst = PG_GETARG_TEXT_PP (1 );
233
+ int max_d = PG_GETARG_INT32 (2 );
416
234
417
- PG_RETURN_INT32 (levenshtein_internal (src , dst , 1 , 1 , 1 ));
235
+ PG_RETURN_INT32 (levenshtein_less_equal_internal (src , dst , 1 , 1 , 1 , max_d ));
418
236
}
419
237
420
238
0 commit comments