1
1
/*
2
- * $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.10 2008/05/17 01:28:21 adunstan Exp $
2
+ * $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.11 2008/11/12 13:43:54 teodor Exp $
3
3
*/
4
4
#include "trgm.h"
5
5
#include <ctype.h>
6
6
#include "utils/array.h"
7
7
#include "catalog/pg_type.h"
8
+ #include "tsearch/ts_locale.h"
8
9
9
10
PG_MODULE_MAGIC ;
10
11
@@ -31,9 +32,6 @@ show_limit(PG_FUNCTION_ARGS)
31
32
PG_RETURN_FLOAT4 (trgm_limit );
32
33
}
33
34
34
- #define WORDWAIT 0
35
- #define INWORD 1
36
-
37
35
static int
38
36
comp_trgm (const void * a , const void * b )
39
37
{
@@ -60,18 +58,119 @@ unique_array(trgm * a, int len)
60
58
return curend + 1 - a ;
61
59
}
62
60
61
/*
 * iswordchr(c) tests whether the character that c points to belongs to a
 * word.  Without KEEPONLYALNUM every non-space character qualifies; with
 * KEEPONLYALNUM only alphabetic characters and digits do.
 */
#ifndef KEEPONLYALNUM
#define iswordchr(c)	(!t_isspace(c))
#else
#define iswordchr(c)	(t_isalpha(c) || t_isdigit(c))
#endif
66
+
67
/*
 * Locates the first word within the first lenstr bytes of str.
 *
 * Returns a pointer to the first word character, or NULL when no word
 * character is found before lenstr bytes have been scanned.  On success
 * *endword is set to the position just after the word and *charlen to the
 * number of characters (pg_mblen steps) the word consists of.  On a NULL
 * return neither output argument is written.
 */
static char *
find_word(char *str, int lenstr, char **endword, int *charlen)
{
	int			startoff = 0;
	int			endoff;
	int			nchars = 0;

	/* step over everything that is not part of a word */
	while (startoff < lenstr && !iswordchr(str + startoff))
		startoff += pg_mblen(str + startoff);

	if (startoff >= lenstr)
		return NULL;

	/* walk to the end of the word, counting characters rather than bytes */
	for (endoff = startoff;
		 endoff < lenstr && iswordchr(str + endoff);
		 endoff += pg_mblen(str + endoff))
		nchars++;

	*endword = str + endoff;
	*charlen = nchars;

	return str + startoff;
}
92
+
93
#ifdef USE_WIDE_UPPER_LOWER
/*
 * Stores into *tptr one trigram made from the bytelen bytes at str.
 *
 * A sequence of exactly three bytes is copied as is.  Any other length is
 * first reduced to a CRC32 and the trigram is then copied from that value.
 */
static void
cnt_trigram(trgm *tptr, char *str, int bytelen)
{
	pg_crc32	hash;

	if (bytelen == 3)
	{
		CPTRGM(tptr, str);
		return;
	}

	INIT_CRC32(hash);
	COMP_CRC32(hash, str, bytelen);
	FIN_CRC32(hash);

	/*
	 * Only part of the CRC is kept: CPTRGM copies from the address of the
	 * CRC value (presumably three bytes, the size of a trigram -- see its
	 * definition).  That is hoped to be a good enough hash.
	 */
	CPTRGM(tptr, &hash);
}
#endif
117
+
118
+ /*
119
+ * Adds trigramm from words (already padded).
120
+ */
121
+ static trgm *
122
+ make_trigrams ( trgm * tptr , char * str , int bytelen , int charlen )
123
+ {
124
+ char * ptr = str ;
125
+
126
+ if ( charlen < 3 )
127
+ return tptr ;
128
+
129
+ #ifdef USE_WIDE_UPPER_LOWER
130
+ if (pg_database_encoding_max_length () > 1 )
131
+ {
132
+ int lenfirst = pg_mblen (str ),
133
+ lenmiddle = pg_mblen (str + lenfirst ),
134
+ lenlast = pg_mblen (str + lenfirst + lenmiddle );
135
+
136
+ while ( (ptr - str ) + lenfirst + lenmiddle + lenlast <= bytelen )
137
+ {
138
+ cnt_trigram (tptr , ptr , lenfirst + lenmiddle + lenlast );
139
+
140
+ ptr += lenfirst ;
141
+ tptr ++ ;
142
+
143
+ lenfirst = lenmiddle ;
144
+ lenmiddle = lenlast ;
145
+ lenlast = pg_mblen (ptr + lenfirst + lenmiddle );
146
+ }
147
+ }
148
+ else
149
+ #endif
150
+ {
151
+ Assert ( bytelen == charlen );
152
+
153
+ while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
154
+ {
155
+ CPTRGM (tptr , ptr );
156
+ ptr ++ ;
157
+ tptr ++ ;
158
+ }
159
+ }
160
+
161
+ return tptr ;
162
+ }
63
163
64
164
TRGM *
65
165
generate_trgm (char * str , int slen )
66
166
{
67
167
TRGM * trg ;
68
- char * buf ,
69
- * sptr ,
70
- * bufptr ;
168
+ char * buf ;
71
169
trgm * tptr ;
72
- int state = WORDWAIT ;
73
- int wl ,
74
- len ;
170
+ int len ,
171
+ charlen ,
172
+ bytelen ;
173
+ char * bword , * eword ;
75
174
76
175
trg = (TRGM * ) palloc (TRGMHDRSIZE + sizeof (trgm ) * (slen / 2 + 1 ) * 3 );
77
176
trg -> flag = ARRKEY ;
@@ -83,7 +182,6 @@ generate_trgm(char *str, int slen)
83
182
tptr = GETARR (trg );
84
183
85
184
buf = palloc (sizeof (char ) * (slen + 4 ));
86
- sptr = str ;
87
185
88
186
if (LPADDING > 0 )
89
187
{
@@ -92,82 +190,29 @@ generate_trgm(char *str, int slen)
92
190
* (buf + 1 ) = ' ' ;
93
191
}
94
192
95
- bufptr = buf + LPADDING ;
96
- while ( sptr - str < slen )
193
+ eword = str ;
194
+ while ( ( bword = find_word ( eword , slen - ( eword - str ), & eword , & charlen )) != NULL )
97
195
{
98
- if (state == WORDWAIT )
99
- {
100
- if (
101
- #ifdef KEEPONLYALNUM
102
- isalnum ((unsigned char ) * sptr )
103
- #else
104
- !isspace ((unsigned char ) * sptr )
105
- #endif
106
- )
107
- {
108
- * bufptr = * sptr ; /* start put word in buffer */
109
- bufptr ++ ;
110
- state = INWORD ;
111
- if (sptr - str == slen - 1 /* last char */ )
112
- goto gettrg ;
113
- }
114
- }
115
- else
116
- {
117
- if (
118
- #ifdef KEEPONLYALNUM
119
- !isalnum ((unsigned char ) * sptr )
196
+ #ifdef IGNORECASE
197
+ bword = lowerstr_with_len (bword , eword - bword );
198
+ bytelen = strlen (bword );
120
199
#else
121
- isspace (( unsigned char ) * sptr )
200
+ bytelen = eword - bword ;
122
201
#endif
123
- )
124
- {
125
- gettrg :
126
- /* word in buffer, so count trigrams */
127
- * bufptr = ' ' ;
128
- * (bufptr + 1 ) = ' ' ;
129
- wl = bufptr - (buf + LPADDING ) - 2 + LPADDING + RPADDING ;
130
- if (wl <= 0 )
131
- {
132
- bufptr = buf + LPADDING ;
133
- state = WORDWAIT ;
134
- sptr ++ ;
135
- continue ;
136
- }
202
+
203
+ memcpy (buf + LPADDING , bword , bytelen );
137
204
138
205
#ifdef IGNORECASE
139
- do
140
- { /* lower word */
141
- int wwl = bufptr - buf ;
142
-
143
- bufptr = buf + LPADDING ;
144
- while (bufptr - buf < wwl )
145
- {
146
- * bufptr = tolower ((unsigned char ) * bufptr );
147
- bufptr ++ ;
148
- }
149
- } while (0 );
206
+ pfree (bword );
150
207
#endif
151
- bufptr = buf ;
152
- /* set trigrams */
153
- while (bufptr - buf < wl )
154
- {
155
- CPTRGM (tptr , bufptr );
156
- bufptr ++ ;
157
- tptr ++ ;
158
- }
159
- bufptr = buf + LPADDING ;
160
- state = WORDWAIT ;
161
- }
162
- else
163
- {
164
- * bufptr = * sptr ; /* put in buffer */
165
- bufptr ++ ;
166
- if (sptr - str == slen - 1 )
167
- goto gettrg ;
168
- }
169
- }
170
- sptr ++ ;
208
+ buf [LPADDING + bytelen ] = ' ' ;
209
+ buf [LPADDING + bytelen + 1 ] = ' ' ;
210
+
211
+ /*
212
+ * count trigrams
213
+ */
214
+ tptr = make_trigrams ( tptr , buf , bytelen + LPADDING + RPADDING ,
215
+ charlen + LPADDING + RPADDING );
171
216
}
172
217
173
218
pfree (buf );
@@ -186,6 +231,19 @@ generate_trgm(char *str, int slen)
186
231
return trg ;
187
232
}
188
233
234
+ uint32
235
+ trgm2int (trgm * ptr )
236
+ {
237
+ uint32 val = 0 ;
238
+
239
+ val |= * ( ((unsigned char * )ptr ) );
240
+ val <<= 8 ;
241
+ val |= * ( ((unsigned char * )ptr ) + 1 );
242
+ val <<= 8 ;
243
+ val |= * ( ((unsigned char * )ptr ) + 2 );
244
+
245
+ return val ;
246
+ }
189
247
190
248
PG_FUNCTION_INFO_V1 (show_trgm );
191
249
Datum show_trgm (PG_FUNCTION_ARGS );
@@ -204,10 +262,18 @@ show_trgm(PG_FUNCTION_ARGS)
204
262
205
263
for (i = 0 , ptr = GETARR (trg ); i < ARRNELEM (trg ); i ++ , ptr ++ )
206
264
{
207
- text * item = (text * ) palloc (VARHDRSZ + 3 );
265
+ text * item = (text * ) palloc (VARHDRSZ + Max ( 12 , pg_database_encoding_max_length () * 3 ) );
208
266
209
- SET_VARSIZE (item , VARHDRSZ + 3 );
210
- CPTRGM (VARDATA (item ), ptr );
267
+ if ( pg_database_encoding_max_length () > 1 && !ISPRINTABLETRGM (ptr ) )
268
+ {
269
+ snprintf (VARDATA (item ), 12 , "0x%06x" , trgm2int (ptr ));
270
+ SET_VARSIZE (item , VARHDRSZ + strlen (VARDATA (item )));
271
+ }
272
+ else
273
+ {
274
+ SET_VARSIZE (item , VARHDRSZ + 3 );
275
+ CPTRGM (VARDATA (item ), ptr );
276
+ }
211
277
d [i ] = PointerGetDatum (item );
212
278
}
213
279
0 commit comments