Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit b87b52b

Browse files
committed
Support of multibyte encoding for pg_trgm
1 parent e4ffd14 commit b87b52b

File tree

3 files changed

+161
-88
lines changed

3 files changed

+161
-88
lines changed

contrib/pg_trgm/trgm.h

+9-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm.h,v 1.9 2008/05/17 01:28:21 adunstan Exp $
2+
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm.h,v 1.10 2008/11/12 13:43:54 teodor Exp $
33
*/
44
#ifndef __TRGM_H__
55
#define __TRGM_H__
@@ -31,7 +31,14 @@ typedef char trgm[3];
3131
*(((char*)(a))+2) = *(((char*)(b))+2); \
3232
} while(0);
3333

34-
#define TRGMINT(a) ( (*(((char*)(a))+2)<<16)+(*(((char*)(a))+1)<<8)+*(((char*)(a))+0) )
34+
uint32 trgm2int(trgm *ptr);
35+
36+
/*
 * ISPRINTABLECHAR(a): true when the byte at address a is a 7-bit ASCII
 * character that can be shown as-is.  With KEEPONLYALNUM only alphanumerics
 * and the space character qualify; otherwise any printable ASCII character
 * does.
 *
 * The ASCII range check is spelled as an explicit comparison rather than
 * isascii(), which is not part of ISO C; for an unsigned char value the two
 * are equivalent.  The argument is evaluated more than once, so it must not
 * have side effects.
 */
#ifdef KEEPONLYALNUM
#define ISPRINTABLECHAR(a)	( *(unsigned char *) (a) < 0x80 && (isalnum(*(unsigned char *) (a)) || *(unsigned char *) (a) == ' ') )
#else
#define ISPRINTABLECHAR(a)	( *(unsigned char *) (a) < 0x80 && isprint(*(unsigned char *) (a)) )
#endif

/*
 * ISPRINTABLETRGM(t): true when each of the three bytes starting at t passes
 * ISPRINTABLECHAR.  The argument is parenthesized before the cast so that a
 * pointer expression such as "p + 1" is offset in units of its own type and
 * only then reinterpreted as a byte pointer.
 */
#define ISPRINTABLETRGM(t)	( ISPRINTABLECHAR((char *) (t)) && ISPRINTABLECHAR((char *) (t) + 1) && ISPRINTABLECHAR((char *) (t) + 2) )
3542

3643
typedef struct
3744
{

contrib/pg_trgm/trgm_gin.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_gin.c,v 1.5 2008/07/11 11:56:48 teodor Exp $
2+
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_gin.c,v 1.6 2008/11/12 13:43:54 teodor Exp $
33
*/
44
#include "trgm.h"
55

@@ -42,7 +42,7 @@ gin_extract_trgm(PG_FUNCTION_ARGS)
4242
ptr = GETARR(trg);
4343
while (ptr - GETARR(trg) < ARRNELEM(trg))
4444
{
45-
item = TRGMINT(ptr);
45+
item = trgm2int(ptr);
4646
entries[i++] = Int32GetDatum(item);
4747

4848
ptr++;

contrib/pg_trgm/trgm_op.c

+150-84
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
/*
2-
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.10 2008/05/17 01:28:21 adunstan Exp $
2+
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.11 2008/11/12 13:43:54 teodor Exp $
33
*/
44
#include "trgm.h"
55
#include <ctype.h>
66
#include "utils/array.h"
77
#include "catalog/pg_type.h"
8+
#include "tsearch/ts_locale.h"
89

910
PG_MODULE_MAGIC;
1011

@@ -31,9 +32,6 @@ show_limit(PG_FUNCTION_ARGS)
3132
PG_RETURN_FLOAT4(trgm_limit);
3233
}
3334

34-
#define WORDWAIT 0
35-
#define INWORD 1
36-
3735
static int
3836
comp_trgm(const void *a, const void *b)
3937
{
@@ -60,18 +58,119 @@ unique_array(trgm * a, int len)
6058
return curend + 1 - a;
6159
}
6260

61+
/*
 * iswordchr(c): word-membership test applied to the character that the
 * pointer c addresses (find_word passes its scan pointer).  With
 * KEEPONLYALNUM a character belongs to a word when t_isalpha() or t_isdigit()
 * accepts it; otherwise every character that t_isspace() rejects does.
 * NOTE(review): the t_is*() helpers are presumably the multibyte-aware
 * classifiers from tsearch/ts_locale.h -- confirm.
 */
#ifdef KEEPONLYALNUM
62+
#define iswordchr(c) (t_isalpha(c) || t_isdigit(c))
63+
#else
64+
#define iswordchr(c) (!t_isspace(c))
65+
#endif
66+
67+
/*
68+
* Finds first word in string, returns pointer to the word,
69+
* endword points to the character after word
70+
*/
71+
static char*
72+
find_word(char *str, int lenstr, char **endword, int *charlen)
73+
{
74+
char *beginword = str;
75+
76+
while( beginword - str < lenstr && !iswordchr(beginword) )
77+
beginword += pg_mblen(beginword);
78+
79+
if (beginword - str >= lenstr)
80+
return NULL;
81+
82+
*endword = beginword;
83+
*charlen = 0;
84+
while( *endword - str < lenstr && iswordchr(*endword) )
85+
{
86+
*endword += pg_mblen(*endword);
87+
(*charlen)++;
88+
}
89+
90+
return beginword;
91+
}
92+
93+
#ifdef USE_WIDE_UPPER_LOWER
/*
 * Store the trigram made of the bytelen bytes at str into *tptr.
 *
 * A trigram that is exactly three bytes long is copied verbatim.  Any other
 * length cannot fit into a trgm directly, so the bytes are run through CRC32
 * and the result is stored in their place.
 */
static void
cnt_trigram(trgm *tptr, char *str, int bytelen)
{
	pg_crc32	hash;

	if (bytelen == 3)
	{
		CPTRGM(tptr, str);
		return;
	}

	INIT_CRC32(hash);
	COMP_CRC32(hash, str, bytelen);
	FIN_CRC32(hash);

	/*
	 * Only three bytes of the CRC are kept: the ones CPTRGM reads starting at
	 * the address of the CRC variable.  Which bytes of the numeric value
	 * those are depends on the host byte order.  The hope is that this is
	 * still a good enough hash.
	 */
	{
		CPTRGM(tptr, &hash);
	}
}
#endif
117+
118+
/*
119+
* Adds trigramm from words (already padded).
120+
*/
121+
static trgm*
122+
make_trigrams( trgm *tptr, char *str, int bytelen, int charlen )
123+
{
124+
char *ptr = str;
125+
126+
if ( charlen < 3 )
127+
return tptr;
128+
129+
#ifdef USE_WIDE_UPPER_LOWER
130+
if (pg_database_encoding_max_length() > 1)
131+
{
132+
int lenfirst = pg_mblen(str),
133+
lenmiddle = pg_mblen(str + lenfirst),
134+
lenlast = pg_mblen(str + lenfirst + lenmiddle);
135+
136+
while( (ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen )
137+
{
138+
cnt_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
139+
140+
ptr += lenfirst;
141+
tptr++;
142+
143+
lenfirst = lenmiddle;
144+
lenmiddle = lenlast;
145+
lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
146+
}
147+
}
148+
else
149+
#endif
150+
{
151+
Assert( bytelen == charlen );
152+
153+
while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
154+
{
155+
CPTRGM(tptr, ptr);
156+
ptr++;
157+
tptr++;
158+
}
159+
}
160+
161+
return tptr;
162+
}
63163

64164
TRGM *
65165
generate_trgm(char *str, int slen)
66166
{
67167
TRGM *trg;
68-
char *buf,
69-
*sptr,
70-
*bufptr;
168+
char *buf;
71169
trgm *tptr;
72-
int state = WORDWAIT;
73-
int wl,
74-
len;
170+
int len,
171+
charlen,
172+
bytelen;
173+
char *bword, *eword;
75174

76175
trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
77176
trg->flag = ARRKEY;
@@ -83,7 +182,6 @@ generate_trgm(char *str, int slen)
83182
tptr = GETARR(trg);
84183

85184
buf = palloc(sizeof(char) * (slen + 4));
86-
sptr = str;
87185

88186
if (LPADDING > 0)
89187
{
@@ -92,82 +190,29 @@ generate_trgm(char *str, int slen)
92190
*(buf + 1) = ' ';
93191
}
94192

95-
bufptr = buf + LPADDING;
96-
while (sptr - str < slen)
193+
eword = str;
194+
while( (bword=find_word(eword, slen - (eword-str), &eword, &charlen)) != NULL )
97195
{
98-
if (state == WORDWAIT)
99-
{
100-
if (
101-
#ifdef KEEPONLYALNUM
102-
isalnum((unsigned char) *sptr)
103-
#else
104-
!isspace((unsigned char) *sptr)
105-
#endif
106-
)
107-
{
108-
*bufptr = *sptr; /* start put word in buffer */
109-
bufptr++;
110-
state = INWORD;
111-
if (sptr - str == slen - 1 /* last char */ )
112-
goto gettrg;
113-
}
114-
}
115-
else
116-
{
117-
if (
118-
#ifdef KEEPONLYALNUM
119-
!isalnum((unsigned char) *sptr)
196+
#ifdef IGNORECASE
197+
bword = lowerstr_with_len(bword, eword - bword);
198+
bytelen = strlen(bword);
120199
#else
121-
isspace((unsigned char) *sptr)
200+
bytelen = eword - bword;
122201
#endif
123-
)
124-
{
125-
gettrg:
126-
/* word in buffer, so count trigrams */
127-
*bufptr = ' ';
128-
*(bufptr + 1) = ' ';
129-
wl = bufptr - (buf + LPADDING) - 2 + LPADDING + RPADDING;
130-
if (wl <= 0)
131-
{
132-
bufptr = buf + LPADDING;
133-
state = WORDWAIT;
134-
sptr++;
135-
continue;
136-
}
202+
203+
memcpy(buf + LPADDING, bword, bytelen);
137204

138205
#ifdef IGNORECASE
139-
do
140-
{ /* lower word */
141-
int wwl = bufptr - buf;
142-
143-
bufptr = buf + LPADDING;
144-
while (bufptr - buf < wwl)
145-
{
146-
*bufptr = tolower((unsigned char) *bufptr);
147-
bufptr++;
148-
}
149-
} while (0);
206+
pfree(bword);
150207
#endif
151-
bufptr = buf;
152-
/* set trigrams */
153-
while (bufptr - buf < wl)
154-
{
155-
CPTRGM(tptr, bufptr);
156-
bufptr++;
157-
tptr++;
158-
}
159-
bufptr = buf + LPADDING;
160-
state = WORDWAIT;
161-
}
162-
else
163-
{
164-
*bufptr = *sptr; /* put in buffer */
165-
bufptr++;
166-
if (sptr - str == slen - 1)
167-
goto gettrg;
168-
}
169-
}
170-
sptr++;
208+
buf[LPADDING+bytelen] = ' ';
209+
buf[LPADDING+bytelen+1] = ' ';
210+
211+
/*
212+
* count trigrams
213+
*/
214+
tptr = make_trigrams( tptr, buf, bytelen + LPADDING + RPADDING,
215+
charlen + LPADDING + RPADDING );
171216
}
172217

173218
pfree(buf);
@@ -186,6 +231,19 @@ generate_trgm(char *str, int slen)
186231
return trg;
187232
}
188233

234+
uint32
235+
trgm2int(trgm *ptr)
236+
{
237+
uint32 val = 0;
238+
239+
val |= *( ((unsigned char*)ptr) );
240+
val <<= 8;
241+
val |= *( ((unsigned char*)ptr) + 1 );
242+
val <<= 8;
243+
val |= *( ((unsigned char*)ptr) + 2 );
244+
245+
return val;
246+
}
189247

190248
PG_FUNCTION_INFO_V1(show_trgm);
191249
Datum show_trgm(PG_FUNCTION_ARGS);
@@ -204,10 +262,18 @@ show_trgm(PG_FUNCTION_ARGS)
204262

205263
for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
206264
{
207-
text *item = (text *) palloc(VARHDRSZ + 3);
265+
text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length()*3) );
208266

209-
SET_VARSIZE(item, VARHDRSZ + 3);
210-
CPTRGM(VARDATA(item), ptr);
267+
if ( pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr) )
268+
{
269+
snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
270+
SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
271+
}
272+
else
273+
{
274+
SET_VARSIZE(item, VARHDRSZ + 3);
275+
CPTRGM(VARDATA(item), ptr);
276+
}
211277
d[i] = PointerGetDatum(item);
212278
}
213279

0 commit comments

Comments
 (0)