Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 6e2f3ae

Browse files
committed
Support LIKE and ILIKE index searches via contrib/pg_trgm indexes.
Unlike Btree-based LIKE optimization, this works for non-left-anchored search patterns. The effectiveness of the search depends on how many trigrams can be extracted from the pattern. (The worst case, with no trigrams, degrades to a full-table scan, so this isn't a panacea. But it can be very useful.) Alexander Korotkov, reviewed by Jan Urbanski
1 parent 6238473 commit 6e2f3ae

File tree

9 files changed

+640
-50
lines changed

9 files changed

+640
-50
lines changed

contrib/pg_trgm/expected/pg_trgm.out

+90
Original file line numberDiff line numberDiff line change
@@ -3467,3 +3467,93 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
34673467
qwertyu0988 | 0.333333
34683468
(1 row)
34693469

3470+
create table test2(t text);
3471+
insert into test2 values ('abcdef');
3472+
insert into test2 values ('quark');
3473+
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
3474+
set enable_seqscan=off;
3475+
explain (costs off)
3476+
select * from test2 where t like '%BCD%';
3477+
QUERY PLAN
3478+
------------------------------------------
3479+
Bitmap Heap Scan on test2
3480+
Recheck Cond: (t ~~ '%BCD%'::text)
3481+
-> Bitmap Index Scan on test2_idx_gin
3482+
Index Cond: (t ~~ '%BCD%'::text)
3483+
(4 rows)
3484+
3485+
explain (costs off)
3486+
select * from test2 where t ilike '%BCD%';
3487+
QUERY PLAN
3488+
-------------------------------------------
3489+
Bitmap Heap Scan on test2
3490+
Recheck Cond: (t ~~* '%BCD%'::text)
3491+
-> Bitmap Index Scan on test2_idx_gin
3492+
Index Cond: (t ~~* '%BCD%'::text)
3493+
(4 rows)
3494+
3495+
select * from test2 where t like '%BCD%';
3496+
t
3497+
---
3498+
(0 rows)
3499+
3500+
select * from test2 where t like '%bcd%';
3501+
t
3502+
--------
3503+
abcdef
3504+
(1 row)
3505+
3506+
select * from test2 where t ilike '%BCD%';
3507+
t
3508+
--------
3509+
abcdef
3510+
(1 row)
3511+
3512+
select * from test2 where t ilike 'qua%';
3513+
t
3514+
-------
3515+
quark
3516+
(1 row)
3517+
3518+
drop index test2_idx_gin;
3519+
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
3520+
set enable_seqscan=off;
3521+
explain (costs off)
3522+
select * from test2 where t like '%BCD%';
3523+
QUERY PLAN
3524+
------------------------------------------
3525+
Index Scan using test2_idx_gist on test2
3526+
Index Cond: (t ~~ '%BCD%'::text)
3527+
(2 rows)
3528+
3529+
explain (costs off)
3530+
select * from test2 where t ilike '%BCD%';
3531+
QUERY PLAN
3532+
------------------------------------------
3533+
Index Scan using test2_idx_gist on test2
3534+
Index Cond: (t ~~* '%BCD%'::text)
3535+
(2 rows)
3536+
3537+
select * from test2 where t like '%BCD%';
3538+
t
3539+
---
3540+
(0 rows)
3541+
3542+
select * from test2 where t like '%bcd%';
3543+
t
3544+
--------
3545+
abcdef
3546+
(1 row)
3547+
3548+
select * from test2 where t ilike '%BCD%';
3549+
t
3550+
--------
3551+
abcdef
3552+
(1 row)
3553+
3554+
select * from test2 where t ilike 'qua%';
3555+
t
3556+
-------
3557+
quark
3558+
(1 row)
3559+

contrib/pg_trgm/pg_trgm.sql.in

+10-6
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ FOR TYPE text USING gist
113113
AS
114114
OPERATOR 1 % (text, text),
115115
OPERATOR 2 <-> (text, text) FOR ORDER BY pg_catalog.float_ops,
116+
OPERATOR 3 pg_catalog.~~ (text, text),
117+
OPERATOR 4 pg_catalog.~~* (text, text),
116118
FUNCTION 1 gtrgm_consistent (internal, text, int, oid, internal),
117119
FUNCTION 2 gtrgm_union (bytea, internal),
118120
FUNCTION 3 gtrgm_compress (internal),
@@ -124,17 +126,17 @@ AS
124126
STORAGE gtrgm;
125127

126128
-- support functions for gin
127-
CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal)
129+
CREATE OR REPLACE FUNCTION gin_extract_value_trgm(text, internal)
128130
RETURNS internal
129131
AS 'MODULE_PATHNAME'
130132
LANGUAGE C IMMUTABLE STRICT;
131133

132-
CREATE OR REPLACE FUNCTION gin_extract_trgm(text, internal, int2, internal, internal)
134+
CREATE OR REPLACE FUNCTION gin_extract_query_trgm(text, internal, int2, internal, internal, internal, internal)
133135
RETURNS internal
134136
AS 'MODULE_PATHNAME'
135137
LANGUAGE C IMMUTABLE STRICT;
136138

137-
CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal)
139+
CREATE OR REPLACE FUNCTION gin_trgm_consistent(internal, int2, text, int4, internal, internal, internal, internal)
138140
RETURNS bool
139141
AS 'MODULE_PATHNAME'
140142
LANGUAGE C IMMUTABLE STRICT;
@@ -144,8 +146,10 @@ CREATE OPERATOR CLASS gin_trgm_ops
144146
FOR TYPE text USING gin
145147
AS
146148
OPERATOR 1 % (text, text),
149+
OPERATOR 3 pg_catalog.~~ (text, text),
150+
OPERATOR 4 pg_catalog.~~* (text, text),
147151
FUNCTION 1 btint4cmp (int4, int4),
148-
FUNCTION 2 gin_extract_trgm (text, internal),
149-
FUNCTION 3 gin_extract_trgm (text, internal, int2, internal, internal),
150-
FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal),
152+
FUNCTION 2 gin_extract_value_trgm (text, internal),
153+
FUNCTION 3 gin_extract_query_trgm (text, internal, int2, internal, internal, internal, internal),
154+
FUNCTION 4 gin_trgm_consistent (internal, int2, text, int4, internal, internal, internal, internal),
151155
STORAGE int4;

contrib/pg_trgm/sql/pg_trgm.sql

+25
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,28 @@ set enable_seqscan=off;
4545
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
4646
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
4747
select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu1988' order by sml desc, t;
48+
49+
create table test2(t text);
50+
insert into test2 values ('abcdef');
51+
insert into test2 values ('quark');
52+
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
53+
set enable_seqscan=off;
54+
explain (costs off)
55+
select * from test2 where t like '%BCD%';
56+
explain (costs off)
57+
select * from test2 where t ilike '%BCD%';
58+
select * from test2 where t like '%BCD%';
59+
select * from test2 where t like '%bcd%';
60+
select * from test2 where t ilike '%BCD%';
61+
select * from test2 where t ilike 'qua%';
62+
drop index test2_idx_gin;
63+
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
64+
set enable_seqscan=off;
65+
explain (costs off)
66+
select * from test2 where t like '%BCD%';
67+
explain (costs off)
68+
select * from test2 where t ilike '%BCD%';
69+
select * from test2 where t like '%BCD%';
70+
select * from test2 where t like '%bcd%';
71+
select * from test2 where t ilike '%BCD%';
72+
select * from test2 where t ilike 'qua%';

contrib/pg_trgm/trgm.h

+18-4
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,21 @@
1313
#define LPADDING 2
1414
#define RPADDING 1
1515
#define KEEPONLYALNUM
16+
/*
17+
* Caution: IGNORECASE macro means that trigrams are case-insensitive.
18+
* If this macro is disabled, the ~~* operator must be removed from the
19+
* operator classes, because we can't handle case-insensitive wildcard search
20+
* with case-sensitive trigrams. Failure to do this will result in "cannot
21+
* handle ~~* with case-sensitive trigrams" errors.
22+
*/
1623
#define IGNORECASE
1724
#define DIVUNION
1825

1926
/* operator strategy numbers */
20-
#define SimilarityStrategyNumber 1
21-
#define DistanceStrategyNumber 2
27+
#define SimilarityStrategyNumber 1
28+
#define DistanceStrategyNumber 2
29+
#define LikeStrategyNumber 3
30+
#define ILikeStrategyNumber 4
2231

2332

2433
typedef char trgm[3];
@@ -40,7 +49,10 @@ uint32 trgm2int(trgm *ptr);
4049
#else
4150
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
4251
#endif
43-
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)t) ) && ISPRINTABLECHAR( ((char*)t)+1 ) && ISPRINTABLECHAR( ((char*)t)+2 ) )
52+
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
53+
54+
#define ISESCAPECHAR(x) (*(x) == '\\') /* Wildcard escape character */
55+
#define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%') /* Wildcard meta-character */
4456

4557
typedef struct
4658
{
@@ -65,7 +77,7 @@ typedef char *BITVECP;
6577
for(i=0;i<SIGLEN;i++)
6678

6779
#define GETBYTE(x,i) ( *( (BITVECP)(x) + (int)( (i) / BITBYTE ) ) )
68-
#define GETBITBYTE(x,i) ( ((char)(x)) >> i & 0x01 )
80+
#define GETBITBYTE(x,i) ( (((char)(x)) >> (i)) & 0x01 )
6981
#define CLRBIT(x,i) GETBYTE(x,i) &= ~( 0x01 << ( (i) % BITBYTE ) )
7082
#define SETBIT(x,i) GETBYTE(x,i) |= ( 0x01 << ( (i) % BITBYTE ) )
7183
#define GETBIT(x,i) ( (GETBYTE(x,i) >> ( (i) % BITBYTE )) & 0x01 )
@@ -89,6 +101,8 @@ typedef char *BITVECP;
89101
extern float4 trgm_limit;
90102

91103
TRGM *generate_trgm(char *str, int slen);
104+
TRGM *generate_wildcard_trgm(const char *str, int slen);
92105
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
106+
bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
93107

94108
#endif /* __TRGM_H__ */

contrib/pg_trgm/trgm_gin.c

+120-12
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "access/gin.h"
99
#include "access/itup.h"
10+
#include "access/skey.h"
1011
#include "access/tuptoaster.h"
1112
#include "storage/bufpage.h"
1213
#include "utils/array.h"
@@ -16,14 +17,31 @@
1617
PG_FUNCTION_INFO_V1(gin_extract_trgm);
1718
Datum gin_extract_trgm(PG_FUNCTION_ARGS);
1819

20+
PG_FUNCTION_INFO_V1(gin_extract_value_trgm);
21+
Datum gin_extract_value_trgm(PG_FUNCTION_ARGS);
22+
23+
PG_FUNCTION_INFO_V1(gin_extract_query_trgm);
24+
Datum gin_extract_query_trgm(PG_FUNCTION_ARGS);
25+
1926
PG_FUNCTION_INFO_V1(gin_trgm_consistent);
2027
Datum gin_trgm_consistent(PG_FUNCTION_ARGS);
2128

2229
/*
23-
* This function is used as both extractValue and extractQuery
30+
* This function can only be called if a pre-9.1 version of the GIN operator
31+
* class definition is present in the catalogs (probably as a consequence
32+
* of upgrade-in-place). Complain.
2433
*/
2534
Datum
2635
gin_extract_trgm(PG_FUNCTION_ARGS)
36+
{
37+
ereport(ERROR,
38+
(errmsg("GIN operator class for pg_trgm is out of date"),
39+
errhint("Please drop and re-create the pg_trgm catalog entries.")));
40+
PG_RETURN_NULL();
41+
}
42+
43+
Datum
44+
gin_extract_value_trgm(PG_FUNCTION_ARGS)
2745
{
2846
text *val = (text *) PG_GETARG_TEXT_P(0);
2947
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
@@ -57,34 +75,124 @@ gin_extract_trgm(PG_FUNCTION_ARGS)
5775
PG_RETURN_POINTER(entries);
5876
}
5977

78+
Datum
79+
gin_extract_query_trgm(PG_FUNCTION_ARGS)
80+
{
81+
text *val = (text *) PG_GETARG_TEXT_P(0);
82+
int32 *nentries = (int32 *) PG_GETARG_POINTER(1);
83+
StrategyNumber strategy = PG_GETARG_UINT16(2);
84+
/* bool **pmatch = (bool **) PG_GETARG_POINTER(3); */
85+
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
86+
/* bool **nullFlags = (bool **) PG_GETARG_POINTER(5); */
87+
int32 *searchMode = (int32 *) PG_GETARG_POINTER(6);
88+
Datum *entries = NULL;
89+
TRGM *trg;
90+
int32 trglen;
91+
trgm *ptr;
92+
int32 i;
93+
94+
switch (strategy)
95+
{
96+
case SimilarityStrategyNumber:
97+
trg = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
98+
break;
99+
case ILikeStrategyNumber:
100+
#ifndef IGNORECASE
101+
elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
102+
#endif
103+
/* FALL THRU */
104+
case LikeStrategyNumber:
105+
/*
106+
* For wildcard search we extract all the trigrams that every
107+
* potentially-matching string must include.
108+
*/
109+
trg = generate_wildcard_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
110+
break;
111+
default:
112+
elog(ERROR, "unrecognized strategy number: %d", strategy);
113+
trg = NULL; /* keep compiler quiet */
114+
break;
115+
}
116+
117+
trglen = ARRNELEM(trg);
118+
*nentries = trglen;
119+
120+
if (trglen > 0)
121+
{
122+
entries = (Datum *) palloc(sizeof(Datum) * trglen);
123+
ptr = GETARR(trg);
124+
for (i = 0; i < trglen; i++)
125+
{
126+
int32 item = trgm2int(ptr);
127+
128+
entries[i] = Int32GetDatum(item);
129+
ptr++;
130+
}
131+
}
132+
133+
/*
134+
* If no trigram was extracted then we have to scan all the index.
135+
*/
136+
if (trglen == 0)
137+
*searchMode = GIN_SEARCH_MODE_ALL;
138+
139+
PG_RETURN_POINTER(entries);
140+
}
141+
60142
Datum
61143
gin_trgm_consistent(PG_FUNCTION_ARGS)
62144
{
63145
bool *check = (bool *) PG_GETARG_POINTER(0);
64-
/* StrategyNumber strategy = PG_GETARG_UINT16(1); */
146+
StrategyNumber strategy = PG_GETARG_UINT16(1);
65147
/* text *query = PG_GETARG_TEXT_P(2); */
66148
int32 nkeys = PG_GETARG_INT32(3);
67149
/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */
68150
bool *recheck = (bool *) PG_GETARG_POINTER(5);
69-
bool res = FALSE;
151+
bool res;
70152
int32 i,
71-
ntrue = 0;
153+
ntrue;
72154

73155
/* All cases served by this function are inexact */
74156
*recheck = true;
75157

76-
/* Count the matches */
77-
for (i = 0; i < nkeys; i++)
158+
switch (strategy)
78159
{
79-
if (check[i])
80-
ntrue++;
81-
}
82-
160+
case SimilarityStrategyNumber:
161+
/* Count the matches */
162+
ntrue = 0;
163+
for (i = 0; i < nkeys; i++)
164+
{
165+
if (check[i])
166+
ntrue++;
167+
}
83168
#ifdef DIVUNION
84-
res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false);
169+
res = (nkeys == ntrue) ? true : ((((((float4) ntrue) / ((float4) (nkeys - ntrue)))) >= trgm_limit) ? true : false);
85170
#else
86-
res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false);
171+
res = (nkeys == 0) ? false : ((((((float4) ntrue) / ((float4) nkeys))) >= trgm_limit) ? true : false);
87172
#endif
173+
break;
174+
case ILikeStrategyNumber:
175+
#ifndef IGNORECASE
176+
elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
177+
#endif
178+
/* FALL THRU */
179+
case LikeStrategyNumber:
180+
/* Check if all extracted trigrams are presented. */
181+
res = true;
182+
for (i = 0; i < nkeys; i++)
183+
{
184+
if (!check[i])
185+
{
186+
res = false;
187+
break;
188+
}
189+
}
190+
break;
191+
default:
192+
elog(ERROR, "unrecognized strategy number: %d", strategy);
193+
res = false; /* keep compiler quiet */
194+
break;
195+
}
88196

89197
PG_RETURN_BOOL(res);
90198
}

0 commit comments

Comments
 (0)