Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 3ccae48

Browse files
committed
Support indexing of regular-expression searches in contrib/pg_trgm.
This works by extracting trigrams from the given regular expression, in generally the same spirit as the previously-existing support for LIKE searches, though of course the details are far more complicated.

Currently, only GIN indexes are supported. We might be able to make it work with GiST indexes later.

The implementation includes adding API functions to backend/regex/ to provide a view of the search NFA created from a regular expression. These functions are meant to be generic enough to be supportable in a standalone version of the regex library, should that ever happen.

Alexander Korotkov, reviewed by Heikki Linnakangas and Tom Lane
1 parent e60d20a commit 3ccae48

File tree

17 files changed

+2865
-43
lines changed

17 files changed

+2865
-43
lines changed

contrib/pg_trgm/Makefile

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# contrib/pg_trgm/Makefile
22

33
MODULE_big = pg_trgm
4-
OBJS = trgm_op.o trgm_gist.o trgm_gin.o
4+
OBJS = trgm_op.o trgm_gist.o trgm_gin.o trgm_regexp.o
55

66
EXTENSION = pg_trgm
7-
DATA = pg_trgm--1.0.sql pg_trgm--unpackaged--1.0.sql
7+
DATA = pg_trgm--1.1.sql pg_trgm--1.0--1.1.sql pg_trgm--unpackaged--1.0.sql
88

99
REGRESS = pg_trgm
1010

contrib/pg_trgm/expected/pg_trgm.out

+138-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ select similarity('---', '####---');
6060
(1 row)
6161

6262
CREATE TABLE test_trgm(t text);
63-
\copy test_trgm from 'data/trgm.data
63+
\copy test_trgm from 'data/trgm.data'
6464
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
6565
t | sml
6666
-------------+----------
@@ -3470,6 +3470,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
34703470
create table test2(t text);
34713471
insert into test2 values ('abcdef');
34723472
insert into test2 values ('quark');
3473+
insert into test2 values (' z foo bar');
34733474
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
34743475
set enable_seqscan=off;
34753476
explain (costs off)
@@ -3521,6 +3522,142 @@ select * from test2 where t ilike 'qua%';
35213522
quark
35223523
(1 row)
35233524

3525+
select * from test2 where t like '%z foo bar%';
3526+
t
3527+
-------------
3528+
z foo bar
3529+
(1 row)
3530+
3531+
select * from test2 where t like ' z foo%';
3532+
t
3533+
-------------
3534+
z foo bar
3535+
(1 row)
3536+
3537+
explain (costs off)
3538+
select * from test2 where t ~ '[abc]{3}';
3539+
QUERY PLAN
3540+
--------------------------------------------
3541+
Bitmap Heap Scan on test2
3542+
Recheck Cond: (t ~ '[abc]{3}'::text)
3543+
-> Bitmap Index Scan on test2_idx_gin
3544+
Index Cond: (t ~ '[abc]{3}'::text)
3545+
(4 rows)
3546+
3547+
explain (costs off)
3548+
select * from test2 where t ~* 'DEF';
3549+
QUERY PLAN
3550+
------------------------------------------
3551+
Bitmap Heap Scan on test2
3552+
Recheck Cond: (t ~* 'DEF'::text)
3553+
-> Bitmap Index Scan on test2_idx_gin
3554+
Index Cond: (t ~* 'DEF'::text)
3555+
(4 rows)
3556+
3557+
select * from test2 where t ~ '[abc]{3}';
3558+
t
3559+
--------
3560+
abcdef
3561+
(1 row)
3562+
3563+
select * from test2 where t ~ 'a[bc]+d';
3564+
t
3565+
--------
3566+
abcdef
3567+
(1 row)
3568+
3569+
select * from test2 where t ~ '(abc)*$';
3570+
t
3571+
-------------
3572+
abcdef
3573+
quark
3574+
z foo bar
3575+
(3 rows)
3576+
3577+
select * from test2 where t ~* 'DEF';
3578+
t
3579+
--------
3580+
abcdef
3581+
(1 row)
3582+
3583+
select * from test2 where t ~ 'dEf';
3584+
t
3585+
---
3586+
(0 rows)
3587+
3588+
select * from test2 where t ~* '^q';
3589+
t
3590+
-------
3591+
quark
3592+
(1 row)
3593+
3594+
select * from test2 where t ~* '[abc]{3}[def]{3}';
3595+
t
3596+
--------
3597+
abcdef
3598+
(1 row)
3599+
3600+
select * from test2 where t ~* 'ab[a-z]{3}';
3601+
t
3602+
--------
3603+
abcdef
3604+
(1 row)
3605+
3606+
select * from test2 where t ~* '(^| )qua';
3607+
t
3608+
-------
3609+
quark
3610+
(1 row)
3611+
3612+
select * from test2 where t ~ 'q.*rk$';
3613+
t
3614+
-------
3615+
quark
3616+
(1 row)
3617+
3618+
select * from test2 where t ~ 'q';
3619+
t
3620+
-------
3621+
quark
3622+
(1 row)
3623+
3624+
select * from test2 where t ~ '[a-z]{3}';
3625+
t
3626+
-------------
3627+
abcdef
3628+
quark
3629+
z foo bar
3630+
(3 rows)
3631+
3632+
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
3633+
t
3634+
---
3635+
(0 rows)
3636+
3637+
select * from test2 where t ~ 'z foo bar';
3638+
t
3639+
-------------
3640+
z foo bar
3641+
(1 row)
3642+
3643+
select * from test2 where t ~ ' z foo bar';
3644+
t
3645+
-------------
3646+
z foo bar
3647+
(1 row)
3648+
3649+
select * from test2 where t ~ ' z foo bar';
3650+
t
3651+
-------------
3652+
z foo bar
3653+
(1 row)
3654+
3655+
select * from test2 where t ~ ' z foo';
3656+
t
3657+
-------------
3658+
z foo bar
3659+
(1 row)
3660+
35243661
drop index test2_idx_gin;
35253662
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
35263663
set enable_seqscan=off;

contrib/pg_trgm/pg_trgm--1.0--1.1.sql

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
/* contrib/pg_trgm/pg_trgm--1.0--1.1.sql */
2+
3+
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
4+
\echo Use "ALTER EXTENSION pg_trgm UPDATE TO '1.1'" to load this file. \quit
5+
6+
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
7+
OPERATOR 5 pg_catalog.~ (text, text),
8+
OPERATOR 6 pg_catalog.~* (text, text);

contrib/pg_trgm/pg_trgm--1.0.sql renamed to contrib/pg_trgm/pg_trgm--1.1.sql

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* contrib/pg_trgm/pg_trgm--1.0.sql */
1+
/* contrib/pg_trgm/pg_trgm--1.1.sql */
22

33
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
44
\echo Use "CREATE EXTENSION pg_trgm" to load this file. \quit
@@ -164,3 +164,9 @@ AS
164164
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
165165
OPERATOR 3 pg_catalog.~~ (text, text),
166166
OPERATOR 4 pg_catalog.~~* (text, text);
167+
168+
-- Add operators that are new in 9.3.
169+
170+
ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
171+
OPERATOR 5 pg_catalog.~ (text, text),
172+
OPERATOR 6 pg_catalog.~* (text, text);

contrib/pg_trgm/pg_trgm.control

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# pg_trgm extension
22
comment = 'text similarity measurement and index searching based on trigrams'
3-
default_version = '1.0'
3+
default_version = '1.1'
44
module_pathname = '$libdir/pg_trgm'
55
relocatable = true

contrib/pg_trgm/sql/pg_trgm.sql

+25-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ select similarity('---', '####---');
1515

1616
CREATE TABLE test_trgm(t text);
1717

18-
\copy test_trgm from 'data/trgm.data
18+
\copy test_trgm from 'data/trgm.data'
1919

2020
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
2121
select t,similarity(t,'gwertyu0988') as sml from test_trgm where t % 'gwertyu0988' order by sml desc, t;
@@ -43,6 +43,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
4343
create table test2(t text);
4444
insert into test2 values ('abcdef');
4545
insert into test2 values ('quark');
46+
insert into test2 values (' z foo bar');
4647
create index test2_idx_gin on test2 using gin (t gin_trgm_ops);
4748
set enable_seqscan=off;
4849
explain (costs off)
@@ -54,6 +55,29 @@ select * from test2 where t like '%bcd%';
5455
select * from test2 where t like E'%\\bcd%';
5556
select * from test2 where t ilike '%BCD%';
5657
select * from test2 where t ilike 'qua%';
58+
select * from test2 where t like '%z foo bar%';
59+
select * from test2 where t like ' z foo%';
60+
explain (costs off)
61+
select * from test2 where t ~ '[abc]{3}';
62+
explain (costs off)
63+
select * from test2 where t ~* 'DEF';
64+
select * from test2 where t ~ '[abc]{3}';
65+
select * from test2 where t ~ 'a[bc]+d';
66+
select * from test2 where t ~ '(abc)*$';
67+
select * from test2 where t ~* 'DEF';
68+
select * from test2 where t ~ 'dEf';
69+
select * from test2 where t ~* '^q';
70+
select * from test2 where t ~* '[abc]{3}[def]{3}';
71+
select * from test2 where t ~* 'ab[a-z]{3}';
72+
select * from test2 where t ~* '(^| )qua';
73+
select * from test2 where t ~ 'q.*rk$';
74+
select * from test2 where t ~ 'q';
75+
select * from test2 where t ~ '[a-z]{3}';
76+
select * from test2 where t ~* '(a{10}|b{10}|c{10}){10}';
77+
select * from test2 where t ~ 'z foo bar';
78+
select * from test2 where t ~ ' z foo bar';
79+
select * from test2 where t ~ ' z foo bar';
80+
select * from test2 where t ~ ' z foo';
5781
drop index test2_idx_gin;
5882
create index test2_idx_gist on test2 using gist (t gist_trgm_ops);
5983
set enable_seqscan=off;

contrib/pg_trgm/trgm.h

+23-12
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,20 @@
77
#include "access/gist.h"
88
#include "access/itup.h"
99
#include "storage/bufpage.h"
10-
#include "utils/builtins.h"
1110

12-
/* options */
11+
/*
12+
* Options ... but note that trgm_regexp.c effectively assumes these values
13+
* of LPADDING and RPADDING.
14+
*/
1315
#define LPADDING 2
1416
#define RPADDING 1
1517
#define KEEPONLYALNUM
1618
/*
1719
* Caution: IGNORECASE macro means that trigrams are case-insensitive.
18-
* If this macro is disabled, the ~~* operator must be removed from the
19-
* operator classes, because we can't handle case-insensitive wildcard search
20-
* with case-sensitive trigrams. Failure to do this will result in "cannot
21-
* handle ~~* with case-sensitive trigrams" errors.
20+
* If this macro is disabled, the ~* and ~~* operators must be removed from
21+
* the operator classes, because we can't handle case-insensitive wildcard
22+
* search with case-sensitive trigrams. Failure to do this will result in
23+
* "cannot handle ~*(~~*) with case-sensitive trigrams" errors.
2224
*/
2325
#define IGNORECASE
2426
#define DIVUNION
@@ -28,6 +30,8 @@
2830
#define DistanceStrategyNumber 2
2931
#define LikeStrategyNumber 3
3032
#define ILikeStrategyNumber 4
33+
#define RegExpStrategyNumber 5
34+
#define RegExpICaseStrategyNumber 6
3135

3236

3337
typedef char trgm[3];
@@ -42,11 +46,11 @@ typedef char trgm[3];
4246
*(((char*)(a))+2) = *(((char*)(b))+2); \
4347
} while(0);
4448

45-
uint32 trgm2int(trgm *ptr);
46-
4749
#ifdef KEEPONLYALNUM
50+
#define ISWORDCHR(c) (t_isalpha(c) || t_isdigit(c))
4851
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
4952
#else
53+
#define ISWORDCHR(c) (!t_isspace(c))
5054
#define ISPRINTABLECHAR(a) ( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
5155
#endif
5256
#define ISPRINTABLETRGM(t) ( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
@@ -99,11 +103,18 @@ typedef char *BITVECP;
99103
#define GETARR(x) ( (trgm*)( (char*)x+TRGMHDRSIZE ) )
100104
#define ARRNELEM(x) ( ( VARSIZE(x) - TRGMHDRSIZE )/sizeof(trgm) )
101105

106+
typedef struct TrgmPackedGraph TrgmPackedGraph;
107+
102108
extern float4 trgm_limit;
103109

104-
TRGM *generate_trgm(char *str, int slen);
105-
TRGM *generate_wildcard_trgm(const char *str, int slen);
106-
float4 cnt_sml(TRGM *trg1, TRGM *trg2);
107-
bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
110+
extern uint32 trgm2int(trgm *ptr);
111+
extern void compact_trigram(trgm *tptr, char *str, int bytelen);
112+
extern TRGM *generate_trgm(char *str, int slen);
113+
extern TRGM *generate_wildcard_trgm(const char *str, int slen);
114+
extern float4 cnt_sml(TRGM *trg1, TRGM *trg2);
115+
extern bool trgm_contained_by(TRGM *trg1, TRGM *trg2);
116+
extern TRGM *createTrgmNFA(text *text_re, TrgmPackedGraph **graph,
117+
Oid collation);
118+
extern bool trigramsMatchGraph(TrgmPackedGraph *graph, bool *check);
108119

109120
#endif /* __TRGM_H__ */

0 commit comments

Comments
 (0)