Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 6301445

Browse files
committed
Add 'contrib/pg_tsparser/' from commit '833e5b2f571cef233f31fd45a65f901d8af8184f'
git-subtree-dir: contrib/pg_tsparser git-subtree-mainline: 76ec0df git-subtree-split: 833e5b2
2 parents 76ec0df + 833e5b2 commit 6301445

File tree

7 files changed

+3027
-0
lines changed

7 files changed

+3027
-0
lines changed

contrib/pg_tsparser/Makefile

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# pg_tsparser/Makefile
#
# Build rules for the pg_tsparser extension. Supports two modes, selected by
# whether USE_PGXS is defined: an out-of-tree PGXS build, or an in-tree build
# under contrib/.
2+
3+
# Name of the module built from the object files listed in OBJS.
MODULE_big = pg_tsparser
4+
OBJS = tsparser.o $(WIN32RES)
5+
6+
# Extension name and the SQL install script shipped with it.
EXTENSION = pg_tsparser
7+
DATA = pg_tsparser--1.0.sql
8+
PGFILEDESC = "pg_tsparser - parser for text search"
9+
10+
# Regression test to run (named pg_tsparser).
REGRESS = pg_tsparser
11+
12+
# Out-of-tree build: ask pg_config where the PGXS makefile lives and include it.
ifdef USE_PGXS
13+
PG_CONFIG = pg_config
14+
PGXS := $(shell $(PG_CONFIG) --pgxs)
15+
include $(PGXS)
16+
# In-tree build: this directory sits at contrib/pg_tsparser of the source tree.
else
17+
subdir = contrib/pg_tsparser
18+
top_builddir = ../..
19+
include $(top_builddir)/src/Makefile.global
20+
include $(top_srcdir)/contrib/contrib-global.mk
21+
endif

contrib/pg_tsparser/README.md

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# pg_tsparser - parser for text search
2+
3+
## Introduction
4+
5+
The **pg_tsparser** module is the modified default text search parser from
6+
PostgreSQL 9.6. The differences are:
7+
* **tsparser** additionally returns a word containing underscores as a single, unbroken token
8+
* **tsparser** additionally returns a word made of numbers and letters joined by hyphens as a single, unbroken token
9+
10+
For example:
11+
12+
```sql
13+
SELECT to_tsvector('english', 'pg_trgm') as def_parser,
14+
to_tsvector('english_ts', 'pg_trgm') as new_parser;
15+
def_parser | new_parser
16+
-----------------+-----------------------------
17+
'pg':1 'trgm':2 | 'pg':2 'pg_trgm':1 'trgm':3
18+
(1 row)
19+
20+
SELECT to_tsvector('english', '123-abc') as def_parser,
21+
to_tsvector('english_ts', '123-abc') as new_parser;
22+
def_parser | new_parser
23+
-----------------+-----------------------------
24+
'123':1 'abc':2 | '123':2 '123-abc':1 'abc':3
25+
(1 row)
26+
27+
SELECT to_tsvector('english', 'rel-3.2-A') as def_parser,
28+
to_tsvector('english_ts', 'rel-3.2-A') as new_parser;
29+
def_parser | new_parser
30+
------------------+-------------------------------
31+
'-3.2':2 'rel':1 | '3.2':3 'rel':2 'rel-3.2-a':1
32+
(1 row)
33+
```
34+
35+
## License
36+
37+
This module is available under the same license as
38+
[PostgreSQL](http://www.postgresql.org/about/licence/).
39+
40+
## Installation
41+
42+
A typical installation procedure may look like this:
43+
44+
$ cd pg_tsparser
45+
$ sudo make USE_PGXS=1 install
46+
$ make USE_PGXS=1 installcheck
47+
$ psql DB -c "CREATE EXTENSION pg_tsparser;"
48+
49+
After this you can create your own text search configuration:
50+
51+
```sql
52+
CREATE TEXT SEARCH CONFIGURATION english_ts (
53+
PARSER = tsparser
54+
);
55+
56+
COMMENT ON TEXT SEARCH CONFIGURATION english_ts IS 'text search configuration for english language';
57+
58+
ALTER TEXT SEARCH CONFIGURATION english_ts
59+
ADD MAPPING FOR email, file, float, host, hword_numpart, int,
60+
numhword, numword, sfloat, uint, url, url_path, version
61+
WITH simple;
62+
63+
ALTER TEXT SEARCH CONFIGURATION english_ts
64+
ADD MAPPING FOR asciiword, asciihword, hword_asciipart,
65+
word, hword, hword_part
66+
WITH english_stem;
67+
```
Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
CREATE EXTENSION pg_tsparser;
2+
SELECT * FROM ts_token_type('tsparser');
3+
tokid | alias | description
4+
-------+-----------------+------------------------------------------
5+
1 | asciiword | Word, all ASCII
6+
2 | word | Word, all letters
7+
3 | numword | Word, letters and digits
8+
4 | email | Email address
9+
5 | url | URL
10+
6 | host | Host
11+
7 | sfloat | Scientific notation
12+
8 | version | Version number
13+
9 | hword_numpart | Hyphenated word part, letters and digits
14+
10 | hword_part | Hyphenated word part, all letters
15+
11 | hword_asciipart | Hyphenated word part, all ASCII
16+
12 | blank | Space symbols
17+
13 | tag | XML tag
18+
14 | protocol | Protocol head
19+
15 | numhword | Hyphenated word, letters and digits
20+
16 | asciihword | Hyphenated word, all ASCII
21+
17 | hword | Hyphenated word, all letters
22+
18 | url_path | URL path
23+
19 | file | File or path name
24+
20 | float | Decimal notation
25+
21 | int | Signed integer
26+
22 | uint | Unsigned integer
27+
23 | entity | XML entity
28+
(23 rows)
29+
30+
SELECT * FROM ts_parse('tsparser', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net teodor@123-stack.net 123_teodor@stack.net 123-teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
31+
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
32+
<i <b> wow < jqw <> qwerty');
33+
tokid | token
34+
-------+--------------------------------------
35+
22 | 345
36+
12 |
37+
1 | qwe
38+
12 | @
39+
19 | efd.r
40+
12 | '
41+
14 | http://
42+
6 | www.com
43+
12 | /
44+
14 | http://
45+
5 | aew.werc.ewr/?ad=qwe&dw
46+
6 | aew.werc.ewr
47+
18 | /?ad=qwe&dw
48+
12 |
49+
5 | 1aew.werc.ewr/?ad=qwe&dw
50+
6 | 1aew.werc.ewr
51+
18 | /?ad=qwe&dw
52+
12 |
53+
6 | 2aew.werc.ewr
54+
12 |
55+
14 | http://
56+
5 | 3aew.werc.ewr/?ad=qwe&dw
57+
6 | 3aew.werc.ewr
58+
18 | /?ad=qwe&dw
59+
12 |
60+
14 | http://
61+
6 | 4aew.werc.ewr
62+
12 |
63+
14 | http://
64+
5 | 5aew.werc.ewr:8100/?
65+
6 | 5aew.werc.ewr:8100
66+
18 | /?
67+
12 |
68+
1 | ad
69+
12 | =
70+
1 | qwe
71+
12 | &
72+
1 | dw
73+
12 |
74+
5 | 6aew.werc.ewr:8100/?ad=qwe&dw
75+
6 | 6aew.werc.ewr:8100
76+
18 | /?ad=qwe&dw
77+
12 |
78+
5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32
79+
6 | 7aew.werc.ewr:8100
80+
18 | /?ad=qwe&dw=%20%32
81+
12 |
82+
7 | +4.0e-10
83+
12 |
84+
1 | qwe
85+
12 |
86+
1 | qwe
87+
12 |
88+
1 | qwqwe
89+
12 |
90+
20 | 234.435
91+
12 |
92+
22 | 455
93+
12 |
94+
20 | 5.005
95+
12 |
96+
4 | teodor@stack.net
97+
12 |
98+
4 | teodor@123-stack.net
99+
12 |
100+
4 | 123_teodor@stack.net
101+
12 |
102+
4 | 123-teodor@stack.net
103+
12 |
104+
16 | qwe-wer
105+
11 | qwe
106+
12 | -
107+
11 | wer
108+
12 |
109+
1 | asdf
110+
12 |
111+
13 | <fr>
112+
1 | qwer
113+
12 |
114+
1 | jf
115+
12 |
116+
1 | sdjk
117+
12 | <
118+
1 | we
119+
12 |
120+
1 | hjwer
121+
12 |
122+
13 | <werrwe>
123+
12 |
124+
3 | ewr1
125+
12 | >
126+
3 | ewri2
127+
12 |
128+
13 | <a href="qwe<qwe>">
129+
12 | +
130+
|
131+
19 | /usr/local/fff
132+
12 |
133+
19 | /awdf/dwqe/4325
134+
12 |
135+
19 | rewt/ewr
136+
12 |
137+
1 | wefjn
138+
12 |
139+
19 | /wqe-324/ewr
140+
12 |
141+
19 | gist.h
142+
12 |
143+
19 | gist.h.c
144+
12 |
145+
19 | gist.c
146+
12 | .
147+
1 | readline
148+
12 |
149+
20 | 4.2
150+
12 |
151+
20 | 4.2
152+
12 | .
153+
20 | 4.2
154+
12 | ,
155+
15 | readline-4.2
156+
11 | readline
157+
12 | -
158+
9 | 4.2
159+
12 |
160+
15 | readline-4.2
161+
11 | readline
162+
12 | -
163+
9 | 4.2.
164+
12 |
165+
22 | 234
166+
12 | +
167+
|
168+
12 | <
169+
1 | i
170+
12 |
171+
13 | <b>
172+
12 |
173+
1 | wow
174+
12 |
175+
12 | <
176+
1 | jqw
177+
12 |
178+
12 | <>
179+
1 | qwerty
180+
(143 rows)
181+
182+
-- Test text search configuration with parser
183+
CREATE TEXT SEARCH CONFIGURATION english_ts (
184+
PARSER = tsparser
185+
);
186+
ALTER TEXT SEARCH CONFIGURATION english_ts
187+
ADD MAPPING FOR email, file, float, host, hword_numpart, int,
188+
numhword, numword, sfloat, uint, url, url_path, version
189+
WITH simple;
190+
ALTER TEXT SEARCH CONFIGURATION english_ts
191+
ADD MAPPING FOR asciiword, asciihword, hword_asciipart,
192+
word, hword, hword_part
193+
WITH english_stem;
194+
SELECT to_tsvector('english_ts', 'pg_trgm');
195+
to_tsvector
196+
-----------------------------
197+
'pg':2 'pg_trgm':1 'trgm':3
198+
(1 row)
199+
200+
SELECT to_tsvector('english_ts', '12_abc');
201+
to_tsvector
202+
---------------------------
203+
'12':2 '12_abc':1 'abc':3
204+
(1 row)
205+
206+
SELECT to_tsvector('english_ts', '12-abc');
207+
to_tsvector
208+
---------------------------
209+
'12':2 '12-abc':1 'abc':3
210+
(1 row)
211+
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/* pg_tsparser/pg_tsparser--1.0.sql */
2+
3+
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
4+
\echo Use "CREATE EXTENSION pg_tsparser" to load this file. \quit
5+
6+
-- tsparser_start(internal, int4): C function used as the START function of
-- the "tsparser" text search parser created later in this script.
-- Plain CREATE FUNCTION (not CREATE OR REPLACE) is used on purpose: if a
-- function with this signature already exists outside the extension,
-- CREATE EXTENSION must fail rather than replace or adopt it.
-- STRICT: the function is not called when any argument is NULL.
CREATE FUNCTION tsparser_start(internal, int4)
7+
RETURNS internal
8+
AS 'MODULE_PATHNAME'
9+
LANGUAGE C STRICT;
10+
11+
-- tsparser_nexttoken(internal, internal, internal): C function used as the
-- GETTOKEN function of the "tsparser" text search parser created later in
-- this script.
-- Plain CREATE FUNCTION (not CREATE OR REPLACE) is used on purpose: if a
-- function with this signature already exists outside the extension,
-- CREATE EXTENSION must fail rather than replace or adopt it.
-- STRICT: the function is not called when any argument is NULL.
CREATE FUNCTION tsparser_nexttoken(internal, internal, internal)
12+
RETURNS internal
13+
AS 'MODULE_PATHNAME'
14+
LANGUAGE C STRICT;
15+
16+
-- tsparser_end(internal): C function used as the END function of the
-- "tsparser" text search parser created later in this script.
-- Plain CREATE FUNCTION (not CREATE OR REPLACE) is used on purpose: if a
-- function with this signature already exists outside the extension,
-- CREATE EXTENSION must fail rather than replace or adopt it.
-- STRICT: the function is not called when its argument is NULL.
CREATE FUNCTION tsparser_end(internal)
17+
RETURNS void
18+
AS 'MODULE_PATHNAME'
19+
LANGUAGE C STRICT;
20+
21+
-- tsparser_lextype(internal): C function used as the LEXTYPES function of the
-- "tsparser" text search parser created later in this script.
-- Plain CREATE FUNCTION (not CREATE OR REPLACE) is used on purpose: if a
-- function with this signature already exists outside the extension,
-- CREATE EXTENSION must fail rather than replace or adopt it.
-- STRICT: the function is not called when its argument is NULL.
CREATE FUNCTION tsparser_lextype(internal)
22+
RETURNS internal
23+
AS 'MODULE_PATHNAME'
24+
LANGUAGE C STRICT;
25+
26+
-- tsparser_headline(internal, internal, tsquery): C function used as the
-- HEADLINE function of the "tsparser" text search parser created later in
-- this script.
-- Plain CREATE FUNCTION (not CREATE OR REPLACE) is used on purpose: if a
-- function with this signature already exists outside the extension,
-- CREATE EXTENSION must fail rather than replace or adopt it.
-- STRICT: the function is not called when any argument is NULL.
CREATE FUNCTION tsparser_headline(internal, internal, tsquery)
27+
RETURNS internal
28+
AS 'MODULE_PATHNAME'
29+
LANGUAGE C STRICT;
30+
31+
-- Register the "tsparser" text search parser, wiring each parser callback to
-- the corresponding C function declared earlier in this script.
CREATE TEXT SEARCH PARSER tsparser (
32+
START = tsparser_start,
33+
GETTOKEN = tsparser_nexttoken,
34+
END = tsparser_end,
35+
LEXTYPES = tsparser_lextype,
36+
HEADLINE = tsparser_headline
37+
);
38+
39+
-- Catalog description attached to the parser.
COMMENT ON TEXT SEARCH PARSER tsparser IS 'parser for text search';
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# pg_tsparser extension
2+
comment = 'parser for text search'
3+
default_version = '1.0'
4+
module_pathname = '$libdir/pg_tsparser'
5+
relocatable = true
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
CREATE EXTENSION pg_tsparser;
2+
3+
SELECT * FROM ts_token_type('tsparser');
4+
5+
SELECT * FROM ts_parse('tsparser', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net teodor@123-stack.net 123_teodor@stack.net 123-teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
6+
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
7+
<i <b> wow < jqw <> qwerty');
8+
9+
-- Test text search configuration with parser
10+
CREATE TEXT SEARCH CONFIGURATION english_ts (
11+
PARSER = tsparser
12+
);
13+
14+
ALTER TEXT SEARCH CONFIGURATION english_ts
15+
ADD MAPPING FOR email, file, float, host, hword_numpart, int,
16+
numhword, numword, sfloat, uint, url, url_path, version
17+
WITH simple;
18+
19+
ALTER TEXT SEARCH CONFIGURATION english_ts
20+
ADD MAPPING FOR asciiword, asciihword, hword_asciipart,
21+
word, hword, hword_part
22+
WITH english_stem;
23+
24+
SELECT to_tsvector('english_ts', 'pg_trgm');
25+
SELECT to_tsvector('english_ts', '12_abc');
26+
SELECT to_tsvector('english_ts', '12-abc');

0 commit comments

Comments
 (0)