Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit dbaec70

Browse files
committed
Rename and slightly redefine the default text search parser's "word" categories, as per discussion.
asciiword (formerly lword) is still ASCII-letters-only, and numword (formerly word) is still the most general mixed-alpha-and-digits case. But word (formerly nlword) is now any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII as before. This is no worse than before for parsing mixed Russian/English text, which seems to have been the design center for the original coding; and it should simplify matters for parsing most European languages. In particular it will not be necessary for any language to accept strings containing digits as being regular "words". The hyphenated-word categories are adjusted similarly.
1 parent 344d0ca commit dbaec70

File tree

10 files changed

+466
-449
lines changed

10 files changed

+466
-449
lines changed

doc/src/sgml/func.sgml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.403 2007/10/22 20:13:37 tgl Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.404 2007/10/23 20:46:11 tgl Exp $ -->
22

33
<chapter id="functions">
44
<title>Functions and Operators</title>
@@ -7861,7 +7861,7 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
78617861
<entry><type>setof record</type></entry>
78627862
<entry>test a configuration</entry>
78637863
<entry><literal>ts_debug('english', 'The Brightest supernovaes')</literal></entry>
7864-
<entry><literal>(lword,"Latin word",The,{english_stem},english_stem,{}) ...</literal></entry>
7864+
<entry><literal>(asciiword,"Word, all ASCII",The,{english_stem},english_stem,{}) ...</literal></entry>
78657865
</row>
78667866
<row>
78677867
<entry><literal><function>ts_lexize</function>(<replaceable class="PARAMETER">dict</replaceable> <type>regdictionary</>, <replaceable class="PARAMETER">token</replaceable> <type>text</>)</literal></entry>
@@ -7889,14 +7889,14 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
78897889
<entry><type>setof record</type></entry>
78907890
<entry>get token types defined by parser</entry>
78917891
<entry><literal>ts_token_type('default')</literal></entry>
7892-
<entry><literal>(1,lword,"Latin word") ...</literal></entry>
7892+
<entry><literal>(1,asciiword,"Word, all ASCII") ...</literal></entry>
78937893
</row>
78947894
<row>
78957895
<entry><literal><function>ts_token_type</function>(<replaceable class="PARAMETER">parser_oid</> <type>oid</>, OUT <replaceable class="PARAMETER">tokid</> <type>integer</>, OUT <replaceable class="PARAMETER">alias</> <type>text</>, OUT <replaceable class="PARAMETER">description</> <type>text</>)</literal></entry>
78967896
<entry><type>setof record</type></entry>
78977897
<entry>get token types defined by parser</entry>
78987898
<entry><literal>ts_token_type(3722)</literal></entry>
7899-
<entry><literal>(1,lword,"Latin word") ...</literal></entry>
7899+
<entry><literal>(1,asciiword,"Word, all ASCII") ...</literal></entry>
79007900
</row>
79017901
<row>
79027902
<entry><literal><function>ts_stat</function>(<replaceable class="PARAMETER">sqlquery</replaceable> <type>text</>, <optional> <replaceable class="PARAMETER">weights</replaceable> <type>text</>, </optional> OUT <replaceable class="PARAMETER">word</replaceable> <type>text</>, OUT <replaceable class="PARAMETER">ndoc</replaceable> <type>integer</>, OUT <replaceable class="PARAMETER">nentry</replaceable> <type>integer</>)</literal></entry>

doc/src/sgml/textsearch.sgml

Lines changed: 189 additions & 176 deletions
Large diffs are not rendered by default.

src/backend/snowball/Makefile

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#
33
# Makefile for src/backend/snowball
44
#
5-
# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.3 2007/08/27 10:29:49 mha Exp $
5+
# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.4 2007/10/23 20:46:12 tgl Exp $
66
#
77
#-------------------------------------------------------------------------
88

@@ -46,8 +46,9 @@ OBJS= dict_snowball.o api.o utilities.o \
4646
stem_UTF_8_swedish.o \
4747
stem_UTF_8_turkish.o
4848

49-
# second column is name of latin dictionary, if different
50-
# Note order dependency: use of some other language as latin dictionary
49+
# first column is language name and also name of dictionary for not-all-ASCII
50+
# words, second is name of dictionary for all-ASCII words
51+
# Note order dependency: use of some other language as ASCII dictionary
5152
# must come after creation of that language
5253
LANGUAGES= \
5354
danish danish \
@@ -95,8 +96,8 @@ ifeq ($(enable_shared), yes)
9596
while [ "$$#" -gt 0 ] ; \
9697
do \
9798
lang=$$1; shift; \
98-
nonlatdictname=$$lang; \
99-
latdictname=$$1; shift; \
99+
nonascdictname=$$lang; \
100+
ascdictname=$$1; shift; \
100101
if [ -s $(srcdir)/stopwords/$${lang}.stop ] ; then \
101102
stop=", StopWords=$${lang}" ; \
102103
else \
@@ -106,8 +107,8 @@ ifeq ($(enable_shared), yes)
106107
sed -e "s#_LANGNAME_#$$lang#g" | \
107108
sed -e "s#_DICTNAME_#$${lang}_stem#g" | \
108109
sed -e "s#_CFGNAME_#$$lang#g" | \
109-
sed -e "s#_LATDICTNAME_#$${latdictname}_stem#g" | \
110-
sed -e "s#_NONLATDICTNAME_#$${nonlatdictname}_stem#g" | \
110+
sed -e "s#_ASCDICTNAME_#$${ascdictname}_stem#g" | \
111+
sed -e "s#_NONASCDICTNAME_#$${nonascdictname}_stem#g" | \
111112
sed -e "s#_STOPWORDS_#$$stop#g" ; \
112113
done >> $@
113114
else

src/backend/snowball/snowball.sql.in

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.4 2007/09/03 02:30:43 tgl Exp $$
1+
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.5 2007/10/23 20:46:12 tgl Exp $$
22

33
-- text search configuration for _LANGNAME_ language
44
CREATE TEXT SEARCH DICTIONARY _DICTNAME_
@@ -12,14 +12,15 @@ CREATE TEXT SEARCH CONFIGURATION _CFGNAME_
1212
COMMENT ON TEXT SEARCH CONFIGURATION _CFGNAME_ IS 'configuration for _LANGNAME_ language';
1313

1414
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
15-
FOR email, url, host, sfloat, version, uri, file, float, int, uint
15+
FOR email, url, host, sfloat, version, uri, file, float, int, uint,
16+
numword, hword_numpart, numhword
1617
WITH simple;
1718

1819
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
19-
FOR lhword, lpart_hword, lword
20-
WITH _LATDICTNAME_;
20+
FOR asciiword, hword_asciipart, asciihword
21+
WITH _ASCDICTNAME_;
2122

2223
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
23-
FOR hword, nlhword, nlpart_hword, nlword, word, part_hword
24-
WITH _NONLATDICTNAME_;
24+
FOR word, hword_part, hword
25+
WITH _NONASCDICTNAME_;
2526

0 commit comments

Comments
 (0)