Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 73e6f9d

Browse files
committed
Change text search parsing rules for hyphenated words so that digit strings containing decimal points aren't considered part of a hyphenated word. Sync the hyphenated-word lookahead states with the subsequent part-by-part reparsing states so that we don't get different answers about how much text is part of the hyphenated word. Per my gripe of a few days ago.
1 parent 1aaf39b commit 73e6f9d

File tree

2 files changed

+21
-82
lines changed

2 files changed

+21
-82
lines changed

src/backend/tsearch/wparser_def.c

+13 -70
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.6 2007/10/27 17:53:15 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.7 2007/10/27 19:03:45 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -181,19 +181,13 @@ typedef enum
181181
TPS_InHyphenWord,
182182
TPS_InHyphenNumWordFirst,
183183
TPS_InHyphenNumWord,
184-
TPS_InHyphenValueFirst,
185-
TPS_InHyphenValue,
186-
TPS_InHyphenValueExact,
184+
TPS_InHyphenDigitLookahead,
187185
TPS_InParseHyphen,
188186
TPS_InParseHyphenHyphen,
189187
TPS_InHyphenWordPart,
190188
TPS_InHyphenAsciiWordPart,
191189
TPS_InHyphenNumWordPart,
192190
TPS_InHyphenUnsignedInt,
193-
TPS_InHDecimalPartFirst,
194-
TPS_InHDecimalPart,
195-
TPS_InHVersionPartFirst,
196-
TPS_InHVersionPart,
197191
TPS_Null /* last state (fake value) */
198192
} TParserState;
199193

@@ -1147,8 +1141,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
11471141
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
11481142
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
11491143
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1150-
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
1151-
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1144+
{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
11521145
{NULL, 0, A_POP, TPS_Null, 0, NULL}
11531146
};
11541147

@@ -1164,8 +1157,7 @@ static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
11641157
static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
11651158
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
11661159
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1167-
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
1168-
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1160+
{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
11691161
{NULL, 0, A_POP, TPS_Null, 0, NULL}
11701162
};
11711163

@@ -1179,8 +1171,8 @@ static const TParserStateActionItem actionTPS_InHyphenWord[] = {
11791171

11801172
static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
11811173
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1182-
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
11831174
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1175+
{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
11841176
{NULL, 0, A_POP, TPS_Null, 0, NULL}
11851177
};
11861178

@@ -1191,34 +1183,18 @@ static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
11911183
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
11921184
};
11931185

1194-
static const TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
1186+
static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
11951187
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1196-
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
1197-
{NULL, 0, A_POP, TPS_Null, 0, NULL}
1198-
};
1199-
1200-
static const TParserStateActionItem actionTPS_InHyphenValue[] = {
1201-
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1202-
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
1203-
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
1204-
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1188+
{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
12051189
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1206-
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1207-
};
1208-
1209-
static const TParserStateActionItem actionTPS_InHyphenValueExact[] = {
1210-
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1211-
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
1212-
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
1213-
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1214-
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1190+
{NULL, 0, A_POP, TPS_Null, 0, NULL}
12151191
};
12161192

12171193
static const TParserStateActionItem actionTPS_InParseHyphen[] = {
12181194
{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
12191195
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
12201196
{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1221-
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
1197+
{p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
12221198
{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
12231199
{NULL, 0, A_RERUN, TPS_Base, 0, NULL}
12241200
};
@@ -1251,39 +1227,12 @@ static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
12511227
};
12521228

12531229
static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1254-
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1255-
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
1256-
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1257-
{p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL},
1258-
{NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL}
1259-
};
1260-
1261-
static const TParserStateActionItem actionTPS_InHDecimalPartFirst[] = {
1262-
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1263-
{p_isdigit, 0, A_CLEAR, TPS_InHDecimalPart, 0, NULL},
1264-
{NULL, 0, A_POP, TPS_Null, 0, NULL}
1265-
};
1266-
1267-
static const TParserStateActionItem actionTPS_InHDecimalPart[] = {
1268-
{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL, NULL},
1269-
{p_isdigit, 0, A_NEXT, TPS_InHDecimalPart, 0, NULL},
1270-
{p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst, 0, NULL},
1271-
{NULL, 0, A_BINGO, TPS_InParseHyphen, DECIMAL, NULL}
1272-
};
1273-
1274-
static const TParserStateActionItem actionTPS_InHVersionPartFirst[] = {
12751230
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1276-
{p_isdigit, 0, A_CLEAR, TPS_InHVersionPart, 0, NULL},
1231+
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1232+
{p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
12771233
{NULL, 0, A_POP, TPS_Null, 0, NULL}
12781234
};
12791235

1280-
static const TParserStateActionItem actionTPS_InHVersionPart[] = {
1281-
{p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1282-
{p_isdigit, 0, A_NEXT, TPS_InHVersionPart, 0, NULL},
1283-
{p_iseqC, '.', A_PUSH, TPS_InHVersionPartFirst, 0, NULL},
1284-
{NULL, 0, A_BINGO, TPS_InParseHyphen, VERSIONNUMBER, NULL}
1285-
};
1286-
12871236

12881237
/*
12891238
* main table of per-state parser actions
@@ -1378,19 +1327,13 @@ static const TParserStateAction Actions[] = {
13781327
TPARSERSTATEACTION(TPS_InHyphenWord),
13791328
TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
13801329
TPARSERSTATEACTION(TPS_InHyphenNumWord),
1381-
TPARSERSTATEACTION(TPS_InHyphenValueFirst),
1382-
TPARSERSTATEACTION(TPS_InHyphenValue),
1383-
TPARSERSTATEACTION(TPS_InHyphenValueExact),
1330+
TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
13841331
TPARSERSTATEACTION(TPS_InParseHyphen),
13851332
TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
13861333
TPARSERSTATEACTION(TPS_InHyphenWordPart),
13871334
TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
13881335
TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1389-
TPARSERSTATEACTION(TPS_InHyphenUnsignedInt),
1390-
TPARSERSTATEACTION(TPS_InHDecimalPartFirst),
1391-
TPARSERSTATEACTION(TPS_InHDecimalPart),
1392-
TPARSERSTATEACTION(TPS_InHVersionPartFirst),
1393-
TPARSERSTATEACTION(TPS_InHVersionPart)
1336+
TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
13941337
};
13951338

13961339

src/test/regress/expected/tsearch.out

+8 -12
Original file line number | Diff line number | Diff line change
@@ -352,15 +352,11 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
352352
12 | .
353353
20 | 4.2
354354
12 | ,
355-
15 | readline-4.2
356-
11 | readline
357-
12 | -
358-
20 | 4.2
355+
1 | readline
356+
20 | -4.2
359357
12 |
360-
15 | readline-4.2
361-
11 | readline
362-
12 | -
363-
20 | 4.2
358+
1 | readline
359+
20 | -4.2
364360
12 | .
365361
22 | 234
366362
12 |
@@ -377,14 +373,14 @@ SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.w
377373
12 |
378374
12 | <>
379375
1 | qwerty
380-
(135 rows)
376+
(131 rows)
381377

382378
SELECT to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
383379
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
384380
<i <b> wow < jqw <> qwerty');
385-
to_tsvector
386-
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
387-
'ad':17 'dw':19 'jf':39 '234':63 '345':1 '4.2':54,55,56,59,62 '455':31 'jqw':66 'qwe':2,18,27,28,35 'wer':36 'wow':65 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':67 '234.435':30 'qwe-wer':34 'readlin':53,58,61 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 'readline-4.2':57,60 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
381+
to_tsvector
382+
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
383+
'ad':17 'dw':19 'jf':39 '234':61 '345':1 '4.2':54,55,56 '455':31 'jqw':64 'qwe':2,18,27,28,35 'wer':36 'wow':63 '-4.2':58,60 'asdf':37 'ewr1':43 'qwer':38 'sdjk':40 '5.005':32 'efd.r':3 'ewri2':44 'hjwer':42 'qwqwe':29 'wefjn':48 'gist.c':52 'gist.h':50 'qwerti':65 '234.435':30 'qwe-wer':34 'readlin':53,57,59 'www.com':4 '+4.0e-10':26 'gist.h.c':51 'rewt/ewr':47 '/?ad=qwe&dw':7,10,14,22 '/wqe-324/ewr':49 'aew.werc.ewr':6 '1aew.werc.ewr':9 '2aew.werc.ewr':11 '3aew.werc.ewr':13 '4aew.werc.ewr':15 '/usr/local/fff':45 '/awdf/dwqe/4325':46 'teodor@stack.net':33 '/?ad=qwe&dw=%20%32':25 '5aew.werc.ewr:8100':16 '6aew.werc.ewr:8100':21 '7aew.werc.ewr:8100':24 'aew.werc.ewr/?ad=qwe&dw':5 '1aew.werc.ewr/?ad=qwe&dw':8 '3aew.werc.ewr/?ad=qwe&dw':12 '6aew.werc.ewr:8100/?ad=qwe&dw':20 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':23
388384
(1 row)
389385

390386
SELECT length(to_tsvector('english', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">

0 commit comments

Comments
 (0)