Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit eb08605

Browse files
committed
Make websearch_to_tsquery() parse text in quotes as a single token
websearch_to_tsquery() splits text in quotes into tokens and connects them with the phrase operator on its own. However, that leads to surprising results when a token contains no words. For instance, websearch_to_tsquery('"aaa: bbb"') is 'aaa <2> bbb', because it is equivalent to to_tsquery(E'aaa <-> \':\' <-> bbb'). But websearch_to_tsquery('"aaa: bbb"') has to be 'aaa <-> bbb' in order to match to_tsvector('aaa: bbb'). Since 0c4f355, we connect lexemes of complex tokens with phrase operators anyway. Thus, let's just make websearch_to_tsquery() parse text in quotes as a single token. As a result, websearch_to_tsquery() processes the quoted text in the same way phraseto_tsquery() does. This solution is exactly what we need, and it also simplifies the code. This commit is an incompatible change, so we don't backpatch it. Reported-by: Valentin Gatien-Baron Discussion: https://postgr.es/m/CA%2B0DEqiZs7gdOd4ikmg%3D0UWG%2BSwWOLxPsk_JW-sx9WNOyrb0KQ%40mail.gmail.com Author: Alexander Korotkov Reviewed-by: Tom Lane, Zhihong Yu
1 parent 651d005 commit eb08605

File tree

3 files changed

+39
-67
lines changed

3 files changed

+39
-67
lines changed

src/backend/utils/adt/tsquery.c

+23-58
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ struct TSQueryParserStateData
7777
char *buf; /* current scan point */
7878
int count; /* nesting count, incremented by (,
7979
* decremented by ) */
80-
bool in_quotes; /* phrase in quotes "" */
8180
ts_parserstate state;
8281

8382
/* polish (prefix) notation in list, filled in by push* functions */
@@ -235,9 +234,6 @@ parse_or_operator(TSQueryParserState pstate)
235234
{
236235
char *ptr = pstate->buf;
237236

238-
if (pstate->in_quotes)
239-
return false;
240-
241237
/* it should begin with "OR" literal */
242238
if (pg_strncasecmp(ptr, "or", 2) != 0)
243239
return false;
@@ -398,38 +394,29 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
398394
state->buf++;
399395
state->state = WAITOPERAND;
400396

401-
if (state->in_quotes)
402-
continue;
403-
404397
*operator = OP_NOT;
405398
return PT_OPR;
406399
}
407400
else if (t_iseq(state->buf, '"'))
408401
{
402+
/* Everything in quotes is processed as a single token */
403+
404+
/* skip opening quote */
409405
state->buf++;
406+
*strval = state->buf;
410407

411-
if (!state->in_quotes)
412-
{
413-
state->state = WAITOPERAND;
408+
/* iterate to the closing quote or end of the string */
409+
while (*state->buf != '\0' && !t_iseq(state->buf, '"'))
410+
state->buf++;
411+
*lenval = state->buf - *strval;
414412

415-
if (strchr(state->buf, '"'))
416-
{
417-
/* quoted text should be ordered <-> */
418-
state->in_quotes = true;
419-
return PT_OPEN;
420-
}
413+
/* skip closing quote if not end of the string */
414+
if (*state->buf != '\0')
415+
state->buf++;
421416

422-
/* web search tolerates missing quotes */
423-
continue;
424-
}
425-
else
426-
{
427-
/* we have to provide an operand */
428-
state->in_quotes = false;
429-
state->state = WAITOPERATOR;
430-
pushStop(state);
431-
return PT_CLOSE;
432-
}
417+
state->state = WAITOPERATOR;
418+
state->count++;
419+
return PT_VAL;
433420
}
434421
else if (ISOPERATOR(state->buf))
435422
{
@@ -467,24 +454,13 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
467454
case WAITOPERATOR:
468455
if (t_iseq(state->buf, '"'))
469456
{
470-
if (!state->in_quotes)
471-
{
472-
/*
473-
* put implicit AND after an operand and handle this
474-
* quote in WAITOPERAND
475-
*/
476-
state->state = WAITOPERAND;
477-
*operator = OP_AND;
478-
return PT_OPR;
479-
}
480-
else
481-
{
482-
state->buf++;
483-
484-
/* just close quotes */
485-
state->in_quotes = false;
486-
return PT_CLOSE;
487-
}
457+
/*
458+
* put implicit AND after an operand and handle this quote
459+
* in WAITOPERAND
460+
*/
461+
state->state = WAITOPERAND;
462+
*operator = OP_AND;
463+
return PT_OPR;
488464
}
489465
else if (parse_or_operator(state))
490466
{
@@ -498,18 +474,8 @@ gettoken_query_websearch(TSQueryParserState state, int8 *operator,
498474
}
499475
else if (!t_isspace(state->buf))
500476
{
501-
if (state->in_quotes)
502-
{
503-
/* put implicit <-> after an operand */
504-
*operator = OP_PHRASE;
505-
*weight = 1;
506-
}
507-
else
508-
{
509-
/* put implicit AND after an operand */
510-
*operator = OP_AND;
511-
}
512-
477+
/* put implicit AND after an operand */
478+
*operator = OP_AND;
513479
state->state = WAITOPERAND;
514480
return PT_OPR;
515481
}
@@ -846,7 +812,6 @@ parse_tsquery(char *buf,
846812
state.buffer = buf;
847813
state.buf = buf;
848814
state.count = 0;
849-
state.in_quotes = false;
850815
state.state = WAITFIRSTOPERAND;
851816
state.polstr = NIL;
852817

src/test/regress/expected/tsearch.out

+15-9
Original file line numberDiff line numberDiff line change
@@ -2678,9 +2678,9 @@ select websearch_to_tsquery('simple', 'abc OR_abc');
26782678

26792679
-- test quotes
26802680
select websearch_to_tsquery('english', '"pg_class pg');
2681-
websearch_to_tsquery
2682-
-------------------------
2683-
'pg' <-> 'class' & 'pg'
2681+
websearch_to_tsquery
2682+
---------------------------
2683+
'pg' <-> 'class' <-> 'pg'
26842684
(1 row)
26852685

26862686
select websearch_to_tsquery('english', 'pg_class pg"');
@@ -2695,6 +2695,12 @@ select websearch_to_tsquery('english', '"pg_class pg"');
26952695
'pg' <-> 'class' <-> 'pg'
26962696
(1 row)
26972697

2698+
select websearch_to_tsquery('english', '"pg_class : pg"');
2699+
websearch_to_tsquery
2700+
---------------------------
2701+
'pg' <-> 'class' <-> 'pg'
2702+
(1 row)
2703+
26982704
select websearch_to_tsquery('english', 'abc "pg_class pg"');
26992705
websearch_to_tsquery
27002706
-----------------------------------
@@ -2708,15 +2714,15 @@ select websearch_to_tsquery('english', '"pg_class pg" def');
27082714
(1 row)
27092715

27102716
select websearch_to_tsquery('english', 'abc "pg pg_class pg" def');
2711-
websearch_to_tsquery
2712-
--------------------------------------------------------
2713-
'abc' & 'pg' <-> ( 'pg' <-> 'class' ) <-> 'pg' & 'def'
2717+
websearch_to_tsquery
2718+
----------------------------------------------------
2719+
'abc' & 'pg' <-> 'pg' <-> 'class' <-> 'pg' & 'def'
27142720
(1 row)
27152721

27162722
select websearch_to_tsquery('english', ' or "pg pg_class pg" or ');
2717-
websearch_to_tsquery
2718-
----------------------------------------
2719-
'pg' <-> ( 'pg' <-> 'class' ) <-> 'pg'
2723+
websearch_to_tsquery
2724+
------------------------------------
2725+
'pg' <-> 'pg' <-> 'class' <-> 'pg'
27202726
(1 row)
27212727

27222728
select websearch_to_tsquery('english', '""pg pg_class pg""');

src/test/regress/sql/tsearch.sql

+1
Original file line numberDiff line numberDiff line change
@@ -759,6 +759,7 @@ select websearch_to_tsquery('simple', 'abc OR_abc');
759759
select websearch_to_tsquery('english', '"pg_class pg');
760760
select websearch_to_tsquery('english', 'pg_class pg"');
761761
select websearch_to_tsquery('english', '"pg_class pg"');
762+
select websearch_to_tsquery('english', '"pg_class : pg"');
762763
select websearch_to_tsquery('english', 'abc "pg_class pg"');
763764
select websearch_to_tsquery('english', '"pg_class pg" def');
764765
select websearch_to_tsquery('english', 'abc "pg pg_class pg" def');

0 commit comments

Comments
 (0)