Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 97532f7

Browse files
committed
Add some knowledge about prefix matches to tsmatchsel(). It's not terribly
bright, but it beats assuming that a prefix match behaves identically to an exact match, which is what the code was doing before :-(. Noted while experimenting with Artur Dobrowski's example.
1 parent d4fe61b commit 97532f7

File tree

1 file changed

+108
-54
lines changed

1 file changed

+108
-54
lines changed

src/backend/tsearch/ts_selfuncs.c

+108-54
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -257,93 +257,147 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
257257
*
258258
* 1 - select(oper) in NOT nodes
259259
*
260-
* freq[val] in VAL nodes, if the value is in MCELEM
260+
* histogram-based estimation in prefix VAL nodes
261+
*
262+
* freq[val] in exact VAL nodes, if the value is in MCELEM
261263
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
262264
*
263265
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
264266
* binary search for determining freq[MCELEM].
265267
*
266268
* If we don't have stats for the tsvector, we still use this logic,
267-
* except we always use DEFAULT_TS_MATCH_SEL for VAL nodes. This case
268-
* is signaled by lookup == NULL.
269+
* except we use default estimates for VAL nodes. This case is signaled
270+
* by lookup == NULL.
269271
*/
270272
static Selectivity
271273
tsquery_opr_selec(QueryItem *item, char *operand,
272274
TextFreq *lookup, int length, float4 minfreq)
273275
{
274-
LexemeKey key;
275-
TextFreq *searchres;
276-
Selectivity selec,
277-
s1,
278-
s2;
276+
Selectivity selec;
279277

280278
/* since this function recurses, it could be driven to stack overflow */
281279
check_stack_depth();
282280

283281
if (item->type == QI_VAL)
284282
{
285283
QueryOperand *oper = (QueryOperand *) item;
286-
287-
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
288-
if (lookup == NULL)
289-
return (Selectivity) DEFAULT_TS_MATCH_SEL;
284+
LexemeKey key;
290285

291286
/*
292287
* Prepare the key for bsearch().
293288
*/
294289
key.lexeme = operand + oper->distance;
295290
key.length = oper->length;
296291

297-
searchres = (TextFreq *) bsearch(&key, lookup, length,
298-
sizeof(TextFreq),
299-
compare_lexeme_textfreq);
300-
301-
if (searchres)
292+
if (oper->prefix)
302293
{
294+
/* Prefix match, ie the query item is lexeme:* */
295+
Selectivity matched,
296+
allmcvs;
297+
int i;
298+
299+
/*
300+
* Our strategy is to scan through the MCV list and add up the
301+
* frequencies of the ones that match the prefix, thereby
302+
* assuming that the MCVs are representative of the whole lexeme
303+
* population in this respect. Compare histogram_selectivity().
304+
*
305+
* This is only a good plan if we have a pretty fair number of
306+
* MCVs available; we set the threshold at 100. If no stats or
307+
* insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
308+
*/
309+
if (lookup == NULL || length < 100)
310+
return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
311+
312+
matched = allmcvs = 0;
313+
for (i = 0; i < length; i++)
314+
{
315+
TextFreq *t = lookup + i;
316+
int tlen = VARSIZE_ANY_EXHDR(t->element);
317+
318+
if (tlen >= key.length &&
319+
strncmp(key.lexeme, VARDATA_ANY(t->element),
320+
key.length) == 0)
321+
matched += t->frequency;
322+
allmcvs += t->frequency;
323+
}
324+
325+
if (allmcvs > 0) /* paranoia about zero divide */
326+
selec = matched / allmcvs;
327+
else
328+
selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
329+
303330
/*
304-
* The element is in MCELEM. Return precise selectivity (or at
305-
* least as precise as ANALYZE could find out).
331+
* In any case, never believe that a prefix match has selectivity
332+
* less than DEFAULT_TS_MATCH_SEL.
306333
*/
307-
return (Selectivity) searchres->frequency;
334+
selec = Max(DEFAULT_TS_MATCH_SEL, selec);
308335
}
309336
else
310337
{
311-
/*
312-
* The element is not in MCELEM. Punt, but assume that the
313-
* selectivity cannot be more than minfreq / 2.
314-
*/
315-
return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
338+
/* Regular exact lexeme match */
339+
TextFreq *searchres;
340+
341+
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
342+
if (lookup == NULL)
343+
return (Selectivity) DEFAULT_TS_MATCH_SEL;
344+
345+
searchres = (TextFreq *) bsearch(&key, lookup, length,
346+
sizeof(TextFreq),
347+
compare_lexeme_textfreq);
348+
349+
if (searchres)
350+
{
351+
/*
352+
* The element is in MCELEM. Return precise selectivity (or
353+
* at least as precise as ANALYZE could find out).
354+
*/
355+
selec = searchres->frequency;
356+
}
357+
else
358+
{
359+
/*
360+
* The element is not in MCELEM. Punt, but assume that the
361+
* selectivity cannot be more than minfreq / 2.
362+
*/
363+
selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
364+
}
316365
}
317366
}
318-
319-
/* Current TSQuery node is an operator */
320-
switch (item->qoperator.oper)
367+
else
321368
{
322-
case OP_NOT:
323-
selec = 1.0 - tsquery_opr_selec(item + 1, operand,
324-
lookup, length, minfreq);
325-
break;
326-
327-
case OP_AND:
328-
s1 = tsquery_opr_selec(item + 1, operand,
329-
lookup, length, minfreq);
330-
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
331-
lookup, length, minfreq);
332-
selec = s1 * s2;
333-
break;
334-
335-
case OP_OR:
336-
s1 = tsquery_opr_selec(item + 1, operand,
337-
lookup, length, minfreq);
338-
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
339-
lookup, length, minfreq);
340-
selec = s1 + s2 - s1 * s2;
341-
break;
342-
343-
default:
344-
elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
345-
selec = 0; /* keep compiler quiet */
346-
break;
369+
/* Current TSQuery node is an operator */
370+
Selectivity s1,
371+
s2;
372+
373+
switch (item->qoperator.oper)
374+
{
375+
case OP_NOT:
376+
selec = 1.0 - tsquery_opr_selec(item + 1, operand,
377+
lookup, length, minfreq);
378+
break;
379+
380+
case OP_AND:
381+
s1 = tsquery_opr_selec(item + 1, operand,
382+
lookup, length, minfreq);
383+
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
384+
lookup, length, minfreq);
385+
selec = s1 * s2;
386+
break;
387+
388+
case OP_OR:
389+
s1 = tsquery_opr_selec(item + 1, operand,
390+
lookup, length, minfreq);
391+
s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
392+
lookup, length, minfreq);
393+
selec = s1 + s2 - s1 * s2;
394+
break;
395+
396+
default:
397+
elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
398+
selec = 0; /* keep compiler quiet */
399+
break;
400+
}
347401
}
348402

349403
/* Clamp intermediate results to stay sane despite roundoff error */

0 commit comments

Comments
 (0)