|
7 | 7 | *
|
8 | 8 | *
|
9 | 9 | * IDENTIFICATION
|
10 |
| - * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $ |
| 10 | + * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $ |
11 | 11 | *
|
12 | 12 | *-------------------------------------------------------------------------
|
13 | 13 | */
|
@@ -257,93 +257,147 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
|
257 | 257 | *
|
258 | 258 | * 1 - select(oper) in NOT nodes
|
259 | 259 | *
|
260 |
| - * freq[val] in VAL nodes, if the value is in MCELEM |
| 260 | + * histogram-based estimation in prefix VAL nodes |
| 261 | + * |
| 262 | + * freq[val] in exact VAL nodes, if the value is in MCELEM |
261 | 263 | * min(freq[MCELEM]) / 2 in VAL nodes, if it is not
|
262 | 264 | *
|
263 | 265 | * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
|
264 | 266 | * binary search for determining freq[MCELEM].
|
265 | 267 | *
|
266 | 268 | * If we don't have stats for the tsvector, we still use this logic,
|
267 |
| - * except we always use DEFAULT_TS_MATCH_SEL for VAL nodes. This case |
268 |
| - * is signaled by lookup == NULL. |
| 269 | + * except we use default estimates for VAL nodes. This case is signaled |
| 270 | + * by lookup == NULL. |
269 | 271 | */
|
270 | 272 | static Selectivity
|
271 | 273 | tsquery_opr_selec(QueryItem *item, char *operand,
|
272 | 274 | TextFreq *lookup, int length, float4 minfreq)
|
273 | 275 | {
|
274 |
| - LexemeKey key; |
275 |
| - TextFreq *searchres; |
276 |
| - Selectivity selec, |
277 |
| - s1, |
278 |
| - s2; |
| 276 | + Selectivity selec; |
279 | 277 |
|
280 | 278 | /* since this function recurses, it could be driven to stack overflow */
|
281 | 279 | check_stack_depth();
|
282 | 280 |
|
283 | 281 | if (item->type == QI_VAL)
|
284 | 282 | {
|
285 | 283 | QueryOperand *oper = (QueryOperand *) item;
|
286 |
| - |
287 |
| - /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */ |
288 |
| - if (lookup == NULL) |
289 |
| - return (Selectivity) DEFAULT_TS_MATCH_SEL; |
| 284 | + LexemeKey key; |
290 | 285 |
|
291 | 286 | /*
|
292 | 287 | * Prepare the key for bsearch().
|
293 | 288 | */
|
294 | 289 | key.lexeme = operand + oper->distance;
|
295 | 290 | key.length = oper->length;
|
296 | 291 |
|
297 |
| - searchres = (TextFreq *) bsearch(&key, lookup, length, |
298 |
| - sizeof(TextFreq), |
299 |
| - compare_lexeme_textfreq); |
300 |
| - |
301 |
| - if (searchres) |
| 292 | + if (oper->prefix) |
302 | 293 | {
|
| 294 | + /* Prefix match, ie the query item is lexeme:* */ |
| 295 | + Selectivity matched, |
| 296 | + allmcvs; |
| 297 | + int i; |
| 298 | + |
| 299 | + /* |
| 300 | + * Our strategy is to scan through the MCV list and add up the |
| 301 | + * frequencies of the ones that match the prefix, thereby |
| 302 | + * assuming that the MCVs are representative of the whole lexeme |
| 303 | + * population in this respect. Compare histogram_selectivity(). |
| 304 | + * |
| 305 | + * This is only a good plan if we have a pretty fair number of |
| 306 | + * MCVs available; we set the threshold at 100. If no stats or |
| 307 | + * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4. |
| 308 | + */ |
| 309 | + if (lookup == NULL || length < 100) |
| 310 | + return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4); |
| 311 | + |
| 312 | + matched = allmcvs = 0; |
| 313 | + for (i = 0; i < length; i++) |
| 314 | + { |
| 315 | + TextFreq *t = lookup + i; |
| 316 | + int tlen = VARSIZE_ANY_EXHDR(t->element); |
| 317 | + |
| 318 | + if (tlen >= key.length && |
| 319 | + strncmp(key.lexeme, VARDATA_ANY(t->element), |
| 320 | + key.length) == 0) |
| 321 | + matched += t->frequency; |
| 322 | + allmcvs += t->frequency; |
| 323 | + } |
| 324 | + |
| 325 | + if (allmcvs > 0) /* paranoia about zero divide */ |
| 326 | + selec = matched / allmcvs; |
| 327 | + else |
| 328 | + selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4); |
| 329 | + |
303 | 330 | /*
|
304 |
| - * The element is in MCELEM. Return precise selectivity (or at |
305 |
| - * least as precise as ANALYZE could find out). |
| 331 | + * In any case, never believe that a prefix match has selectivity |
| 332 | + * less than DEFAULT_TS_MATCH_SEL. |
306 | 333 | */
|
307 |
| - return (Selectivity) searchres->frequency; |
| 334 | + selec = Max(DEFAULT_TS_MATCH_SEL, selec); |
308 | 335 | }
|
309 | 336 | else
|
310 | 337 | {
|
311 |
| - /* |
312 |
| - * The element is not in MCELEM. Punt, but assume that the |
313 |
| - * selectivity cannot be more than minfreq / 2. |
314 |
| - */ |
315 |
| - return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2); |
| 338 | + /* Regular exact lexeme match */ |
| 339 | + TextFreq *searchres; |
| 340 | + |
| 341 | + /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */ |
| 342 | + if (lookup == NULL) |
| 343 | + return (Selectivity) DEFAULT_TS_MATCH_SEL; |
| 344 | + |
| 345 | + searchres = (TextFreq *) bsearch(&key, lookup, length, |
| 346 | + sizeof(TextFreq), |
| 347 | + compare_lexeme_textfreq); |
| 348 | + |
| 349 | + if (searchres) |
| 350 | + { |
| 351 | + /* |
| 352 | + * The element is in MCELEM. Return precise selectivity (or |
| 353 | + * at least as precise as ANALYZE could find out). |
| 354 | + */ |
| 355 | + selec = searchres->frequency; |
| 356 | + } |
| 357 | + else |
| 358 | + { |
| 359 | + /* |
| 360 | + * The element is not in MCELEM. Punt, but assume that the |
| 361 | + * selectivity cannot be more than minfreq / 2. |
| 362 | + */ |
| 363 | + selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2); |
| 364 | + } |
316 | 365 | }
|
317 | 366 | }
|
318 |
| - |
319 |
| - /* Current TSQuery node is an operator */ |
320 |
| - switch (item->qoperator.oper) |
| 367 | + else |
321 | 368 | {
|
322 |
| - case OP_NOT: |
323 |
| - selec = 1.0 - tsquery_opr_selec(item + 1, operand, |
324 |
| - lookup, length, minfreq); |
325 |
| - break; |
326 |
| - |
327 |
| - case OP_AND: |
328 |
| - s1 = tsquery_opr_selec(item + 1, operand, |
329 |
| - lookup, length, minfreq); |
330 |
| - s2 = tsquery_opr_selec(item + item->qoperator.left, operand, |
331 |
| - lookup, length, minfreq); |
332 |
| - selec = s1 * s2; |
333 |
| - break; |
334 |
| - |
335 |
| - case OP_OR: |
336 |
| - s1 = tsquery_opr_selec(item + 1, operand, |
337 |
| - lookup, length, minfreq); |
338 |
| - s2 = tsquery_opr_selec(item + item->qoperator.left, operand, |
339 |
| - lookup, length, minfreq); |
340 |
| - selec = s1 + s2 - s1 * s2; |
341 |
| - break; |
342 |
| - |
343 |
| - default: |
344 |
| - elog(ERROR, "unrecognized operator: %d", item->qoperator.oper); |
345 |
| - selec = 0; /* keep compiler quiet */ |
346 |
| - break; |
| 369 | + /* Current TSQuery node is an operator */ |
| 370 | + Selectivity s1, |
| 371 | + s2; |
| 372 | + |
| 373 | + switch (item->qoperator.oper) |
| 374 | + { |
| 375 | + case OP_NOT: |
| 376 | + selec = 1.0 - tsquery_opr_selec(item + 1, operand, |
| 377 | + lookup, length, minfreq); |
| 378 | + break; |
| 379 | + |
| 380 | + case OP_AND: |
| 381 | + s1 = tsquery_opr_selec(item + 1, operand, |
| 382 | + lookup, length, minfreq); |
| 383 | + s2 = tsquery_opr_selec(item + item->qoperator.left, operand, |
| 384 | + lookup, length, minfreq); |
| 385 | + selec = s1 * s2; |
| 386 | + break; |
| 387 | + |
| 388 | + case OP_OR: |
| 389 | + s1 = tsquery_opr_selec(item + 1, operand, |
| 390 | + lookup, length, minfreq); |
| 391 | + s2 = tsquery_opr_selec(item + item->qoperator.left, operand, |
| 392 | + lookup, length, minfreq); |
| 393 | + selec = s1 + s2 - s1 * s2; |
| 394 | + break; |
| 395 | + |
| 396 | + default: |
| 397 | + elog(ERROR, "unrecognized operator: %d", item->qoperator.oper); |
| 398 | + selec = 0; /* keep compiler quiet */ |
| 399 | + break; |
| 400 | + } |
347 | 401 | }
|
348 | 402 |
|
349 | 403 | /* Clamp intermediate results to stay sane despite roundoff error */
|
|
0 commit comments