Add some knowledge about prefix matches to tsmatchsel(). It's not terribly

tglsfdc · tglsfdc · commit 97532f7c2946 · 2010-08-01T21:31:08.000Z
bright, but it beats assuming that a prefix match behaves identically to an
exact match, which is what the code was doing before :-(.  Noted while
experimenting with Artur Dobrowski's example.
diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -257,93 +257,147 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
  *
  *	 1 - select(oper) in NOT nodes
  *
- *	 freq[val] in VAL nodes, if the value is in MCELEM
+ *	 matching-MCELEM frequency / total MCELEM frequency in prefix VAL nodes
+ *
+ *	 freq[val] in exact VAL nodes, if the value is in MCELEM
  *	 min(freq[MCELEM]) / 2 in VAL nodes, if it is not
  *
  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
  * binary search for determining freq[MCELEM].
  *
  * If we don't have stats for the tsvector, we still use this logic,
- * except we always use DEFAULT_TS_MATCH_SEL for VAL nodes.  This case
- * is signaled by lookup == NULL.
+ * except we use default estimates for VAL nodes.  This case is signaled
+ * by lookup == NULL.
  */
 static Selectivity
 tsquery_opr_selec(QueryItem *item, char *operand,
 				  TextFreq *lookup, int length, float4 minfreq)
 {
-	LexemeKey	key;
-	TextFreq   *searchres;
-	Selectivity selec,
-				s1,
-				s2;
+	Selectivity selec;
 
 	/* since this function recurses, it could be driven to stack overflow */
 	check_stack_depth();
 
 	if (item->type == QI_VAL)
 	{
 		QueryOperand *oper = (QueryOperand *) item;
-
-		/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
-		if (lookup == NULL)
-			return (Selectivity) DEFAULT_TS_MATCH_SEL;
+		LexemeKey	key;
 
 		/*
 		 * Prepare the key for bsearch().
 		 */
 		key.lexeme = operand + oper->distance;
 		key.length = oper->length;
 
-		searchres = (TextFreq *) bsearch(&key, lookup, length,
-										 sizeof(TextFreq),
-										 compare_lexeme_textfreq);
-
-		if (searchres)
+		if (oper->prefix)
 		{
+			/* Prefix match, ie the query item is lexeme:* */
+			Selectivity matched,
+						allmcvs;
+			int			i;
+
+			/*
+			 * Our strategy is to scan through the MCV list and add up the
+			 * frequencies of the ones that match the prefix, thereby
+			 * assuming that the MCVs are representative of the whole lexeme
+			 * population in this respect.  Compare histogram_selectivity().
+			 *
+			 * This is only a good plan if we have a pretty fair number of
+			 * MCVs available; we set the threshold at 100.  If no stats or
+			 * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
+			 */
+			if (lookup == NULL || length < 100)
+				return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
+
+			matched = allmcvs = 0;
+			for (i = 0; i < length; i++)
+			{
+				TextFreq   *t = lookup + i;
+				int			tlen = VARSIZE_ANY_EXHDR(t->element);
+
+				if (tlen >= key.length &&
+					strncmp(key.lexeme, VARDATA_ANY(t->element),
+							key.length) == 0)
+					matched += t->frequency;
+				allmcvs += t->frequency;
+			}
+
+			if (allmcvs > 0)	/* paranoia about zero divide */
+				selec = matched / allmcvs;
+			else
+				selec = (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
+
 			/*
-			 * The element is in MCELEM.  Return precise selectivity (or at
-			 * least as precise as ANALYZE could find out).
+			 * In any case, never believe that a prefix match has selectivity
+			 * less than DEFAULT_TS_MATCH_SEL.
 			 */
-			return (Selectivity) searchres->frequency;
+			selec = Max(DEFAULT_TS_MATCH_SEL, selec);
 		}
 		else
 		{
-			/*
-			 * The element is not in MCELEM.  Punt, but assume that the
-			 * selectivity cannot be more than minfreq / 2.
-			 */
-			return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
+			/* Regular exact lexeme match */
+			TextFreq   *searchres;
+
+			/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
+			if (lookup == NULL)
+				return (Selectivity) DEFAULT_TS_MATCH_SEL;
+
+			searchres = (TextFreq *) bsearch(&key, lookup, length,
+											 sizeof(TextFreq),
+											 compare_lexeme_textfreq);
+
+			if (searchres)
+			{
+				/*
+				 * The element is in MCELEM.  Return precise selectivity (or
+				 * at least as precise as ANALYZE could find out).
+				 */
+				selec = searchres->frequency;
+			}
+			else
+			{
+				/*
+				 * The element is not in MCELEM.  Punt, but assume that the
+				 * selectivity cannot be more than minfreq / 2.
+				 */
+				selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
+			}
 		}
 	}
-
-	/* Current TSQuery node is an operator */
-	switch (item->qoperator.oper)
+	else
 	{
-		case OP_NOT:
-			selec = 1.0 - tsquery_opr_selec(item + 1, operand,
-											lookup, length, minfreq);
-			break;
-
-		case OP_AND:
-			s1 = tsquery_opr_selec(item + 1, operand,
-								   lookup, length, minfreq);
-			s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
-								   lookup, length, minfreq);
-			selec = s1 * s2;
-			break;
-
-		case OP_OR:
-			s1 = tsquery_opr_selec(item + 1, operand,
-								   lookup, length, minfreq);
-			s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
-								   lookup, length, minfreq);
-			selec = s1 + s2 - s1 * s2;
-			break;
-
-		default:
-			elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
-			selec = 0;			/* keep compiler quiet */
-			break;
+		/* Current TSQuery node is an operator */
+		Selectivity s1,
+					s2;
+
+		switch (item->qoperator.oper)
+		{
+			case OP_NOT:
+				selec = 1.0 - tsquery_opr_selec(item + 1, operand,
+												lookup, length, minfreq);
+				break;
+
+			case OP_AND:
+				s1 = tsquery_opr_selec(item + 1, operand,
+									   lookup, length, minfreq);
+				s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
+									   lookup, length, minfreq);
+				selec = s1 * s2;
+				break;
+
+			case OP_OR:
+				s1 = tsquery_opr_selec(item + 1, operand,
+									   lookup, length, minfreq);
+				s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
+									   lookup, length, minfreq);
+				selec = s1 + s2 - s1 * s2;
+				break;
+
+			default:
+				elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
+				selec = 0;			/* keep compiler quiet */
+				break;
+		}
 	}
 
 	/* Clamp intermediate results to stay sane despite roundoff error */