Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 4e57668

Browse files
committed
Create a selectivity estimation function for the text search @@ operator.
Jan Urbanski
1 parent e2b7d0c commit 4e57668

File tree

9 files changed

+463
-30
lines changed

9 files changed

+463
-30
lines changed

doc/src/sgml/catalogs.sgml

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.174 2008/09/15 18:43:41 tgl Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.175 2008/09/19 19:03:40 tgl Exp $ -->
22
<!--
33
Documentation of the system catalogs, directed toward PostgreSQL developers
44
-->
@@ -6664,6 +6664,9 @@
66646664
A list of the frequencies of the most common values or elements,
66656665
i.e., number of occurrences of each divided by total number of rows.
66666666
(NULL when <structfield>most_common_vals</structfield> is.)
6667+
For some datatypes such as <type>tsvector</>, it can also store some
6668+
additional information, making it longer than the
6669+
<structfield>most_common_vals</> array.
66676670
</entry>
66686671
</row>
66696672

src/backend/tsearch/Makefile

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
# Copyright (c) 2006-2008, PostgreSQL Global Development Group
66
#
7-
# $PostgreSQL: pgsql/src/backend/tsearch/Makefile,v 1.7 2008/07/14 00:51:45 tgl Exp $
7+
# $PostgreSQL: pgsql/src/backend/tsearch/Makefile,v 1.8 2008/09/19 19:03:40 tgl Exp $
88
#
99
#-------------------------------------------------------------------------
1010
subdir = src/backend/tsearch
@@ -19,7 +19,7 @@ DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \
1919
OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
2020
dict_simple.o dict_synonym.o dict_thesaurus.o \
2121
dict_ispell.o regis.o spell.o \
22-
to_tsany.o ts_typanalyze.o ts_utils.o
22+
to_tsany.o ts_selfuncs.o ts_typanalyze.o ts_utils.o
2323

2424
include $(top_srcdir)/src/backend/common.mk
2525

src/backend/tsearch/ts_selfuncs.c

+363
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,363 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* ts_selfuncs.c
4+
* Selectivity estimation functions for text search operators.
5+
*
6+
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7+
*
8+
*
9+
* IDENTIFICATION
10+
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.1 2008/09/19 19:03:40 tgl Exp $
11+
*
12+
*-------------------------------------------------------------------------
13+
*/
14+
#include "postgres.h"
15+
16+
#include "catalog/pg_statistic.h"
17+
#include "catalog/pg_type.h"
18+
#include "miscadmin.h"
19+
#include "nodes/nodes.h"
20+
#include "tsearch/ts_type.h"
21+
#include "utils/lsyscache.h"
22+
#include "utils/selfuncs.h"
23+
#include "utils/syscache.h"
24+
25+
26+
/*
27+
* The default text search selectivity is chosen to be small enough to
28+
* encourage indexscans for typical table densities. See selfuncs.h and
29+
* DEFAULT_EQ_SEL for details.
30+
*/
31+
#define DEFAULT_TS_MATCH_SEL 0.005
32+
33+
/* lookup table type for binary searching through MCELEMs */
34+
typedef struct
35+
{
36+
text *element;
37+
float4 frequency;
38+
} TextFreq;
39+
40+
/* type of keys for bsearch'ing through an array of TextFreqs */
41+
typedef struct
42+
{
43+
char *lexeme;
44+
int length;
45+
} LexemeKey;
46+
47+
static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
48+
static Selectivity mcelem_tsquery_selec(TSQuery query,
49+
Datum *mcelem, int nmcelem,
50+
float4 *numbers, int nnumbers);
51+
static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
52+
TextFreq *lookup, int length, float4 minfreq);
53+
static int compare_lexeme_textfreq(const void *e1, const void *e2);
54+
55+
56+
/*
57+
* tsmatchsel -- Selectivity of "@@"
58+
*
59+
* restriction selectivity function for tsvector @@ tsquery and
60+
* tsquery @@ tsvector
61+
*/
62+
Datum
63+
tsmatchsel(PG_FUNCTION_ARGS)
64+
{
65+
PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
66+
#ifdef NOT_USED
67+
Oid operator = PG_GETARG_OID(1);
68+
#endif
69+
List *args = (List *) PG_GETARG_POINTER(2);
70+
int varRelid = PG_GETARG_INT32(3);
71+
VariableStatData vardata;
72+
Node *other;
73+
bool varonleft;
74+
Selectivity selec;
75+
76+
/*
77+
* If expression is not variable = something or something = variable, then
78+
* punt and return a default estimate.
79+
*/
80+
if (!get_restriction_variable(root, args, varRelid,
81+
&vardata, &other, &varonleft))
82+
PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
83+
84+
/*
85+
* Can't do anything useful if the something is not a constant, either.
86+
*/
87+
if (!IsA(other, Const))
88+
{
89+
ReleaseVariableStats(vardata);
90+
PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
91+
}
92+
93+
/*
94+
* The "@@" operator is strict, so we can cope with NULL right away
95+
*/
96+
if (((Const *) other)->constisnull)
97+
{
98+
ReleaseVariableStats(vardata);
99+
PG_RETURN_FLOAT8(0.0);
100+
}
101+
102+
/*
103+
* OK, there's a Var and a Const we're dealing with here. We need the Var
104+
* to be a TSVector (or else we don't have any useful statistic for it).
105+
* We have to check this because the Var might be the TSQuery not the
106+
* TSVector.
107+
*/
108+
if (vardata.vartype == TSVECTOROID)
109+
{
110+
/* tsvector @@ tsquery or the other way around */
111+
Assert(((Const *) other)->consttype == TSQUERYOID);
112+
113+
selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
114+
}
115+
else
116+
{
117+
/* The Var is something we don't have useful statistics for */
118+
selec = DEFAULT_TS_MATCH_SEL;
119+
}
120+
121+
ReleaseVariableStats(vardata);
122+
123+
CLAMP_PROBABILITY(selec);
124+
125+
PG_RETURN_FLOAT8((float8) selec);
126+
}
127+
128+
129+
/*
130+
* tsmatchjoinsel -- join selectivity of "@@"
131+
*
132+
* join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
133+
*/
134+
Datum
135+
tsmatchjoinsel(PG_FUNCTION_ARGS)
136+
{
137+
/* for the moment we just punt */
138+
PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
139+
}
140+
141+
142+
/*
143+
* @@ selectivity for tsvector var vs tsquery constant
144+
*/
145+
static Selectivity
146+
tsquerysel(VariableStatData *vardata, Datum constval)
147+
{
148+
Selectivity selec;
149+
150+
if (HeapTupleIsValid(vardata->statsTuple))
151+
{
152+
TSQuery query;
153+
Form_pg_statistic stats;
154+
Datum *values;
155+
int nvalues;
156+
float4 *numbers;
157+
int nnumbers;
158+
159+
/* The caller made sure the const is a TSQuery, so get it now */
160+
query = DatumGetTSQuery(constval);
161+
162+
stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
163+
164+
/* MCELEM will be an array of TEXT elements for a tsvector column */
165+
if (get_attstatsslot(vardata->statsTuple,
166+
TEXTOID, -1,
167+
STATISTIC_KIND_MCELEM, InvalidOid,
168+
&values, &nvalues,
169+
&numbers, &nnumbers))
170+
{
171+
/*
172+
* There is a most-common-elements slot for the tsvector Var, so
173+
* use that.
174+
*/
175+
selec = mcelem_tsquery_selec(query, values, nvalues,
176+
numbers, nnumbers);
177+
free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers);
178+
}
179+
else
180+
{
181+
/* No most-common-elements info, so we must punt */
182+
selec = (Selectivity) DEFAULT_TS_MATCH_SEL;
183+
}
184+
}
185+
else
186+
{
187+
/* No stats at all, so we must punt */
188+
selec = (Selectivity) DEFAULT_TS_MATCH_SEL;
189+
}
190+
191+
return selec;
192+
}
193+
194+
/*
195+
* Extract data from the pg_statistic arrays into useful format.
196+
*/
197+
static Selectivity
198+
mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
199+
float4 *numbers, int nnumbers)
200+
{
201+
float4 minfreq;
202+
TextFreq *lookup;
203+
Selectivity selec;
204+
int i;
205+
206+
/*
207+
* There should be two more Numbers than Values, because the last two
208+
* cells are taken for minimal and maximal frequency. Punt if not.
209+
*/
210+
if (nnumbers != nmcelem + 2)
211+
return DEFAULT_TS_MATCH_SEL;
212+
213+
/*
214+
* Transpose the data into a single array so we can use bsearch().
215+
*/
216+
lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
217+
for (i = 0; i < nmcelem; i++)
218+
{
219+
/*
220+
* The text Datums came from an array, so it cannot be compressed
221+
* or stored out-of-line -- it's safe to use VARSIZE_ANY*.
222+
*/
223+
Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
224+
lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
225+
lookup[i].frequency = numbers[i];
226+
}
227+
228+
/*
229+
* Grab the lowest frequency. compute_tsvector_stats() stored it for us in
230+
* the one before the last cell of the Numbers array. See ts_typanalyze.c
231+
*/
232+
minfreq = numbers[nnumbers - 2];
233+
234+
selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
235+
nmcelem, minfreq);
236+
237+
pfree(lookup);
238+
239+
return selec;
240+
}
241+
242+
/*
243+
* Traverse the tsquery in preorder, calculating selectivity as:
244+
*
245+
* selec(left_oper) * selec(right_oper) in AND nodes,
246+
*
247+
* selec(left_oper) + selec(right_oper) -
248+
* selec(left_oper) * selec(right_oper) in OR nodes,
249+
*
250+
* 1 - select(oper) in NOT nodes
251+
*
252+
* freq[val] in VAL nodes, if the value is in MCELEM
253+
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
254+
*
255+
*
256+
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
257+
* binary search for determining freq[MCELEM].
258+
*/
259+
static Selectivity
260+
tsquery_opr_selec(QueryItem *item, char *operand,
261+
TextFreq *lookup, int length, float4 minfreq)
262+
{
263+
LexemeKey key;
264+
TextFreq *searchres;
265+
Selectivity selec, s1, s2;
266+
267+
/* since this function recurses, it could be driven to stack overflow */
268+
check_stack_depth();
269+
270+
if (item->type == QI_VAL)
271+
{
272+
QueryOperand *oper = (QueryOperand *) item;
273+
274+
/*
275+
* Prepare the key for bsearch().
276+
*/
277+
key.lexeme = operand + oper->distance;
278+
key.length = oper->length;
279+
280+
searchres = (TextFreq *) bsearch(&key, lookup, length,
281+
sizeof(TextFreq),
282+
compare_lexeme_textfreq);
283+
284+
if (searchres)
285+
{
286+
/*
287+
* The element is in MCELEM. Return precise selectivity (or at
288+
* least as precise as ANALYZE could find out).
289+
*/
290+
return (Selectivity) searchres->frequency;
291+
}
292+
else
293+
{
294+
/*
295+
* The element is not in MCELEM. Punt, but assert that the
296+
* selectivity cannot be more than minfreq / 2.
297+
*/
298+
return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
299+
}
300+
}
301+
302+
/* Current TSQuery node is an operator */
303+
switch (item->operator.oper)
304+
{
305+
case OP_NOT:
306+
selec = 1.0 - tsquery_opr_selec(item + 1, operand,
307+
lookup, length, minfreq);
308+
break;
309+
310+
case OP_AND:
311+
s1 = tsquery_opr_selec(item + 1, operand,
312+
lookup, length, minfreq);
313+
s2 = tsquery_opr_selec(item + item->operator.left, operand,
314+
lookup, length, minfreq);
315+
selec = s1 * s2;
316+
break;
317+
318+
case OP_OR:
319+
s1 = tsquery_opr_selec(item + 1, operand,
320+
lookup, length, minfreq);
321+
s2 = tsquery_opr_selec(item + item->operator.left, operand,
322+
lookup, length, minfreq);
323+
selec = s1 + s2 - s1 * s2;
324+
break;
325+
326+
default:
327+
elog(ERROR, "unrecognized operator: %d", item->operator.oper);
328+
selec = 0; /* keep compiler quiet */
329+
break;
330+
}
331+
332+
/* Clamp intermediate results to stay sane despite roundoff error */
333+
CLAMP_PROBABILITY(selec);
334+
335+
return selec;
336+
}
337+
338+
/*
339+
* bsearch() comparator for a lexeme (non-NULL terminated string with length)
340+
* and a TextFreq. Use length, then byte-for-byte comparison, because that's
341+
* how ANALYZE code sorted data before storing it in a statistic tuple.
342+
* See ts_typanalyze.c for details.
343+
*/
344+
static int
345+
compare_lexeme_textfreq(const void *e1, const void *e2)
346+
{
347+
const LexemeKey *key = (const LexemeKey *) e1;
348+
const TextFreq *t = (const TextFreq *) e2;
349+
int len1,
350+
len2;
351+
352+
len1 = key->length;
353+
len2 = VARSIZE_ANY_EXHDR(t->element);
354+
355+
/* Compare lengths first, possibly avoiding a strncmp call */
356+
if (len1 > len2)
357+
return 1;
358+
else if (len1 < len2)
359+
return -1;
360+
361+
/* Fall back on byte-for-byte comparison */
362+
return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
363+
}

0 commit comments

Comments
 (0)