Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 6cda3ad

Browse files
committed
Cause planner to make use of average-column-width statistic that is now
collected by ANALYZE. Also, add some modest amount of intelligence to guesses that are used for varlena columns in the absence of any ANALYZE statistics. The 'width' reported by EXPLAIN is finally something less than totally bogus for varlena columns ... and, in consequence, hashjoin estimating should be a little better ...
1 parent b66a0cc commit 6cda3ad

File tree

4 files changed

+124
-42
lines changed

4 files changed

+124
-42
lines changed

doc/src/sgml/perform.sgml

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
<!--
2-
$Header: /cvsroot/pgsql/doc/src/sgml/perform.sgml,v 1.3 2001/03/24 23:03:26 petere Exp $
2+
$Header: /cvsroot/pgsql/doc/src/sgml/perform.sgml,v 1.4 2001/05/09 00:35:09 tgl Exp $
33
-->
44

55
<chapter id="performance-tips">
@@ -97,13 +97,6 @@ $Header: /cvsroot/pgsql/doc/src/sgml/perform.sgml,v 1.3 2001/03/24 23:03:26 pete
9797
by the query (again, without considering the effects of LIMIT).
9898
</para>
9999

100-
<para>
101-
Average width is pretty bogus because the thing really doesn't have
102-
any idea of the average length of variable-length columns. I'm thinking
103-
about improving that in the future, but it may not be worth the trouble,
104-
because the width isn't used for very much.
105-
</para>
106-
107100
<para>
108101
Here are some examples (using the regress test database after a
109102
vacuum analyze, and almost-7.0 sources):

src/backend/optimizer/path/costsize.c

Lines changed: 43 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
* Portions Copyright (c) 1994, Regents of the University of California
4242
*
4343
* IDENTIFICATION
44-
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.71 2001/05/07 00:43:20 tgl Exp $
44+
* $Header: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v 1.72 2001/05/09 00:35:09 tgl Exp $
4545
*
4646
*-------------------------------------------------------------------------
4747
*/
@@ -61,12 +61,6 @@
6161
#include "utils/syscache.h"
6262

6363

64-
/*
65-
* The length of a variable-length field in bytes (stupid estimate...)
66-
*/
67-
#define _DEFAULT_ATTRIBUTE_WIDTH_ 12
68-
69-
7064
#define LOG2(x) (log(x) / 0.693147180559945)
7165
#define LOG6(x) (log(x) / 1.79175946922805)
7266

@@ -90,7 +84,6 @@ bool enable_hashjoin = true;
9084

9185
static bool cost_qual_eval_walker(Node *node, Cost *total);
9286
static void set_rel_width(Query *root, RelOptInfo *rel);
93-
static int compute_attribute_width(TargetEntry *tlistentry);
9487
static double relation_byte_size(double tuples, int width);
9588
static double page_size(double tuples, int width);
9689

@@ -1082,36 +1075,54 @@ set_joinrel_size_estimates(Query *root, RelOptInfo *rel,
10821075
/*
10831076
* set_rel_width
10841077
* Set the estimated output width of the relation.
1078+
*
1079+
* NB: this works best on base relations because it prefers to look at
1080+
* real Vars. It will fail to make use of pg_statistic info when applied
1081+
* to a subquery relation, even if the subquery outputs are simple vars
1082+
* that we could have gotten info for. Is it worth trying to be smarter
1083+
* about subqueries?
10851084
*/
10861085
static void
10871086
set_rel_width(Query *root, RelOptInfo *rel)
10881087
{
1089-
int tuple_width = 0;
1090-
List *tle;
1088+
int32 tuple_width = 0;
1089+
List *tllist;
10911090

1092-
foreach(tle, rel->targetlist)
1093-
tuple_width += compute_attribute_width((TargetEntry *) lfirst(tle));
1094-
Assert(tuple_width >= 0);
1095-
rel->width = tuple_width;
1096-
}
1091+
foreach(tllist, rel->targetlist)
1092+
{
1093+
TargetEntry *tle = (TargetEntry *) lfirst(tllist);
1094+
int32 item_width;
10971095

1098-
/*
1099-
* compute_attribute_width
1100-
* Given a target list entry, find the size in bytes of the attribute.
1101-
*
1102-
* If a field is variable-length, we make a default assumption. Would be
1103-
* better if VACUUM recorded some stats about the average field width...
1104-
* also, we have access to the atttypmod, but fail to use it...
1105-
*/
1106-
static int
1107-
compute_attribute_width(TargetEntry *tlistentry)
1108-
{
1109-
int width = get_typlen(tlistentry->resdom->restype);
1096+
/*
1097+
* If it's a Var, try to get statistical info from pg_statistic.
1098+
*/
1099+
if (tle->expr && IsA(tle->expr, Var))
1100+
{
1101+
Var *var = (Var *) tle->expr;
1102+
Oid relid;
11101103

1111-
if (width < 0)
1112-
return _DEFAULT_ATTRIBUTE_WIDTH_;
1113-
else
1114-
return width;
1104+
relid = getrelid(var->varno, root->rtable);
1105+
if (relid != InvalidOid)
1106+
{
1107+
item_width = get_attavgwidth(relid, var->varattno);
1108+
if (item_width > 0)
1109+
{
1110+
tuple_width += item_width;
1111+
continue;
1112+
}
1113+
}
1114+
}
1115+
/*
1116+
* Not a Var, or can't find statistics for it. Estimate using
1117+
* just the type info.
1118+
*/
1119+
item_width = get_typavgwidth(tle->resdom->restype,
1120+
tle->resdom->restypmod);
1121+
Assert(item_width > 0);
1122+
tuple_width += item_width;
1123+
}
1124+
Assert(tuple_width >= 0);
1125+
rel->width = tuple_width;
11151126
}
11161127

11171128
/*
@@ -1122,7 +1133,7 @@ compute_attribute_width(TargetEntry *tlistentry)
11221133
static double
11231134
relation_byte_size(double tuples, int width)
11241135
{
1125-
return tuples * ((double) (width + sizeof(HeapTupleData)));
1136+
return tuples * ((double) MAXALIGN(width + sizeof(HeapTupleData)));
11261137
}
11271138

11281139
/*

src/backend/utils/cache/lsyscache.c

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
99
* IDENTIFICATION
10-
* $Header: /cvsroot/pgsql/src/backend/utils/cache/lsyscache.c,v 1.53 2001/05/07 00:43:24 tgl Exp $
10+
* $Header: /cvsroot/pgsql/src/backend/utils/cache/lsyscache.c,v 1.54 2001/05/09 00:35:09 tgl Exp $
1111
*
1212
* NOTES
1313
* Eventually, the index information should go through here, too.
@@ -749,6 +749,56 @@ get_typdefault(Oid typid)
749749
return returnValue;
750750
}
751751

752+
/*
753+
* get_typavgwidth
754+
*
755+
* Given a type OID and a typmod value (pass -1 if typmod is unknown),
756+
* estimate the average width of values of the type. This is used by
757+
* the planner, which doesn't require absolutely correct results;
758+
* it's OK (and expected) to guess if we don't know for sure.
759+
*/
760+
int32
761+
get_typavgwidth(Oid typid, int32 typmod)
762+
{
763+
int typlen = get_typlen(typid);
764+
int32 maxwidth;
765+
766+
/*
767+
* Easy if it's a fixed-width type
768+
*/
769+
if (typlen > 0)
770+
return typlen;
771+
/*
772+
* type_maximum_size knows the encoding of typmod for some datatypes;
773+
* don't duplicate that knowledge here.
774+
*/
775+
maxwidth = type_maximum_size(typid, typmod);
776+
if (maxwidth > 0)
777+
{
778+
/*
779+
* For BPCHAR, the max width is also the only width. Otherwise
780+
* we need to guess about the typical data width given the max.
781+
* A sliding scale for percentage of max width seems reasonable.
782+
*/
783+
if (typid == BPCHAROID)
784+
return maxwidth;
785+
if (maxwidth <= 32)
786+
return maxwidth; /* assume full width */
787+
if (maxwidth < 1000)
788+
return 32 + (maxwidth - 32) / 2; /* assume 50% */
789+
/*
790+
* Beyond 1000, assume we're looking at something like
791+
* "varchar(10000)" where the limit isn't actually reached often,
792+
* and use a fixed estimate.
793+
*/
794+
return 32 + (1000 - 32) / 2;
795+
}
796+
/*
797+
* Ooops, we have no idea ... wild guess time.
798+
*/
799+
return 32;
800+
}
801+
752802
/*
753803
* get_typtype
754804
*
@@ -782,6 +832,32 @@ get_typtype(Oid typid)
782832

783833
/* ---------- STATISTICS CACHE ---------- */
784834

835+
/*
836+
* get_attavgwidth
837+
*
838+
* Given the table and attribute number of a column, get the average
839+
* width of entries in the column. Return zero if no data available.
840+
*/
841+
int32
842+
get_attavgwidth(Oid relid, AttrNumber attnum)
843+
{
844+
HeapTuple tp;
845+
846+
tp = SearchSysCache(STATRELATT,
847+
ObjectIdGetDatum(relid),
848+
Int16GetDatum(attnum),
849+
0, 0);
850+
if (HeapTupleIsValid(tp))
851+
{
852+
int32 stawidth = ((Form_pg_statistic) GETSTRUCT(tp))->stawidth;
853+
854+
ReleaseSysCache(tp);
855+
if (stawidth > 0)
856+
return stawidth;
857+
}
858+
return 0;
859+
}
860+
785861
/*
786862
* get_attstatsslot
787863
*

src/include/utils/lsyscache.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
9-
* $Id: lsyscache.h,v 1.31 2001/05/07 00:43:26 tgl Exp $
9+
* $Id: lsyscache.h,v 1.32 2001/05/09 00:35:09 tgl Exp $
1010
*
1111
*-------------------------------------------------------------------------
1212
*/
@@ -39,6 +39,8 @@ extern bool get_typbyval(Oid typid);
3939
extern void get_typlenbyval(Oid typid, int16 *typlen, bool *typbyval);
4040
extern char get_typstorage(Oid typid);
4141
extern Datum get_typdefault(Oid typid);
42+
extern int32 get_typavgwidth(Oid typid, int32 typmod);
43+
extern int32 get_attavgwidth(Oid relid, AttrNumber attnum);
4244
extern bool get_attstatsslot(HeapTuple statstuple,
4345
Oid atttype, int32 atttypmod,
4446
int reqkind, Oid reqop,

0 commit comments

Comments
 (0)