Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 52c7074

Browse files
committed
Use a hash table to de-duplicate column names in ruleutils.c.
Commit 8004953 added a hash table to avoid O(N^2) cost in choosing unique relation aliases while deparsing a view or rule. It did nothing about the similar O(N^2) (maybe worse) costs of choosing unique column aliases within each RTE. However, that's now demonstrably a bottleneck when deparsing CHECK constraints for wide tables, so let's use a similar hash table to handle those.

The extra cost of setting up the hash table will not be repaid unless the table has many columns. I've set this up so that we use the brute force method if there are fewer than 32 columns. The exact cutoff is not too critical, but this value seems good because it results in both code paths getting exercised by existing regression-test cases.

Patch by me; thanks to David Rowley for review.

Discussion: https://postgr.es/m/2885468.1722291250@sss.pgh.pa.us
1 parent bccca78 commit 52c7074

File tree

1 file changed

+165
-23
lines changed

1 file changed

+165
-23
lines changed

src/backend/utils/adt/ruleutils.c

Lines changed: 165 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,10 @@ typedef struct
224224
* of aliases to columns of the right input. Thus, positions in the printable
225225
* column alias list are not necessarily one-for-one with varattnos of the
226226
* JOIN, so we need a separate new_colnames[] array for printing purposes.
227+
*
228+
* Finally, when dealing with wide tables we risk O(N^2) costs in assigning
229+
* non-duplicate column names. We ameliorate that by using a hash table that
230+
* holds all the strings appearing in colnames, new_colnames, and parentUsing.
227231
*/
228232
typedef struct
229233
{
@@ -291,6 +295,15 @@ typedef struct
291295
int *leftattnos; /* left-child varattnos of join cols, or 0 */
292296
int *rightattnos; /* right-child varattnos of join cols, or 0 */
293297
List *usingNames; /* names assigned to merged columns */
298+
299+
/*
300+
* Hash table holding copies of all the strings appearing in this struct's
301+
* colnames, new_colnames, and parentUsing. We use a hash table only for
302+
* sufficiently wide relations, and only during the colname-assignment
303+
* functions set_relation_column_names and set_join_column_names;
304+
* otherwise, names_hash is NULL.
305+
*/
306+
HTAB *names_hash; /* entries are just strings */
294307
} deparse_columns;
295308

296309
/* This macro is analogous to rt_fetch(), but for deparse_columns structs */
@@ -376,6 +389,9 @@ static bool colname_is_unique(const char *colname, deparse_namespace *dpns,
376389
static char *make_colname_unique(char *colname, deparse_namespace *dpns,
377390
deparse_columns *colinfo);
378391
static void expand_colnames_array_to(deparse_columns *colinfo, int n);
392+
static void build_colinfo_names_hash(deparse_columns *colinfo);
393+
static void add_to_names_hash(deparse_columns *colinfo, const char *name);
394+
static void destroy_colinfo_names_hash(deparse_columns *colinfo);
379395
static void identify_join_columns(JoinExpr *j, RangeTblEntry *jrte,
380396
deparse_columns *colinfo);
381397
static char *get_rtable_name(int rtindex, deparse_context *context);
@@ -4133,6 +4149,10 @@ has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode)
41334149
*
41344150
* parentUsing is a list of all USING aliases assigned in parent joins of
41354151
* the current jointree node. (The passed-in list must not be modified.)
4152+
*
4153+
* Note that we do not use per-deparse_columns hash tables in this function.
4154+
* The number of names that need to be assigned should be small enough that
4155+
* we don't need to trouble with that.
41364156
*/
41374157
static void
41384158
set_using_names(deparse_namespace *dpns, Node *jtnode, List *parentUsing)
@@ -4408,6 +4428,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
44084428
colinfo->new_colnames = (char **) palloc(ncolumns * sizeof(char *));
44094429
colinfo->is_new_col = (bool *) palloc(ncolumns * sizeof(bool));
44104430

4431+
/* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */
4432+
build_colinfo_names_hash(colinfo);
4433+
44114434
/*
44124435
* Scan the columns, select a unique alias for each one, and store it in
44134436
* colinfo->colnames and colinfo->new_colnames. The former array has NULL
@@ -4443,6 +4466,7 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
44434466
colname = make_colname_unique(colname, dpns, colinfo);
44444467

44454468
colinfo->colnames[i] = colname;
4469+
add_to_names_hash(colinfo, colname);
44464470
}
44474471

44484472
/* Put names of non-dropped columns in new_colnames[] too */
@@ -4456,6 +4480,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
44564480
changed_any = true;
44574481
}
44584482

4483+
/* We're now done needing the colinfo's names_hash */
4484+
destroy_colinfo_names_hash(colinfo);
4485+
44594486
/*
44604487
* Set correct length for new_colnames[] array. (Note: if columns have
44614488
* been added, colinfo->num_cols includes them, which is not really quite
@@ -4526,6 +4553,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
45264553
expand_colnames_array_to(colinfo, noldcolumns);
45274554
Assert(colinfo->num_cols == noldcolumns);
45284555

4556+
/* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */
4557+
build_colinfo_names_hash(colinfo);
4558+
45294559
/*
45304560
* Scan the join output columns, select an alias for each one, and store
45314561
* it in colinfo->colnames. If there are USING columns, set_using_names()
@@ -4563,6 +4593,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
45634593
if (rte->alias == NULL)
45644594
{
45654595
colinfo->colnames[i] = real_colname;
4596+
add_to_names_hash(colinfo, real_colname);
45664597
continue;
45674598
}
45684599

@@ -4579,6 +4610,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
45794610
colname = make_colname_unique(colname, dpns, colinfo);
45804611

45814612
colinfo->colnames[i] = colname;
4613+
add_to_names_hash(colinfo, colname);
45824614
}
45834615

45844616
/* Remember if any assigned aliases differ from "real" name */
@@ -4677,6 +4709,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
46774709
}
46784710
else
46794711
colinfo->new_colnames[j] = child_colname;
4712+
add_to_names_hash(colinfo, colinfo->new_colnames[j]);
46804713
}
46814714

46824715
colinfo->is_new_col[j] = leftcolinfo->is_new_col[jc];
@@ -4726,6 +4759,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
47264759
}
47274760
else
47284761
colinfo->new_colnames[j] = child_colname;
4762+
add_to_names_hash(colinfo, colinfo->new_colnames[j]);
47294763
}
47304764

47314765
colinfo->is_new_col[j] = rightcolinfo->is_new_col[jc];
@@ -4740,6 +4774,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
47404774
Assert(j == nnewcolumns);
47414775
#endif
47424776

4777+
/* We're now done needing the colinfo's names_hash */
4778+
destroy_colinfo_names_hash(colinfo);
4779+
47434780
/*
47444781
* For a named join, print column aliases if we changed any from the child
47454782
* names. Unnamed joins cannot print aliases.
@@ -4762,38 +4799,59 @@ colname_is_unique(const char *colname, deparse_namespace *dpns,
47624799
int i;
47634800
ListCell *lc;
47644801

4765-
/* Check against already-assigned column aliases within RTE */
4766-
for (i = 0; i < colinfo->num_cols; i++)
4767-
{
4768-
char *oldname = colinfo->colnames[i];
4769-
4770-
if (oldname && strcmp(oldname, colname) == 0)
4771-
return false;
4772-
}
4773-
47744802
/*
4775-
* If we're building a new_colnames array, check that too (this will be
4776-
* partially but not completely redundant with the previous checks)
4803+
* If we have a hash table, consult that instead of linearly scanning the
4804+
* colinfo's strings.
47774805
*/
4778-
for (i = 0; i < colinfo->num_new_cols; i++)
4806+
if (colinfo->names_hash)
47794807
{
4780-
char *oldname = colinfo->new_colnames[i];
4781-
4782-
if (oldname && strcmp(oldname, colname) == 0)
4808+
if (hash_search(colinfo->names_hash,
4809+
colname,
4810+
HASH_FIND,
4811+
NULL) != NULL)
47834812
return false;
47844813
}
4785-
4786-
/* Also check against USING-column names that must be globally unique */
4787-
foreach(lc, dpns->using_names)
4814+
else
47884815
{
4789-
char *oldname = (char *) lfirst(lc);
4816+
/* Check against already-assigned column aliases within RTE */
4817+
for (i = 0; i < colinfo->num_cols; i++)
4818+
{
4819+
char *oldname = colinfo->colnames[i];
47904820

4791-
if (strcmp(oldname, colname) == 0)
4792-
return false;
4821+
if (oldname && strcmp(oldname, colname) == 0)
4822+
return false;
4823+
}
4824+
4825+
/*
4826+
* If we're building a new_colnames array, check that too (this will
4827+
* be partially but not completely redundant with the previous checks)
4828+
*/
4829+
for (i = 0; i < colinfo->num_new_cols; i++)
4830+
{
4831+
char *oldname = colinfo->new_colnames[i];
4832+
4833+
if (oldname && strcmp(oldname, colname) == 0)
4834+
return false;
4835+
}
4836+
4837+
/*
4838+
* Also check against names already assigned for parent-join USING
4839+
* cols
4840+
*/
4841+
foreach(lc, colinfo->parentUsing)
4842+
{
4843+
char *oldname = (char *) lfirst(lc);
4844+
4845+
if (strcmp(oldname, colname) == 0)
4846+
return false;
4847+
}
47934848
}
47944849

4795-
/* Also check against names already assigned for parent-join USING cols */
4796-
foreach(lc, colinfo->parentUsing)
4850+
/*
4851+
* Also check against USING-column names that must be globally unique.
4852+
* These are not hashed, but there should be few of them.
4853+
*/
4854+
foreach(lc, dpns->using_names)
47974855
{
47984856
char *oldname = (char *) lfirst(lc);
47994857

@@ -4861,6 +4919,90 @@ expand_colnames_array_to(deparse_columns *colinfo, int n)
48614919
}
48624920
}
48634921

4922+
/*
4923+
* build_colinfo_names_hash: optionally construct a hash table for colinfo
4924+
*/
4925+
static void
4926+
build_colinfo_names_hash(deparse_columns *colinfo)
4927+
{
4928+
HASHCTL hash_ctl;
4929+
int i;
4930+
ListCell *lc;
4931+
4932+
/*
4933+
* Use a hash table only for RTEs with at least 32 columns. (The cutoff
4934+
* is somewhat arbitrary, but let's choose it so that this code does get
4935+
* exercised in the regression tests.)
4936+
*/
4937+
if (colinfo->num_cols < 32)
4938+
return;
4939+
4940+
/*
4941+
* Set up the hash table. The entries are just strings with no other
4942+
* payload.
4943+
*/
4944+
hash_ctl.keysize = NAMEDATALEN;
4945+
hash_ctl.entrysize = NAMEDATALEN;
4946+
hash_ctl.hcxt = CurrentMemoryContext;
4947+
colinfo->names_hash = hash_create("deparse_columns names",
4948+
colinfo->num_cols + colinfo->num_new_cols,
4949+
&hash_ctl,
4950+
HASH_ELEM | HASH_STRINGS | HASH_CONTEXT);
4951+
4952+
/*
4953+
* Preload the hash table with any names already present (these would have
4954+
* come from set_using_names).
4955+
*/
4956+
for (i = 0; i < colinfo->num_cols; i++)
4957+
{
4958+
char *oldname = colinfo->colnames[i];
4959+
4960+
if (oldname)
4961+
add_to_names_hash(colinfo, oldname);
4962+
}
4963+
4964+
for (i = 0; i < colinfo->num_new_cols; i++)
4965+
{
4966+
char *oldname = colinfo->new_colnames[i];
4967+
4968+
if (oldname)
4969+
add_to_names_hash(colinfo, oldname);
4970+
}
4971+
4972+
foreach(lc, colinfo->parentUsing)
4973+
{
4974+
char *oldname = (char *) lfirst(lc);
4975+
4976+
add_to_names_hash(colinfo, oldname);
4977+
}
4978+
}
4979+
4980+
/*
4981+
* add_to_names_hash: add a string to the names_hash, if we're using one
4982+
*/
4983+
static void
4984+
add_to_names_hash(deparse_columns *colinfo, const char *name)
4985+
{
4986+
if (colinfo->names_hash)
4987+
(void) hash_search(colinfo->names_hash,
4988+
name,
4989+
HASH_ENTER,
4990+
NULL);
4991+
}
4992+
4993+
/*
4994+
* destroy_colinfo_names_hash: destroy hash table when done with it
4995+
*/
4996+
static void
4997+
destroy_colinfo_names_hash(deparse_columns *colinfo)
4998+
{
4999+
if (colinfo->names_hash)
5000+
{
5001+
hash_destroy(colinfo->names_hash);
5002+
colinfo->names_hash = NULL;
5003+
}
5004+
}
5005+
48645006
/*
48655007
* identify_join_columns: figure out where columns of a join come from
48665008
*

0 commit comments

Comments
 (0)