@@ -224,6 +224,10 @@ typedef struct
224
224
* of aliases to columns of the right input. Thus, positions in the printable
225
225
* column alias list are not necessarily one-for-one with varattnos of the
226
226
* JOIN, so we need a separate new_colnames[] array for printing purposes.
227
+ *
228
+ * Finally, when dealing with wide tables we risk O(N^2) costs in assigning
229
+ * non-duplicate column names. We ameliorate that by using a hash table that
230
+ * holds all the strings appearing in colnames, new_colnames, and parentUsing.
227
231
*/
228
232
typedef struct
229
233
{
@@ -291,6 +295,15 @@ typedef struct
291
295
int * leftattnos ; /* left-child varattnos of join cols, or 0 */
292
296
int * rightattnos ; /* right-child varattnos of join cols, or 0 */
293
297
List * usingNames ; /* names assigned to merged columns */
298
+
299
+ /*
300
+ * Hash table holding copies of all the strings appearing in this struct's
301
+ * colnames, new_colnames, and parentUsing. We use a hash table only for
302
+ * sufficiently wide relations, and only during the colname-assignment
303
+ * functions set_relation_column_names and set_join_column_names;
304
+ * otherwise, names_hash is NULL.
305
+ */
306
+ HTAB * names_hash ; /* entries are just strings */
294
307
} deparse_columns ;
295
308
296
309
/* This macro is analogous to rt_fetch(), but for deparse_columns structs */
@@ -376,6 +389,9 @@ static bool colname_is_unique(const char *colname, deparse_namespace *dpns,
376
389
static char * make_colname_unique (char * colname , deparse_namespace * dpns ,
377
390
deparse_columns * colinfo );
378
391
static void expand_colnames_array_to (deparse_columns * colinfo , int n );
392
+ static void build_colinfo_names_hash (deparse_columns * colinfo );
393
+ static void add_to_names_hash (deparse_columns * colinfo , const char * name );
394
+ static void destroy_colinfo_names_hash (deparse_columns * colinfo );
379
395
static void identify_join_columns (JoinExpr * j , RangeTblEntry * jrte ,
380
396
deparse_columns * colinfo );
381
397
static char * get_rtable_name (int rtindex , deparse_context * context );
@@ -4133,6 +4149,10 @@ has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode)
4133
4149
*
4134
4150
* parentUsing is a list of all USING aliases assigned in parent joins of
4135
4151
* the current jointree node. (The passed-in list must not be modified.)
4152
+ *
4153
+ * Note that we do not use per-deparse_columns hash tables in this function.
4154
+ * The number of names that need to be assigned should be small enough that
4155
+ * we don't need to trouble with that.
4136
4156
*/
4137
4157
static void
4138
4158
set_using_names (deparse_namespace * dpns , Node * jtnode , List * parentUsing )
@@ -4408,6 +4428,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
4408
4428
colinfo -> new_colnames = (char * * ) palloc (ncolumns * sizeof (char * ));
4409
4429
colinfo -> is_new_col = (bool * ) palloc (ncolumns * sizeof (bool ));
4410
4430
4431
+ /* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */
4432
+ build_colinfo_names_hash (colinfo );
4433
+
4411
4434
/*
4412
4435
* Scan the columns, select a unique alias for each one, and store it in
4413
4436
* colinfo->colnames and colinfo->new_colnames. The former array has NULL
@@ -4443,6 +4466,7 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
4443
4466
colname = make_colname_unique (colname , dpns , colinfo );
4444
4467
4445
4468
colinfo -> colnames [i ] = colname ;
4469
+ add_to_names_hash (colinfo , colname );
4446
4470
}
4447
4471
4448
4472
/* Put names of non-dropped columns in new_colnames[] too */
@@ -4456,6 +4480,9 @@ set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
4456
4480
changed_any = true;
4457
4481
}
4458
4482
4483
+ /* We're now done needing the colinfo's names_hash */
4484
+ destroy_colinfo_names_hash (colinfo );
4485
+
4459
4486
/*
4460
4487
* Set correct length for new_colnames[] array. (Note: if columns have
4461
4488
* been added, colinfo->num_cols includes them, which is not really quite
@@ -4526,6 +4553,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
4526
4553
expand_colnames_array_to (colinfo , noldcolumns );
4527
4554
Assert (colinfo -> num_cols == noldcolumns );
4528
4555
4556
+ /* If the RTE is wide enough, use a hash table to avoid O(N^2) costs */
4557
+ build_colinfo_names_hash (colinfo );
4558
+
4529
4559
/*
4530
4560
* Scan the join output columns, select an alias for each one, and store
4531
4561
* it in colinfo->colnames. If there are USING columns, set_using_names()
@@ -4563,6 +4593,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
4563
4593
if (rte -> alias == NULL )
4564
4594
{
4565
4595
colinfo -> colnames [i ] = real_colname ;
4596
+ add_to_names_hash (colinfo , real_colname );
4566
4597
continue ;
4567
4598
}
4568
4599
@@ -4579,6 +4610,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
4579
4610
colname = make_colname_unique (colname , dpns , colinfo );
4580
4611
4581
4612
colinfo -> colnames [i ] = colname ;
4613
+ add_to_names_hash (colinfo , colname );
4582
4614
}
4583
4615
4584
4616
/* Remember if any assigned aliases differ from "real" name */
@@ -4677,6 +4709,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
4677
4709
}
4678
4710
else
4679
4711
colinfo -> new_colnames [j ] = child_colname ;
4712
+ add_to_names_hash (colinfo , colinfo -> new_colnames [j ]);
4680
4713
}
4681
4714
4682
4715
colinfo -> is_new_col [j ] = leftcolinfo -> is_new_col [jc ];
@@ -4726,6 +4759,7 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
4726
4759
}
4727
4760
else
4728
4761
colinfo -> new_colnames [j ] = child_colname ;
4762
+ add_to_names_hash (colinfo , colinfo -> new_colnames [j ]);
4729
4763
}
4730
4764
4731
4765
colinfo -> is_new_col [j ] = rightcolinfo -> is_new_col [jc ];
@@ -4740,6 +4774,9 @@ set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte,
4740
4774
Assert (j == nnewcolumns );
4741
4775
#endif
4742
4776
4777
+ /* We're now done needing the colinfo's names_hash */
4778
+ destroy_colinfo_names_hash (colinfo );
4779
+
4743
4780
/*
4744
4781
* For a named join, print column aliases if we changed any from the child
4745
4782
* names. Unnamed joins cannot print aliases.
@@ -4762,38 +4799,59 @@ colname_is_unique(const char *colname, deparse_namespace *dpns,
4762
4799
int i ;
4763
4800
ListCell * lc ;
4764
4801
4765
- /* Check against already-assigned column aliases within RTE */
4766
- for (i = 0 ; i < colinfo -> num_cols ; i ++ )
4767
- {
4768
- char * oldname = colinfo -> colnames [i ];
4769
-
4770
- if (oldname && strcmp (oldname , colname ) == 0 )
4771
- return false;
4772
- }
4773
-
4774
4802
/*
4775
- * If we're building a new_colnames array, check that too (this will be
4776
- * partially but not completely redundant with the previous checks)
4803
+ * If we have a hash table, consult that instead of linearly scanning the
4804
+ * colinfo's strings.
4777
4805
*/
4778
- for ( i = 0 ; i < colinfo -> num_new_cols ; i ++ )
4806
+ if ( colinfo -> names_hash )
4779
4807
{
4780
- char * oldname = colinfo -> new_colnames [i ];
4781
-
4782
- if (oldname && strcmp (oldname , colname ) == 0 )
4808
+ if (hash_search (colinfo -> names_hash ,
4809
+ colname ,
4810
+ HASH_FIND ,
4811
+ NULL ) != NULL )
4783
4812
return false;
4784
4813
}
4785
-
4786
- /* Also check against USING-column names that must be globally unique */
4787
- foreach (lc , dpns -> using_names )
4814
+ else
4788
4815
{
4789
- char * oldname = (char * ) lfirst (lc );
4816
+ /* Check against already-assigned column aliases within RTE */
4817
+ for (i = 0 ; i < colinfo -> num_cols ; i ++ )
4818
+ {
4819
+ char * oldname = colinfo -> colnames [i ];
4790
4820
4791
- if (strcmp (oldname , colname ) == 0 )
4792
- return false;
4821
+ if (oldname && strcmp (oldname , colname ) == 0 )
4822
+ return false;
4823
+ }
4824
+
4825
+ /*
4826
+ * If we're building a new_colnames array, check that too (this will
4827
+ * be partially but not completely redundant with the previous checks)
4828
+ */
4829
+ for (i = 0 ; i < colinfo -> num_new_cols ; i ++ )
4830
+ {
4831
+ char * oldname = colinfo -> new_colnames [i ];
4832
+
4833
+ if (oldname && strcmp (oldname , colname ) == 0 )
4834
+ return false;
4835
+ }
4836
+
4837
+ /*
4838
+ * Also check against names already assigned for parent-join USING
4839
+ * cols
4840
+ */
4841
+ foreach (lc , colinfo -> parentUsing )
4842
+ {
4843
+ char * oldname = (char * ) lfirst (lc );
4844
+
4845
+ if (strcmp (oldname , colname ) == 0 )
4846
+ return false;
4847
+ }
4793
4848
}
4794
4849
4795
- /* Also check against names already assigned for parent-join USING cols */
4796
- foreach (lc , colinfo -> parentUsing )
4850
+ /*
4851
+ * Also check against USING-column names that must be globally unique.
4852
+ * These are not hashed, but there should be few of them.
4853
+ */
4854
+ foreach (lc , dpns -> using_names )
4797
4855
{
4798
4856
char * oldname = (char * ) lfirst (lc );
4799
4857
@@ -4861,6 +4919,90 @@ expand_colnames_array_to(deparse_columns *colinfo, int n)
4861
4919
}
4862
4920
}
4863
4921
4922
+ /*
4923
+ * build_colinfo_names_hash: optionally construct a hash table for colinfo
4924
+ */
4925
+ static void
4926
+ build_colinfo_names_hash (deparse_columns * colinfo )
4927
+ {
4928
+ HASHCTL hash_ctl ;
4929
+ int i ;
4930
+ ListCell * lc ;
4931
+
4932
+ /*
4933
+ * Use a hash table only for RTEs with at least 32 columns. (The cutoff
4934
+ * is somewhat arbitrary, but let's choose it so that this code does get
4935
+ * exercised in the regression tests.)
4936
+ */
4937
+ if (colinfo -> num_cols < 32 )
4938
+ return ;
4939
+
4940
+ /*
4941
+ * Set up the hash table. The entries are just strings with no other
4942
+ * payload.
4943
+ */
4944
+ hash_ctl .keysize = NAMEDATALEN ;
4945
+ hash_ctl .entrysize = NAMEDATALEN ;
4946
+ hash_ctl .hcxt = CurrentMemoryContext ;
4947
+ colinfo -> names_hash = hash_create ("deparse_columns names" ,
4948
+ colinfo -> num_cols + colinfo -> num_new_cols ,
4949
+ & hash_ctl ,
4950
+ HASH_ELEM | HASH_STRINGS | HASH_CONTEXT );
4951
+
4952
+ /*
4953
+ * Preload the hash table with any names already present (these would have
4954
+ * come from set_using_names).
4955
+ */
4956
+ for (i = 0 ; i < colinfo -> num_cols ; i ++ )
4957
+ {
4958
+ char * oldname = colinfo -> colnames [i ];
4959
+
4960
+ if (oldname )
4961
+ add_to_names_hash (colinfo , oldname );
4962
+ }
4963
+
4964
+ for (i = 0 ; i < colinfo -> num_new_cols ; i ++ )
4965
+ {
4966
+ char * oldname = colinfo -> new_colnames [i ];
4967
+
4968
+ if (oldname )
4969
+ add_to_names_hash (colinfo , oldname );
4970
+ }
4971
+
4972
+ foreach (lc , colinfo -> parentUsing )
4973
+ {
4974
+ char * oldname = (char * ) lfirst (lc );
4975
+
4976
+ add_to_names_hash (colinfo , oldname );
4977
+ }
4978
+ }
4979
+
4980
+ /*
4981
+ * add_to_names_hash: add a string to the names_hash, if we're using one
4982
+ */
4983
+ static void
4984
+ add_to_names_hash (deparse_columns * colinfo , const char * name )
4985
+ {
4986
+ if (colinfo -> names_hash )
4987
+ (void ) hash_search (colinfo -> names_hash ,
4988
+ name ,
4989
+ HASH_ENTER ,
4990
+ NULL );
4991
+ }
4992
+
4993
+ /*
4994
+ * destroy_colinfo_names_hash: destroy hash table when done with it
4995
+ */
4996
+ static void
4997
+ destroy_colinfo_names_hash (deparse_columns * colinfo )
4998
+ {
4999
+ if (colinfo -> names_hash )
5000
+ {
5001
+ hash_destroy (colinfo -> names_hash );
5002
+ colinfo -> names_hash = NULL ;
5003
+ }
5004
+ }
5005
+
4864
5006
/*
4865
5007
* identify_join_columns: figure out where columns of a join come from
4866
5008
*
0 commit comments