Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 7c53a80

Browse files
committed
Fix hash partition pruning with asymmetric partition sets.
perform_pruning_combine_step() was not taught about the number of partition indexes used in hash partitioning; more embarrassingly, get_matching_hash_bounds() also had it wrong. These errors are masked in the common case where all the partitions have the same modulus and no partition is missing. However, with missing or unequal-size partitions, we could erroneously prune some partitions that need to be scanned, leading to silently wrong query answers. While a minimal-footprint fix for this could be to export get_partition_bound_num_indexes and make the incorrect functions use it, I'm of the opinion that that function should never have existed in the first place. It's not reasonable data structure design that PartitionBoundInfoData lacks any explicit record of the length of its indexes[] array. Perhaps that was all right when it could always be assumed equal to ndatums, but something should have been done about it as soon as that stopped being true. Putting in an explicit "nindexes" field makes both partition_bounds_equal() and partition_bounds_copy() simpler, safer, and faster than before, and removes explicit knowledge of the number-of-partition-indexes rules from some other places too. This change also makes get_hash_partition_greatest_modulus obsolete. I left that in place in case any external code uses it, but no core code does anymore. Per bug #16840 from Michał Albrycht. Back-patch to v11 where the hash partitioning code came in. (In the back branches, add the new field at the end of PartitionBoundInfoData to minimize ABI risks.) Discussion: https://postgr.es/m/16840-571a22976f829ad4@postgresql.org
1 parent 25f9a72 commit 7c53a80

File tree

6 files changed

+123
-141
lines changed

6 files changed

+123
-141
lines changed

src/backend/executor/execPartition.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1323,16 +1323,14 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
13231323
{
13241324
case PARTITION_STRATEGY_HASH:
13251325
{
1326-
int greatest_modulus;
13271326
uint64 rowHash;
13281327

1329-
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
13301328
rowHash = compute_partition_hash_value(key->partnatts,
13311329
key->partsupfunc,
13321330
key->partcollation,
13331331
values, isnull);
13341332

1335-
part_index = boundinfo->indexes[rowHash % greatest_modulus];
1333+
part_index = boundinfo->indexes[rowHash % boundinfo->nindexes];
13361334
}
13371335
break;
13381336

src/backend/partitioning/partbounds.c

Lines changed: 30 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,6 @@ static int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc,
9292
Oid *partcollation,
9393
PartitionBoundInfo boundinfo,
9494
PartitionRangeBound *probe, bool *is_equal);
95-
static int get_partition_bound_num_indexes(PartitionBoundInfo b);
9695
static Expr *make_partition_op_expr(PartitionKey key, int keynum,
9796
uint16 strategy, Expr *arg1, Expr *arg2);
9897
static Oid get_partition_operator(PartitionKey key, int col,
@@ -266,6 +265,7 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts,
266265

267266
boundinfo->ndatums = ndatums;
268267
boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
268+
boundinfo->nindexes = greatest_modulus;
269269
boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int));
270270
for (i = 0; i < greatest_modulus; i++)
271271
boundinfo->indexes[i] = -1;
@@ -398,6 +398,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts,
398398

399399
boundinfo->ndatums = ndatums;
400400
boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
401+
boundinfo->nindexes = ndatums;
401402
boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));
402403

403404
/*
@@ -593,8 +594,9 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts,
593594

594595
/*
595596
* For range partitioning, an additional value of -1 is stored as the last
596-
* element.
597+
* element of the indexes[] array.
597598
*/
599+
boundinfo->nindexes = ndatums + 1;
598600
boundinfo->indexes = (int *) palloc((ndatums + 1) * sizeof(int));
599601

600602
for (i = 0; i < ndatums; i++)
@@ -675,45 +677,41 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
675677
if (b1->ndatums != b2->ndatums)
676678
return false;
677679

680+
if (b1->nindexes != b2->nindexes)
681+
return false;
682+
678683
if (b1->null_index != b2->null_index)
679684
return false;
680685

681686
if (b1->default_index != b2->default_index)
682687
return false;
683688

684-
if (b1->strategy == PARTITION_STRATEGY_HASH)
689+
/* For all partition strategies, the indexes[] arrays have to match */
690+
for (i = 0; i < b1->nindexes; i++)
685691
{
686-
int greatest_modulus = get_hash_partition_greatest_modulus(b1);
687-
688-
/*
689-
* If two hash partitioned tables have different greatest moduli,
690-
* their partition schemes don't match.
691-
*/
692-
if (greatest_modulus != get_hash_partition_greatest_modulus(b2))
692+
if (b1->indexes[i] != b2->indexes[i])
693693
return false;
694+
}
694695

696+
/* Finally, compare the datums[] arrays */
697+
if (b1->strategy == PARTITION_STRATEGY_HASH)
698+
{
695699
/*
696700
* We arrange the partitions in the ascending order of their moduli
697701
* and remainders. Also every modulus is factor of next larger
698702
* modulus. Therefore we can safely store index of a given partition
699703
* in indexes array at remainder of that partition. Also entries at
700704
* (remainder + N * modulus) positions in indexes array are all same
701-
* for (modulus, remainder) specification for any partition. Thus
702-
* datums array from both the given bounds are same, if and only if
703-
* their indexes array will be same. So, it suffices to compare
704-
* indexes array.
705-
*/
706-
for (i = 0; i < greatest_modulus; i++)
707-
if (b1->indexes[i] != b2->indexes[i])
708-
return false;
709-
710-
#ifdef USE_ASSERT_CHECKING
711-
712-
/*
713-
* Nonetheless make sure that the bounds are indeed same when the
705+
* for (modulus, remainder) specification for any partition. Thus the
706+
* datums arrays from the given bounds are the same, if and only if
707+
* their indexes arrays are the same. So, it suffices to compare the
708+
* indexes arrays.
709+
*
710+
* Nonetheless make sure that the bounds are indeed the same when the
714711
* indexes match. Hash partition bound stores modulus and remainder
715712
* at b1->datums[i][0] and b1->datums[i][1] position respectively.
716713
*/
714+
#ifdef USE_ASSERT_CHECKING
717715
for (i = 0; i < b1->ndatums; i++)
718716
Assert((b1->datums[i][0] == b2->datums[i][0] &&
719717
b1->datums[i][1] == b2->datums[i][1]));
@@ -759,15 +757,7 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
759757
parttypbyval[j], parttyplen[j]))
760758
return false;
761759
}
762-
763-
if (b1->indexes[i] != b2->indexes[i])
764-
return false;
765760
}
766-
767-
/* There are ndatums+1 indexes in case of range partitions */
768-
if (b1->strategy == PARTITION_STRATEGY_RANGE &&
769-
b1->indexes[i] != b2->indexes[i])
770-
return false;
771761
}
772762
return true;
773763
}
@@ -783,17 +773,16 @@ partition_bounds_copy(PartitionBoundInfo src,
783773
PartitionBoundInfo dest;
784774
int i;
785775
int ndatums;
776+
int nindexes;
786777
int partnatts;
787-
int num_indexes;
788778

789779
dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData));
790780

791781
dest->strategy = src->strategy;
792782
ndatums = dest->ndatums = src->ndatums;
783+
nindexes = dest->nindexes = src->nindexes;
793784
partnatts = key->partnatts;
794785

795-
num_indexes = get_partition_bound_num_indexes(src);
796-
797786
/* List partitioned tables have only a single partition key. */
798787
Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1);
799788

@@ -851,8 +840,8 @@ partition_bounds_copy(PartitionBoundInfo src,
851840
}
852841
}
853842

854-
dest->indexes = (int *) palloc(sizeof(int) * num_indexes);
855-
memcpy(dest->indexes, src->indexes, sizeof(int) * num_indexes);
843+
dest->indexes = (int *) palloc(sizeof(int) * nindexes);
844+
memcpy(dest->indexes, src->indexes, sizeof(int) * nindexes);
856845

857846
dest->null_index = src->null_index;
858847
dest->default_index = src->default_index;
@@ -1016,7 +1005,7 @@ check_new_partition_bound(char *relname, Relation parent,
10161005
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
10171006
errmsg("every hash partition modulus must be a factor of the next larger modulus")));
10181007

1019-
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
1008+
greatest_modulus = boundinfo->nindexes;
10201009
remainder = spec->remainder;
10211010

10221011
/*
@@ -1380,18 +1369,15 @@ check_default_partition_contents(Relation parent, Relation default_rel,
13801369
/*
13811370
* get_hash_partition_greatest_modulus
13821371
*
1383-
* Returns the greatest modulus of the hash partition bound. The greatest
1384-
* modulus will be at the end of the datums array because hash partitions are
1385-
* arranged in the ascending order of their moduli and remainders.
1372+
* Returns the greatest modulus of the hash partition bound.
1373+
* This is no longer used in the core code, but we keep it around
1374+
* in case external modules are using it.
13861375
*/
13871376
int
13881377
get_hash_partition_greatest_modulus(PartitionBoundInfo bound)
13891378
{
13901379
Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH);
1391-
Assert(bound->datums && bound->ndatums > 0);
1392-
Assert(DatumGetInt32(bound->datums[bound->ndatums - 1][0]) > 0);
1393-
1394-
return DatumGetInt32(bound->datums[bound->ndatums - 1][0]);
1380+
return bound->nindexes;
13951381
}
13961382

13971383
/*
@@ -1788,46 +1774,6 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
17881774
b1->lower, b2);
17891775
}
17901776

1791-
/*
1792-
* get_partition_bound_num_indexes
1793-
*
1794-
* Returns the number of the entries in the partition bound indexes array.
1795-
*/
1796-
static int
1797-
get_partition_bound_num_indexes(PartitionBoundInfo bound)
1798-
{
1799-
int num_indexes;
1800-
1801-
Assert(bound);
1802-
1803-
switch (bound->strategy)
1804-
{
1805-
case PARTITION_STRATEGY_HASH:
1806-
1807-
/*
1808-
* The number of the entries in the indexes array is same as the
1809-
* greatest modulus.
1810-
*/
1811-
num_indexes = get_hash_partition_greatest_modulus(bound);
1812-
break;
1813-
1814-
case PARTITION_STRATEGY_LIST:
1815-
num_indexes = bound->ndatums;
1816-
break;
1817-
1818-
case PARTITION_STRATEGY_RANGE:
1819-
/* Range partitioned table has an extra index. */
1820-
num_indexes = bound->ndatums + 1;
1821-
break;
1822-
1823-
default:
1824-
elog(ERROR, "unexpected partition strategy: %d",
1825-
(int) bound->strategy);
1826-
}
1827-
1828-
return num_indexes;
1829-
}
1830-
18311777
/*
18321778
* get_partition_operator
18331779
*

src/backend/partitioning/partprune.c

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -789,7 +789,10 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps)
789789
scan_default = final_result->scan_default;
790790
while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0)
791791
{
792-
int partindex = context->boundinfo->indexes[i];
792+
int partindex;
793+
794+
Assert(i < context->boundinfo->nindexes);
795+
partindex = context->boundinfo->indexes[i];
793796

794797
if (partindex < 0)
795798
{
@@ -2518,20 +2521,19 @@ get_matching_hash_bounds(PartitionPruneContext *context,
25182521
for (i = 0; i < partnatts; i++)
25192522
isnull[i] = bms_is_member(i, nullkeys);
25202523

2521-
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
25222524
rowHash = compute_partition_hash_value(partnatts, partsupfunc, partcollation,
25232525
values, isnull);
25242526

2527+
greatest_modulus = boundinfo->nindexes;
25252528
if (partindices[rowHash % greatest_modulus] >= 0)
25262529
result->bound_offsets =
25272530
bms_make_singleton(rowHash % greatest_modulus);
25282531
}
25292532
else
25302533
{
2531-
/* Getting here means at least one hash partition exists. */
2532-
Assert(boundinfo->ndatums > 0);
2534+
/* Report all valid offsets into the boundinfo->indexes array. */
25332535
result->bound_offsets = bms_add_range(NULL, 0,
2534-
boundinfo->ndatums - 1);
2536+
boundinfo->nindexes - 1);
25352537
}
25362538

25372539
/*
@@ -3392,30 +3394,20 @@ perform_pruning_combine_step(PartitionPruneContext *context,
33923394
PartitionPruneStepCombine *cstep,
33933395
PruneStepResult **step_results)
33943396
{
3395-
ListCell *lc1;
3396-
PruneStepResult *result = NULL;
3397+
PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
33973398
bool firststep;
3399+
ListCell *lc1;
33983400

33993401
/*
34003402
* A combine step without any source steps is an indication to not perform
34013403
* any partition pruning. Return all datum indexes in that case.
34023404
*/
3403-
result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
3404-
if (list_length(cstep->source_stepids) == 0)
3405+
if (cstep->source_stepids == NIL)
34053406
{
34063407
PartitionBoundInfo boundinfo = context->boundinfo;
3407-
int rangemax;
3408-
3409-
/*
3410-
* Add all valid offsets into the boundinfo->indexes array. For range
3411-
* partitioning, boundinfo->indexes contains (boundinfo->ndatums + 1)
3412-
* valid entries; otherwise there are boundinfo->ndatums.
3413-
*/
3414-
rangemax = context->strategy == PARTITION_STRATEGY_RANGE ?
3415-
boundinfo->ndatums : boundinfo->ndatums - 1;
34163408

34173409
result->bound_offsets =
3418-
bms_add_range(result->bound_offsets, 0, rangemax);
3410+
bms_add_range(NULL, 0, boundinfo->nindexes - 1);
34193411
result->scan_default = partition_bound_has_default(boundinfo);
34203412
result->scan_null = partition_bound_accepts_nulls(boundinfo);
34213413
return result;

src/include/partitioning/partbounds.h

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
* In the case of range partitioning, ndatums will typically be far less than
3131
* 2 * nparts, because a partition's upper bound and the next partition's lower
3232
* bound are the same in most common cases, and we only store one of them (the
33-
* upper bound). In case of hash partitioning, ndatums will be same as the
33+
* upper bound). In case of hash partitioning, ndatums will be the same as the
3434
* number of partitions.
3535
*
3636
* For range and list partitioned tables, datums is an array of datum-tuples
@@ -46,20 +46,26 @@
4646
* the partition key's operator classes and collations.
4747
*
4848
* In the case of list partitioning, the indexes array stores one entry for
49-
* every datum, which is the index of the partition that accepts a given datum.
50-
* In case of range partitioning, it stores one entry per distinct range
51-
* datum, which is the index of the partition for which a given datum
52-
* is an upper bound. In the case of hash partitioning, the number of the
53-
* entries in the indexes array is same as the greatest modulus amongst all
54-
* partitions. For a given partition key datum-tuple, the index of the
55-
* partition which would accept that datum-tuple would be given by the entry
56-
* pointed by remainder produced when hash value of the datum-tuple is divided
57-
* by the greatest modulus.
49+
* each datum-array entry, which is the index of the partition that accepts
50+
* rows matching that datum. So nindexes == ndatums.
51+
*
52+
* In the case of range partitioning, the indexes array stores one entry per
53+
* distinct range datum, which is the index of the partition for which that
54+
* datum is an upper bound (or -1 for a "gap" that has no partition). It is
55+
* convenient to have an extra -1 entry representing values above the last
56+
* range datum, so nindexes == ndatums + 1.
57+
*
58+
* In the case of hash partitioning, the number of entries in the indexes
59+
* array is the same as the greatest modulus amongst all partitions (which
60+
* is a multiple of all partition moduli), so nindexes == greatest modulus.
61+
* The indexes array is indexed according to the hash key's remainder modulo
62+
* the greatest modulus, and it contains either the partition index accepting
63+
* that remainder, or -1 if there is no partition for that remainder.
5864
*/
5965
typedef struct PartitionBoundInfoData
6066
{
6167
char strategy; /* hash, list or range? */
62-
int ndatums; /* Length of the datums following array */
68+
int ndatums; /* Length of the datums[] array */
6369
Datum **datums;
6470
PartitionRangeDatumKind **kind; /* The kind of each range bound datum;
6571
* NULL for hash and list partitioned
@@ -69,6 +75,7 @@ typedef struct PartitionBoundInfoData
6975
* if there isn't one */
7076
int default_index; /* Index of the default partition; -1 if there
7177
* isn't one */
78+
int nindexes; /* Length of the indexes[] array */
7279
} PartitionBoundInfoData;
7380

7481
#define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1)

0 commit comments

Comments
 (0)