Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 1d9351a

Browse files
committed
Fix hash partition pruning with asymmetric partition sets.
perform_pruning_combine_step() was not taught about the number of partition indexes used in hash partitioning; more embarrassingly, get_matching_hash_bounds() also had it wrong. These errors are masked in the common case where all the partitions have the same modulus and no partition is missing. However, with missing or unequal-size partitions, we could erroneously prune some partitions that need to be scanned, leading to silently wrong query answers. While a minimal-footprint fix for this could be to export get_partition_bound_num_indexes and make the incorrect functions use it, I'm of the opinion that that function should never have existed in the first place. It's not reasonable data structure design that PartitionBoundInfoData lacks any explicit record of the length of its indexes[] array. Perhaps that was all right when it could always be assumed equal to ndatums, but something should have been done about it as soon as that stopped being true. Putting in an explicit "nindexes" field makes both partition_bounds_equal() and partition_bounds_copy() simpler, safer, and faster than before, and removes explicit knowledge of the number-of-partition-indexes rules from some other places too. This change also makes get_hash_partition_greatest_modulus obsolete. I left that in place in case any external code uses it, but no core code does anymore. Per bug #16840 from Michał Albrycht. Back-patch to v11 where the hash partitioning code came in. (In the back branches, add the new field at the end of PartitionBoundInfoData to minimize ABI risks.) Discussion: https://postgr.es/m/16840-571a22976f829ad4@postgresql.org
1 parent 1b242f4 commit 1d9351a

File tree

6 files changed

+124
-141
lines changed

6 files changed

+124
-141
lines changed

src/backend/executor/execPartition.c

+1-3
Original file line numberDiff line numberDiff line change
@@ -1323,16 +1323,14 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
13231323
{
13241324
case PARTITION_STRATEGY_HASH:
13251325
{
1326-
int greatest_modulus;
13271326
uint64 rowHash;
13281327

1329-
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
13301328
rowHash = compute_partition_hash_value(key->partnatts,
13311329
key->partsupfunc,
13321330
key->partcollation,
13331331
values, isnull);
13341332

1335-
part_index = boundinfo->indexes[rowHash % greatest_modulus];
1333+
part_index = boundinfo->indexes[rowHash % boundinfo->nindexes];
13361334
}
13371335
break;
13381336

src/backend/partitioning/partbounds.c

+31-84
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,6 @@ static int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc,
224224
Oid *partcollation,
225225
PartitionBoundInfo boundinfo,
226226
PartitionRangeBound *probe, int32 *cmpval);
227-
static int get_partition_bound_num_indexes(PartitionBoundInfo b);
228227
static Expr *make_partition_op_expr(PartitionKey key, int keynum,
229228
uint16 strategy, Expr *arg1, Expr *arg2);
230229
static Oid get_partition_operator(PartitionKey key, int col,
@@ -398,6 +397,7 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts,
398397

399398
boundinfo->ndatums = ndatums;
400399
boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
400+
boundinfo->nindexes = greatest_modulus;
401401
boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int));
402402
for (i = 0; i < greatest_modulus; i++)
403403
boundinfo->indexes[i] = -1;
@@ -530,6 +530,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts,
530530

531531
boundinfo->ndatums = ndatums;
532532
boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
533+
boundinfo->nindexes = ndatums;
533534
boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));
534535

535536
/*
@@ -725,8 +726,9 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts,
725726

726727
/*
727728
* For range partitioning, an additional value of -1 is stored as the last
728-
* element.
729+
* element of the indexes[] array.
729730
*/
731+
boundinfo->nindexes = ndatums + 1;
730732
boundinfo->indexes = (int *) palloc((ndatums + 1) * sizeof(int));
731733

732734
for (i = 0; i < ndatums; i++)
@@ -807,45 +809,41 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
807809
if (b1->ndatums != b2->ndatums)
808810
return false;
809811

812+
if (b1->nindexes != b2->nindexes)
813+
return false;
814+
810815
if (b1->null_index != b2->null_index)
811816
return false;
812817

813818
if (b1->default_index != b2->default_index)
814819
return false;
815820

816-
if (b1->strategy == PARTITION_STRATEGY_HASH)
821+
/* For all partition strategies, the indexes[] arrays have to match */
822+
for (i = 0; i < b1->nindexes; i++)
817823
{
818-
int greatest_modulus = get_hash_partition_greatest_modulus(b1);
819-
820-
/*
821-
* If two hash partitioned tables have different greatest moduli,
822-
* their partition schemes don't match.
823-
*/
824-
if (greatest_modulus != get_hash_partition_greatest_modulus(b2))
824+
if (b1->indexes[i] != b2->indexes[i])
825825
return false;
826+
}
826827

828+
/* Finally, compare the datums[] arrays */
829+
if (b1->strategy == PARTITION_STRATEGY_HASH)
830+
{
827831
/*
828832
* We arrange the partitions in the ascending order of their moduli
829833
* and remainders. Also every modulus is factor of next larger
830834
* modulus. Therefore we can safely store index of a given partition
831835
* in indexes array at remainder of that partition. Also entries at
832836
* (remainder + N * modulus) positions in indexes array are all same
833-
* for (modulus, remainder) specification for any partition. Thus
834-
* datums array from both the given bounds are same, if and only if
835-
* their indexes array will be same. So, it suffices to compare
836-
* indexes array.
837-
*/
838-
for (i = 0; i < greatest_modulus; i++)
839-
if (b1->indexes[i] != b2->indexes[i])
840-
return false;
841-
842-
#ifdef USE_ASSERT_CHECKING
843-
844-
/*
845-
* Nonetheless make sure that the bounds are indeed same when the
837+
* for (modulus, remainder) specification for any partition. Thus the
838+
* datums arrays from the given bounds are the same, if and only if
839+
* their indexes arrays are the same. So, it suffices to compare the
840+
* indexes arrays.
841+
*
842+
* Nonetheless make sure that the bounds are indeed the same when the
846843
* indexes match. Hash partition bound stores modulus and remainder
847844
* at b1->datums[i][0] and b1->datums[i][1] position respectively.
848845
*/
846+
#ifdef USE_ASSERT_CHECKING
849847
for (i = 0; i < b1->ndatums; i++)
850848
Assert((b1->datums[i][0] == b2->datums[i][0] &&
851849
b1->datums[i][1] == b2->datums[i][1]));
@@ -891,15 +889,7 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
891889
parttypbyval[j], parttyplen[j]))
892890
return false;
893891
}
894-
895-
if (b1->indexes[i] != b2->indexes[i])
896-
return false;
897892
}
898-
899-
/* There are ndatums+1 indexes in case of range partitions */
900-
if (b1->strategy == PARTITION_STRATEGY_RANGE &&
901-
b1->indexes[i] != b2->indexes[i])
902-
return false;
903893
}
904894
return true;
905895
}
@@ -920,19 +910,18 @@ partition_bounds_copy(PartitionBoundInfo src,
920910
PartitionBoundInfo dest;
921911
int i;
922912
int ndatums;
913+
int nindexes;
923914
int partnatts;
924-
int num_indexes;
925915
bool hash_part;
926916
int natts;
927917

928918
dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData));
929919

930920
dest->strategy = src->strategy;
931921
ndatums = dest->ndatums = src->ndatums;
922+
nindexes = dest->nindexes = src->nindexes;
932923
partnatts = key->partnatts;
933924

934-
num_indexes = get_partition_bound_num_indexes(src);
935-
936925
/* List partitioned tables have only a single partition key. */
937926
Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1);
938927

@@ -990,8 +979,8 @@ partition_bounds_copy(PartitionBoundInfo src,
990979
}
991980
}
992981

993-
dest->indexes = (int *) palloc(sizeof(int) * num_indexes);
994-
memcpy(dest->indexes, src->indexes, sizeof(int) * num_indexes);
982+
dest->indexes = (int *) palloc(sizeof(int) * nindexes);
983+
memcpy(dest->indexes, src->indexes, sizeof(int) * nindexes);
995984

996985
dest->null_index = src->null_index;
997986
dest->default_index = src->default_index;
@@ -2456,6 +2445,7 @@ build_merged_partition_bounds(char strategy, List *merged_datums,
24562445
}
24572446

24582447
Assert(list_length(merged_indexes) == ndatums);
2448+
merged_bounds->nindexes = ndatums;
24592449
merged_bounds->indexes = (int *) palloc(sizeof(int) * ndatums);
24602450
pos = 0;
24612451
foreach(lc, merged_indexes)
@@ -2889,7 +2879,7 @@ check_new_partition_bound(char *relname, Relation parent,
28892879
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
28902880
errmsg("every hash partition modulus must be a factor of the next larger modulus")));
28912881

2892-
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
2882+
greatest_modulus = boundinfo->nindexes;
28932883
remainder = spec->remainder;
28942884

28952885
/*
@@ -3282,18 +3272,15 @@ check_default_partition_contents(Relation parent, Relation default_rel,
32823272
/*
32833273
* get_hash_partition_greatest_modulus
32843274
*
3285-
* Returns the greatest modulus of the hash partition bound. The greatest
3286-
* modulus will be at the end of the datums array because hash partitions are
3287-
* arranged in the ascending order of their moduli and remainders.
3275+
* Returns the greatest modulus of the hash partition bound.
3276+
* This is no longer used in the core code, but we keep it around
3277+
* in case external modules are using it.
32883278
*/
32893279
int
32903280
get_hash_partition_greatest_modulus(PartitionBoundInfo bound)
32913281
{
32923282
Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH);
3293-
Assert(bound->datums && bound->ndatums > 0);
3294-
Assert(DatumGetInt32(bound->datums[bound->ndatums - 1][0]) > 0);
3295-
3296-
return DatumGetInt32(bound->datums[bound->ndatums - 1][0]);
3283+
return bound->nindexes;
32973284
}
32983285

32993286
/*
@@ -3697,46 +3684,6 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
36973684
b1, b2);
36983685
}
36993686

3700-
/*
3701-
* get_partition_bound_num_indexes
3702-
*
3703-
* Returns the number of the entries in the partition bound indexes array.
3704-
*/
3705-
static int
3706-
get_partition_bound_num_indexes(PartitionBoundInfo bound)
3707-
{
3708-
int num_indexes;
3709-
3710-
Assert(bound);
3711-
3712-
switch (bound->strategy)
3713-
{
3714-
case PARTITION_STRATEGY_HASH:
3715-
3716-
/*
3717-
* The number of the entries in the indexes array is same as the
3718-
* greatest modulus.
3719-
*/
3720-
num_indexes = get_hash_partition_greatest_modulus(bound);
3721-
break;
3722-
3723-
case PARTITION_STRATEGY_LIST:
3724-
num_indexes = bound->ndatums;
3725-
break;
3726-
3727-
case PARTITION_STRATEGY_RANGE:
3728-
/* Range partitioned table has an extra index. */
3729-
num_indexes = bound->ndatums + 1;
3730-
break;
3731-
3732-
default:
3733-
elog(ERROR, "unexpected partition strategy: %d",
3734-
(int) bound->strategy);
3735-
}
3736-
3737-
return num_indexes;
3738-
}
3739-
37403687
/*
37413688
* get_partition_operator
37423689
*

src/backend/partitioning/partprune.c

+11-19
Original file line numberDiff line numberDiff line change
@@ -781,7 +781,10 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps)
781781
scan_default = final_result->scan_default;
782782
while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0)
783783
{
784-
int partindex = context->boundinfo->indexes[i];
784+
int partindex;
785+
786+
Assert(i < context->boundinfo->nindexes);
787+
partindex = context->boundinfo->indexes[i];
785788

786789
if (partindex < 0)
787790
{
@@ -2514,20 +2517,19 @@ get_matching_hash_bounds(PartitionPruneContext *context,
25142517
for (i = 0; i < partnatts; i++)
25152518
isnull[i] = bms_is_member(i, nullkeys);
25162519

2517-
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
25182520
rowHash = compute_partition_hash_value(partnatts, partsupfunc, partcollation,
25192521
values, isnull);
25202522

2523+
greatest_modulus = boundinfo->nindexes;
25212524
if (partindices[rowHash % greatest_modulus] >= 0)
25222525
result->bound_offsets =
25232526
bms_make_singleton(rowHash % greatest_modulus);
25242527
}
25252528
else
25262529
{
2527-
/* Getting here means at least one hash partition exists. */
2528-
Assert(boundinfo->ndatums > 0);
2530+
/* Report all valid offsets into the boundinfo->indexes array. */
25292531
result->bound_offsets = bms_add_range(NULL, 0,
2530-
boundinfo->ndatums - 1);
2532+
boundinfo->nindexes - 1);
25312533
}
25322534

25332535
/*
@@ -3388,30 +3390,20 @@ perform_pruning_combine_step(PartitionPruneContext *context,
33883390
PartitionPruneStepCombine *cstep,
33893391
PruneStepResult **step_results)
33903392
{
3391-
ListCell *lc1;
3392-
PruneStepResult *result = NULL;
3393+
PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
33933394
bool firststep;
3395+
ListCell *lc1;
33943396

33953397
/*
33963398
* A combine step without any source steps is an indication to not perform
33973399
* any partition pruning. Return all datum indexes in that case.
33983400
*/
3399-
result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
3400-
if (list_length(cstep->source_stepids) == 0)
3401+
if (cstep->source_stepids == NIL)
34013402
{
34023403
PartitionBoundInfo boundinfo = context->boundinfo;
3403-
int rangemax;
3404-
3405-
/*
3406-
* Add all valid offsets into the boundinfo->indexes array. For range
3407-
* partitioning, boundinfo->indexes contains (boundinfo->ndatums + 1)
3408-
* valid entries; otherwise there are boundinfo->ndatums.
3409-
*/
3410-
rangemax = context->strategy == PARTITION_STRATEGY_RANGE ?
3411-
boundinfo->ndatums : boundinfo->ndatums - 1;
34123404

34133405
result->bound_offsets =
3414-
bms_add_range(result->bound_offsets, 0, rangemax);
3406+
bms_add_range(NULL, 0, boundinfo->nindexes - 1);
34153407
result->scan_default = partition_bound_has_default(boundinfo);
34163408
result->scan_null = partition_bound_accepts_nulls(boundinfo);
34173409
return result;

src/include/partitioning/partbounds.h

+18-11
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ struct RelOptInfo; /* avoid including pathnodes.h here */
3030
* In the case of range partitioning, ndatums will typically be far less than
3131
* 2 * nparts, because a partition's upper bound and the next partition's lower
3232
* bound are the same in most common cases, and we only store one of them (the
33-
* upper bound). In case of hash partitioning, ndatums will be same as the
33+
* upper bound). In case of hash partitioning, ndatums will be the same as the
3434
* number of partitions.
3535
*
3636
* For range and list partitioned tables, datums is an array of datum-tuples
@@ -46,24 +46,31 @@ struct RelOptInfo; /* avoid including pathnodes.h here */
4646
* the partition key's operator classes and collations.
4747
*
4848
* In the case of list partitioning, the indexes array stores one entry for
49-
* every datum, which is the index of the partition that accepts a given datum.
50-
* In case of range partitioning, it stores one entry per distinct range
51-
* datum, which is the index of the partition for which a given datum
52-
* is an upper bound. In the case of hash partitioning, the number of the
53-
* entries in the indexes array is same as the greatest modulus amongst all
54-
* partitions. For a given partition key datum-tuple, the index of the
55-
* partition which would accept that datum-tuple would be given by the entry
56-
* pointed by remainder produced when hash value of the datum-tuple is divided
57-
* by the greatest modulus.
49+
* each datum-array entry, which is the index of the partition that accepts
50+
* rows matching that datum. So nindexes == ndatums.
51+
*
52+
* In the case of range partitioning, the indexes array stores one entry per
53+
* distinct range datum, which is the index of the partition for which that
54+
* datum is an upper bound (or -1 for a "gap" that has no partition). It is
55+
* convenient to have an extra -1 entry representing values above the last
56+
* range datum, so nindexes == ndatums + 1.
57+
*
58+
* In the case of hash partitioning, the number of entries in the indexes
59+
* array is the same as the greatest modulus amongst all partitions (which
60+
* is a multiple of all partition moduli), so nindexes == greatest modulus.
61+
* The indexes array is indexed according to the hash key's remainder modulo
62+
* the greatest modulus, and it contains either the partition index accepting
63+
* that remainder, or -1 if there is no partition for that remainder.
5864
*/
5965
typedef struct PartitionBoundInfoData
6066
{
6167
char strategy; /* hash, list or range? */
62-
int ndatums; /* Length of the datums following array */
68+
int ndatums; /* Length of the datums[] array */
6369
Datum **datums;
6470
PartitionRangeDatumKind **kind; /* The kind of each range bound datum;
6571
* NULL for hash and list partitioned
6672
* tables */
73+
int nindexes; /* Length of the indexes[] array */
6774
int *indexes; /* Partition indexes */
6875
int null_index; /* Index of the null-accepting partition; -1
6976
* if there isn't one */

0 commit comments

Comments
 (0)