Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 7f1921c

Browse files
committed
Fix hash partition pruning with asymmetric partition sets.
perform_pruning_combine_step() was not taught about the number of partition indexes used in hash partitioning; more embarrassingly, get_matching_hash_bounds() also had it wrong. These errors are masked in the common case where all the partitions have the same modulus and no partition is missing. However, with missing or unequal-size partitions, we could erroneously prune some partitions that need to be scanned, leading to silently wrong query answers. While a minimal-footprint fix for this could be to export get_partition_bound_num_indexes and make the incorrect functions use it, I'm of the opinion that that function should never have existed in the first place. It's not reasonable data structure design that PartitionBoundInfoData lacks any explicit record of the length of its indexes[] array. Perhaps that was all right when it could always be assumed equal to ndatums, but something should have been done about it as soon as that stopped being true. Putting in an explicit "nindexes" field makes both partition_bounds_equal() and partition_bounds_copy() simpler, safer, and faster than before, and removes explicit knowledge of the number-of-partition-indexes rules from some other places too. This change also makes get_hash_partition_greatest_modulus obsolete. I left that in place in case any external code uses it, but no core code does anymore. Per bug #16840 from Michał Albrycht. Back-patch to v11 where the hash partitioning code came in. (In the back branches, add the new field at the end of PartitionBoundInfoData to minimize ABI risks.) Discussion: https://postgr.es/m/16840-571a22976f829ad4@postgresql.org
1 parent 1449770 commit 7f1921c

File tree

6 files changed

+124
-141
lines changed

6 files changed

+124
-141
lines changed

src/backend/executor/execPartition.c

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1312,16 +1312,14 @@ get_partition_for_tuple(PartitionDispatch pd, Datum *values, bool *isnull)
13121312
{
13131313
case PARTITION_STRATEGY_HASH:
13141314
{
1315-
int greatest_modulus;
13161315
uint64 rowHash;
13171316

1318-
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
13191317
rowHash = compute_partition_hash_value(key->partnatts,
13201318
key->partsupfunc,
13211319
key->partcollation,
13221320
values, isnull);
13231321

1324-
part_index = boundinfo->indexes[rowHash % greatest_modulus];
1322+
part_index = boundinfo->indexes[rowHash % boundinfo->nindexes];
13251323
}
13261324
break;
13271325

src/backend/partitioning/partbounds.c

Lines changed: 31 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,6 @@ static int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc,
224224
Oid *partcollation,
225225
PartitionBoundInfo boundinfo,
226226
PartitionRangeBound *probe, bool *is_equal);
227-
static int get_partition_bound_num_indexes(PartitionBoundInfo b);
228227
static Expr *make_partition_op_expr(PartitionKey key, int keynum,
229228
uint16 strategy, Expr *arg1, Expr *arg2);
230229
static Oid get_partition_operator(PartitionKey key, int col,
@@ -398,6 +397,7 @@ create_hash_bounds(PartitionBoundSpec **boundspecs, int nparts,
398397

399398
boundinfo->ndatums = ndatums;
400399
boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
400+
boundinfo->nindexes = greatest_modulus;
401401
boundinfo->indexes = (int *) palloc(greatest_modulus * sizeof(int));
402402
for (i = 0; i < greatest_modulus; i++)
403403
boundinfo->indexes[i] = -1;
@@ -530,6 +530,7 @@ create_list_bounds(PartitionBoundSpec **boundspecs, int nparts,
530530

531531
boundinfo->ndatums = ndatums;
532532
boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *));
533+
boundinfo->nindexes = ndatums;
533534
boundinfo->indexes = (int *) palloc(ndatums * sizeof(int));
534535

535536
/*
@@ -725,8 +726,9 @@ create_range_bounds(PartitionBoundSpec **boundspecs, int nparts,
725726

726727
/*
727728
* For range partitioning, an additional value of -1 is stored as the last
728-
* element.
729+
* element of the indexes[] array.
729730
*/
731+
boundinfo->nindexes = ndatums + 1;
730732
boundinfo->indexes = (int *) palloc((ndatums + 1) * sizeof(int));
731733

732734
for (i = 0; i < ndatums; i++)
@@ -807,45 +809,41 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
807809
if (b1->ndatums != b2->ndatums)
808810
return false;
809811

812+
if (b1->nindexes != b2->nindexes)
813+
return false;
814+
810815
if (b1->null_index != b2->null_index)
811816
return false;
812817

813818
if (b1->default_index != b2->default_index)
814819
return false;
815820

816-
if (b1->strategy == PARTITION_STRATEGY_HASH)
821+
/* For all partition strategies, the indexes[] arrays have to match */
822+
for (i = 0; i < b1->nindexes; i++)
817823
{
818-
int greatest_modulus = get_hash_partition_greatest_modulus(b1);
819-
820-
/*
821-
* If two hash partitioned tables have different greatest moduli,
822-
* their partition schemes don't match.
823-
*/
824-
if (greatest_modulus != get_hash_partition_greatest_modulus(b2))
824+
if (b1->indexes[i] != b2->indexes[i])
825825
return false;
826+
}
826827

828+
/* Finally, compare the datums[] arrays */
829+
if (b1->strategy == PARTITION_STRATEGY_HASH)
830+
{
827831
/*
828832
* We arrange the partitions in the ascending order of their moduli
829833
* and remainders. Also every modulus is factor of next larger
830834
* modulus. Therefore we can safely store index of a given partition
831835
* in indexes array at remainder of that partition. Also entries at
832836
* (remainder + N * modulus) positions in indexes array are all same
833-
* for (modulus, remainder) specification for any partition. Thus
834-
* datums array from both the given bounds are same, if and only if
835-
* their indexes array will be same. So, it suffices to compare
836-
* indexes array.
837-
*/
838-
for (i = 0; i < greatest_modulus; i++)
839-
if (b1->indexes[i] != b2->indexes[i])
840-
return false;
841-
842-
#ifdef USE_ASSERT_CHECKING
843-
844-
/*
845-
* Nonetheless make sure that the bounds are indeed same when the
837+
* for (modulus, remainder) specification for any partition. Thus the
838+
* datums arrays from the given bounds are the same, if and only if
839+
* their indexes arrays are the same. So, it suffices to compare the
840+
* indexes arrays.
841+
*
842+
* Nonetheless make sure that the bounds are indeed the same when the
846843
* indexes match. Hash partition bound stores modulus and remainder
847844
* at b1->datums[i][0] and b1->datums[i][1] position respectively.
848845
*/
846+
#ifdef USE_ASSERT_CHECKING
849847
for (i = 0; i < b1->ndatums; i++)
850848
Assert((b1->datums[i][0] == b2->datums[i][0] &&
851849
b1->datums[i][1] == b2->datums[i][1]));
@@ -891,15 +889,7 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval,
891889
parttypbyval[j], parttyplen[j]))
892890
return false;
893891
}
894-
895-
if (b1->indexes[i] != b2->indexes[i])
896-
return false;
897892
}
898-
899-
/* There are ndatums+1 indexes in case of range partitions */
900-
if (b1->strategy == PARTITION_STRATEGY_RANGE &&
901-
b1->indexes[i] != b2->indexes[i])
902-
return false;
903893
}
904894
return true;
905895
}
@@ -920,19 +910,18 @@ partition_bounds_copy(PartitionBoundInfo src,
920910
PartitionBoundInfo dest;
921911
int i;
922912
int ndatums;
913+
int nindexes;
923914
int partnatts;
924-
int num_indexes;
925915
bool hash_part;
926916
int natts;
927917

928918
dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData));
929919

930920
dest->strategy = src->strategy;
931921
ndatums = dest->ndatums = src->ndatums;
922+
nindexes = dest->nindexes = src->nindexes;
932923
partnatts = key->partnatts;
933924

934-
num_indexes = get_partition_bound_num_indexes(src);
935-
936925
/* List partitioned tables have only a single partition key. */
937926
Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1);
938927

@@ -990,8 +979,8 @@ partition_bounds_copy(PartitionBoundInfo src,
990979
}
991980
}
992981

993-
dest->indexes = (int *) palloc(sizeof(int) * num_indexes);
994-
memcpy(dest->indexes, src->indexes, sizeof(int) * num_indexes);
982+
dest->indexes = (int *) palloc(sizeof(int) * nindexes);
983+
memcpy(dest->indexes, src->indexes, sizeof(int) * nindexes);
995984

996985
dest->null_index = src->null_index;
997986
dest->default_index = src->default_index;
@@ -2456,6 +2445,7 @@ build_merged_partition_bounds(char strategy, List *merged_datums,
24562445
}
24572446

24582447
Assert(list_length(merged_indexes) == ndatums);
2448+
merged_bounds->nindexes = ndatums;
24592449
merged_bounds->indexes = (int *) palloc(sizeof(int) * ndatums);
24602450
pos = 0;
24612451
foreach(lc, merged_indexes)
@@ -2889,7 +2879,7 @@ check_new_partition_bound(char *relname, Relation parent,
28892879
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
28902880
errmsg("every hash partition modulus must be a factor of the next larger modulus")));
28912881

2892-
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
2882+
greatest_modulus = boundinfo->nindexes;
28932883
remainder = spec->remainder;
28942884

28952885
/*
@@ -3254,18 +3244,15 @@ check_default_partition_contents(Relation parent, Relation default_rel,
32543244
/*
32553245
* get_hash_partition_greatest_modulus
32563246
*
3257-
* Returns the greatest modulus of the hash partition bound. The greatest
3258-
* modulus will be at the end of the datums array because hash partitions are
3259-
* arranged in the ascending order of their moduli and remainders.
3247+
* Returns the greatest modulus of the hash partition bound.
3248+
* This is no longer used in the core code, but we keep it around
3249+
* in case external modules are using it.
32603250
*/
32613251
int
32623252
get_hash_partition_greatest_modulus(PartitionBoundInfo bound)
32633253
{
32643254
Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH);
3265-
Assert(bound->datums && bound->ndatums > 0);
3266-
Assert(DatumGetInt32(bound->datums[bound->ndatums - 1][0]) > 0);
3267-
3268-
return DatumGetInt32(bound->datums[bound->ndatums - 1][0]);
3255+
return bound->nindexes;
32693256
}
32703257

32713258
/*
@@ -3662,46 +3649,6 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg)
36623649
b1->lower, b2);
36633650
}
36643651

3665-
/*
3666-
* get_partition_bound_num_indexes
3667-
*
3668-
* Returns the number of the entries in the partition bound indexes array.
3669-
*/
3670-
static int
3671-
get_partition_bound_num_indexes(PartitionBoundInfo bound)
3672-
{
3673-
int num_indexes;
3674-
3675-
Assert(bound);
3676-
3677-
switch (bound->strategy)
3678-
{
3679-
case PARTITION_STRATEGY_HASH:
3680-
3681-
/*
3682-
* The number of the entries in the indexes array is same as the
3683-
* greatest modulus.
3684-
*/
3685-
num_indexes = get_hash_partition_greatest_modulus(bound);
3686-
break;
3687-
3688-
case PARTITION_STRATEGY_LIST:
3689-
num_indexes = bound->ndatums;
3690-
break;
3691-
3692-
case PARTITION_STRATEGY_RANGE:
3693-
/* Range partitioned table has an extra index. */
3694-
num_indexes = bound->ndatums + 1;
3695-
break;
3696-
3697-
default:
3698-
elog(ERROR, "unexpected partition strategy: %d",
3699-
(int) bound->strategy);
3700-
}
3701-
3702-
return num_indexes;
3703-
}
3704-
37053652
/*
37063653
* get_partition_operator
37073654
*

src/backend/partitioning/partprune.c

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -776,7 +776,10 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps)
776776
scan_default = final_result->scan_default;
777777
while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0)
778778
{
779-
int partindex = context->boundinfo->indexes[i];
779+
int partindex;
780+
781+
Assert(i < context->boundinfo->nindexes);
782+
partindex = context->boundinfo->indexes[i];
780783

781784
if (partindex < 0)
782785
{
@@ -2509,20 +2512,19 @@ get_matching_hash_bounds(PartitionPruneContext *context,
25092512
for (i = 0; i < partnatts; i++)
25102513
isnull[i] = bms_is_member(i, nullkeys);
25112514

2512-
greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
25132515
rowHash = compute_partition_hash_value(partnatts, partsupfunc, partcollation,
25142516
values, isnull);
25152517

2518+
greatest_modulus = boundinfo->nindexes;
25162519
if (partindices[rowHash % greatest_modulus] >= 0)
25172520
result->bound_offsets =
25182521
bms_make_singleton(rowHash % greatest_modulus);
25192522
}
25202523
else
25212524
{
2522-
/* Getting here means at least one hash partition exists. */
2523-
Assert(boundinfo->ndatums > 0);
2525+
/* Report all valid offsets into the boundinfo->indexes array. */
25242526
result->bound_offsets = bms_add_range(NULL, 0,
2525-
boundinfo->ndatums - 1);
2527+
boundinfo->nindexes - 1);
25262528
}
25272529

25282530
/*
@@ -3383,30 +3385,20 @@ perform_pruning_combine_step(PartitionPruneContext *context,
33833385
PartitionPruneStepCombine *cstep,
33843386
PruneStepResult **step_results)
33853387
{
3386-
ListCell *lc1;
3387-
PruneStepResult *result = NULL;
3388+
PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
33883389
bool firststep;
3390+
ListCell *lc1;
33893391

33903392
/*
33913393
* A combine step without any source steps is an indication to not perform
33923394
* any partition pruning. Return all datum indexes in that case.
33933395
*/
3394-
result = (PruneStepResult *) palloc0(sizeof(PruneStepResult));
3395-
if (list_length(cstep->source_stepids) == 0)
3396+
if (cstep->source_stepids == NIL)
33963397
{
33973398
PartitionBoundInfo boundinfo = context->boundinfo;
3398-
int rangemax;
3399-
3400-
/*
3401-
* Add all valid offsets into the boundinfo->indexes array. For range
3402-
* partitioning, boundinfo->indexes contains (boundinfo->ndatums + 1)
3403-
* valid entries; otherwise there are boundinfo->ndatums.
3404-
*/
3405-
rangemax = context->strategy == PARTITION_STRATEGY_RANGE ?
3406-
boundinfo->ndatums : boundinfo->ndatums - 1;
34073399

34083400
result->bound_offsets =
3409-
bms_add_range(result->bound_offsets, 0, rangemax);
3401+
bms_add_range(NULL, 0, boundinfo->nindexes - 1);
34103402
result->scan_default = partition_bound_has_default(boundinfo);
34113403
result->scan_null = partition_bound_accepts_nulls(boundinfo);
34123404
return result;

src/include/partitioning/partbounds.h

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ struct RelOptInfo; /* avoid including pathnodes.h here */
3131
* In the case of range partitioning, ndatums will typically be far less than
3232
* 2 * nparts, because a partition's upper bound and the next partition's lower
3333
* bound are the same in most common cases, and we only store one of them (the
34-
* upper bound). In case of hash partitioning, ndatums will be same as the
34+
* upper bound). In case of hash partitioning, ndatums will be the same as the
3535
* number of partitions.
3636
*
3737
* For range and list partitioned tables, datums is an array of datum-tuples
@@ -47,20 +47,26 @@ struct RelOptInfo; /* avoid including pathnodes.h here */
4747
* the partition key's operator classes and collations.
4848
*
4949
* In the case of list partitioning, the indexes array stores one entry for
50-
* every datum, which is the index of the partition that accepts a given datum.
51-
* In case of range partitioning, it stores one entry per distinct range
52-
* datum, which is the index of the partition for which a given datum
53-
* is an upper bound. In the case of hash partitioning, the number of the
54-
* entries in the indexes array is same as the greatest modulus amongst all
55-
* partitions. For a given partition key datum-tuple, the index of the
56-
* partition which would accept that datum-tuple would be given by the entry
57-
* pointed by remainder produced when hash value of the datum-tuple is divided
58-
* by the greatest modulus.
50+
* each datum-array entry, which is the index of the partition that accepts
51+
* rows matching that datum. So nindexes == ndatums.
52+
*
53+
* In the case of range partitioning, the indexes array stores one entry per
54+
* distinct range datum, which is the index of the partition for which that
55+
* datum is an upper bound (or -1 for a "gap" that has no partition). It is
56+
* convenient to have an extra -1 entry representing values above the last
57+
* range datum, so nindexes == ndatums + 1.
58+
*
59+
* In the case of hash partitioning, the number of entries in the indexes
60+
* array is the same as the greatest modulus amongst all partitions (which
61+
* is a multiple of all partition moduli), so nindexes == greatest modulus.
62+
* The indexes array is indexed according to the hash key's remainder modulo
63+
* the greatest modulus, and it contains either the partition index accepting
64+
* that remainder, or -1 if there is no partition for that remainder.
5965
*/
6066
typedef struct PartitionBoundInfoData
6167
{
6268
char strategy; /* hash, list or range? */
63-
int ndatums; /* Length of the datums following array */
69+
int ndatums; /* Length of the datums[] array */
6470
Datum **datums;
6571
PartitionRangeDatumKind **kind; /* The kind of each range bound datum;
6672
* NULL for hash and list partitioned
@@ -70,6 +76,7 @@ typedef struct PartitionBoundInfoData
7076
* if there isn't one */
7177
int default_index; /* Index of the default partition; -1 if there
7278
* isn't one */
79+
int nindexes; /* Length of the indexes[] array */
7380
} PartitionBoundInfoData;
7481

7582
#define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1)

0 commit comments

Comments
 (0)