Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Fix up run-time partition pruning's use of relcache's partition data.
authorTom Lane <tgl@sss.pgh.pa.us>
Wed, 13 Jun 2018 16:03:19 +0000 (12:03 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Wed, 13 Jun 2018 16:03:26 +0000 (12:03 -0400)
The previous coding saved pointers into the partitioned table's relcache
entry, but then closed the relcache entry, causing those pointers to
nominally become dangling.  Actual trouble would be seen in the field
only if a relcache flush occurred mid-query, but that's hardly out of
the question.

While we could fix this by copying all the data in question at query
start, it seems better to just hold the relcache entry open for the
whole query.

While at it, improve the handling of support-function lookups: do that
once per query not once per pruning test.  There's still something to be
desired here, in that we fail to exploit the possibility of caching data
across queries in the fn_extra fields of the relcache's FmgrInfo structs,
which could happen if we just used those structs in-place rather than
copying them.  However, combining that with the possibility of per-query
lookups of cross-type comparison functions seems to require changes in the
APIs of a lot of the pruning support functions, so it's too invasive to
consider as part of this patch.  A win would ensue only for complex
partition key data types (e.g. arrays), so it may not be worth the
trouble.

David Rowley and Tom Lane

Discussion: https://postgr.es/m/17850.1528755844@sss.pgh.pa.us

src/backend/executor/execPartition.c
src/backend/executor/nodeAppend.c
src/backend/partitioning/partprune.c
src/backend/utils/cache/relcache.c
src/include/executor/execPartition.h
src/include/partitioning/partprune.h

index 33513ff1d155d013b8e38be7a49024c9f033cfe0..4eeee7c5e7f531ca74797cbd03b9699f9d61e351 100644 (file)
@@ -1357,11 +1357,14 @@ adjust_partition_tlist(List *tlist, TupleConversionMap *map)
  *
  * Functions:
  *
- * ExecSetupPartitionPruneState:
+ * ExecCreatePartitionPruneState:
  *     Creates the PartitionPruneState required by each of the two pruning
  *     functions.  Details stored include how to map the partition index
  *     returned by the partition pruning code into subplan indexes.
  *
+ * ExecDestroyPartitionPruneState:
+ *     Deletes a PartitionPruneState. Must be called during executor shutdown.
+ *
  * ExecFindInitialMatchingSubPlans:
  *     Returns indexes of matching subplans.  Partition pruning is attempted
  *     without any evaluation of expressions containing PARAM_EXEC Params.
@@ -1382,8 +1385,8 @@ adjust_partition_tlist(List *tlist, TupleConversionMap *map)
  */
 
 /*
- * ExecSetupPartitionPruneState
- *     Set up the data structure required for calling
+ * ExecCreatePartitionPruneState
+ *     Build the data structure required for calling
  *     ExecFindInitialMatchingSubPlans and ExecFindMatchingSubPlans.
  *
  * 'planstate' is the parent plan node's execution state.
@@ -1395,7 +1398,7 @@ adjust_partition_tlist(List *tlist, TupleConversionMap *map)
  * in each PartitionPruneInfo.
  */
 PartitionPruneState *
-ExecSetupPartitionPruneState(PlanState *planstate, List *partitionpruneinfo)
+ExecCreatePartitionPruneState(PlanState *planstate, List *partitionpruneinfo)
 {
    PartitionPruneState *prunestate;
    PartitionPruningData *prunedata;
@@ -1435,11 +1438,10 @@ ExecSetupPartitionPruneState(PlanState *planstate, List *partitionpruneinfo)
        PartitionPruningData *pprune = &prunedata[i];
        PartitionPruneContext *context = &pprune->context;
        PartitionDesc partdesc;
-       Relation    rel;
        PartitionKey partkey;
-       ListCell   *lc2;
        int         partnatts;
        int         n_steps;
+       ListCell   *lc2;
 
        /*
         * We must copy the subplan_map rather than pointing directly to the
@@ -1456,26 +1458,33 @@ ExecSetupPartitionPruneState(PlanState *planstate, List *partitionpruneinfo)
        pprune->present_parts = bms_copy(pinfo->present_parts);
 
        /*
-        * Grab some info from the table's relcache; lock was already obtained
-        * by ExecLockNonLeafAppendTables.
+        * We need to hold a pin on the partitioned table's relcache entry so
+        * that we can rely on its copies of the table's partition key and
+        * partition descriptor.  We need not get a lock though; one should
+        * have been acquired already by InitPlan or
+        * ExecLockNonLeafAppendTables.
         */
-       rel = relation_open(pinfo->reloid, NoLock);
+       context->partrel = relation_open(pinfo->reloid, NoLock);
 
-       partkey = RelationGetPartitionKey(rel);
-       partdesc = RelationGetPartitionDesc(rel);
+       partkey = RelationGetPartitionKey(context->partrel);
+       partdesc = RelationGetPartitionDesc(context->partrel);
+       n_steps = list_length(pinfo->pruning_steps);
 
        context->strategy = partkey->strategy;
        context->partnatts = partnatts = partkey->partnatts;
-       context->partopfamily = partkey->partopfamily;
-       context->partopcintype = partkey->partopcintype;
+       context->nparts = pinfo->nparts;
+       context->boundinfo = partdesc->boundinfo;
        context->partcollation = partkey->partcollation;
        context->partsupfunc = partkey->partsupfunc;
-       context->nparts = pinfo->nparts;
-       context->boundinfo = partition_bounds_copy(partdesc->boundinfo, partkey);
+
+       /* We'll look up type-specific support functions as needed */
+       context->stepcmpfuncs = (FmgrInfo *)
+           palloc0(sizeof(FmgrInfo) * n_steps * partnatts);
+
+       context->ppccontext = CurrentMemoryContext;
        context->planstate = planstate;
 
        /* Initialize expression state for each expression we need */
-       n_steps = list_length(pinfo->pruning_steps);
        context->exprstates = (ExprState **)
            palloc0(sizeof(ExprState *) * n_steps * partnatts);
        foreach(lc2, pinfo->pruning_steps)
@@ -1527,14 +1536,32 @@ ExecSetupPartitionPruneState(PlanState *planstate, List *partitionpruneinfo)
        prunestate->execparamids = bms_add_members(prunestate->execparamids,
                                                   pinfo->execparamids);
 
-       relation_close(rel, NoLock);
-
        i++;
    }
 
    return prunestate;
 }
 
+/*
+ * ExecDestroyPartitionPruneState
+ *     Release resources at plan shutdown.
+ *
+ * We don't bother to free any memory here, since the whole executor context
+ * will be going away shortly.  We do need to release our relcache pins.
+ */
+void
+ExecDestroyPartitionPruneState(PartitionPruneState *prunestate)
+{
+   int         i;
+
+   for (i = 0; i < prunestate->num_partprunedata; i++)
+   {
+       PartitionPruningData *pprune = &prunestate->partprunedata[i];
+
+       relation_close(pprune->context.partrel, NoLock);
+   }
+}
+
 /*
  * ExecFindInitialMatchingSubPlans
  *     Identify the set of subplans that cannot be eliminated by initial
index 6dd53e90ba8ec53c9251d8dc3e175ec41f1c593d..5ce4fb43e1a3caf83c61cb96c06af33d4c48ef20 100644 (file)
@@ -136,8 +136,10 @@ ExecInitAppend(Append *node, EState *estate, int eflags)
        /* We may need an expression context to evaluate partition exprs */
        ExecAssignExprContext(estate, &appendstate->ps);
 
-       prunestate = ExecSetupPartitionPruneState(&appendstate->ps,
-                                                 node->part_prune_infos);
+       /* Create the working data structure for pruning. */
+       prunestate = ExecCreatePartitionPruneState(&appendstate->ps,
+                                                  node->part_prune_infos);
+       appendstate->as_prune_state = prunestate;
 
        /* Perform an initial partition prune, if required. */
        if (prunestate->do_initial_prune)
@@ -178,8 +180,6 @@ ExecInitAppend(Append *node, EState *estate, int eflags)
         */
        if (!prunestate->do_exec_prune)
            appendstate->as_valid_subplans = bms_add_range(NULL, 0, nplans - 1);
-
-       appendstate->as_prune_state = prunestate;
    }
    else
    {
@@ -330,6 +330,12 @@ ExecEndAppend(AppendState *node)
     */
    for (i = 0; i < nplans; i++)
        ExecEndNode(appendplans[i]);
+
+   /*
+    * release any resources associated with run-time pruning
+    */
+   if (node->as_prune_state)
+       ExecDestroyPartitionPruneState(node->as_prune_state);
 }
 
 void
index 856bdd3a144ce40f33803f3b20e4aff96f7f2081..480b22e043ec51d619b03226e5370507ba9da644 100644 (file)
@@ -436,16 +436,20 @@ prune_append_rel_partitions(RelOptInfo *rel)
    if (contradictory)
        return NULL;
 
+   /* Set up PartitionPruneContext */
    context.strategy = rel->part_scheme->strategy;
    context.partnatts = rel->part_scheme->partnatts;
-   context.partopfamily = rel->part_scheme->partopfamily;
-   context.partopcintype = rel->part_scheme->partopcintype;
-   context.partcollation = rel->part_scheme->partcollation;
-   context.partsupfunc = rel->part_scheme->partsupfunc;
    context.nparts = rel->nparts;
    context.boundinfo = rel->boundinfo;
+   context.partcollation = rel->part_scheme->partcollation;
+   context.partsupfunc = rel->part_scheme->partsupfunc;
+   context.stepcmpfuncs = (FmgrInfo *) palloc0(sizeof(FmgrInfo) *
+                                               context.partnatts *
+                                               list_length(pruning_steps));
+   context.ppccontext = CurrentMemoryContext;
 
    /* These are not valid when being called from the planner */
+   context.partrel = NULL;
    context.planstate = NULL;
    context.exprstates = NULL;
    context.exprhasexecparam = NULL;
@@ -2809,7 +2813,8 @@ perform_pruning_base_step(PartitionPruneContext *context,
    int         keyno,
                nvalues;
    Datum       values[PARTITION_MAX_KEYS];
-   FmgrInfo    partsupfunc[PARTITION_MAX_KEYS];
+   FmgrInfo   *partsupfunc;
+   int         stateidx;
 
    /*
     * There better be the same number of expressions and compare functions.
@@ -2844,7 +2849,6 @@ perform_pruning_base_step(PartitionPruneContext *context,
        if (lc1 != NULL)
        {
            Expr       *expr;
-           int         stateidx;
            Datum       datum;
            bool        isnull;
 
@@ -2873,19 +2877,25 @@ perform_pruning_base_step(PartitionPruneContext *context,
                    return result;
                }
 
-               /*
-                * If we're going to need a different comparison function than
-                * the one cached in the PartitionKey, we'll need to look up
-                * the FmgrInfo.
-                */
+               /* Set up the stepcmpfuncs entry, unless we already did */
                cmpfn = lfirst_oid(lc2);
                Assert(OidIsValid(cmpfn));
-               if (cmpfn != context->partsupfunc[keyno].fn_oid)
-                   fmgr_info(cmpfn, &partsupfunc[keyno]);
-               else
-                   fmgr_info_copy(&partsupfunc[keyno],
-                                  &context->partsupfunc[keyno],
-                                  CurrentMemoryContext);
+               if (cmpfn != context->stepcmpfuncs[stateidx].fn_oid)
+               {
+                   /*
+                    * If the needed support function is the same one cached
+                    * in the relation's partition key, copy the cached
+                    * FmgrInfo.  Otherwise (i.e., when we have a cross-type
+                    * comparison), an actual lookup is required.
+                    */
+                   if (cmpfn == context->partsupfunc[keyno].fn_oid)
+                       fmgr_info_copy(&context->stepcmpfuncs[stateidx],
+                                      &context->partsupfunc[keyno],
+                                      context->ppccontext);
+                   else
+                       fmgr_info_cxt(cmpfn, &context->stepcmpfuncs[stateidx],
+                                     context->ppccontext);
+               }
 
                values[keyno] = datum;
                nvalues++;
@@ -2896,6 +2906,13 @@ perform_pruning_base_step(PartitionPruneContext *context,
        }
    }
 
+   /*
+    * Point partsupfunc to the entry for the 0th key of this step; the
+    * additional support functions, if any, follow consecutively.
+    */
+   stateidx = PruneCxtStateIdx(context->partnatts, opstep->step.step_id, 0);
+   partsupfunc = &context->stepcmpfuncs[stateidx];
+
    switch (context->strategy)
    {
        case PARTITION_STRATEGY_HASH:
index 7fed394f55283beb534122b42e3a5dffc362738d..d85dc925057586ad9c6d4e5b4b2b9d5adb58daf8 100644 (file)
@@ -2471,6 +2471,7 @@ RelationClearRelation(Relation relation, bool rebuild)
        keep_tupdesc = equalTupleDescs(relation->rd_att, newrel->rd_att);
        keep_rules = equalRuleLocks(relation->rd_rules, newrel->rd_rules);
        keep_policies = equalRSDesc(relation->rd_rsdesc, newrel->rd_rsdesc);
+       /* partkey is immutable once set up, so we can always keep it */
        keep_partkey = (relation->rd_partkey != NULL);
        keep_partdesc = equalPartitionDescs(relation->rd_partkey,
                                            relation->rd_partdesc,
@@ -2515,7 +2516,7 @@ RelationClearRelation(Relation relation, bool rebuild)
        SWAPFIELD(Form_pg_class, rd_rel);
        /* ... but actually, we don't have to update newrel->rd_rel */
        memcpy(relation->rd_rel, newrel->rd_rel, CLASS_TUPLE_SIZE);
-       /* preserve old tupledesc and rules if no logical change */
+       /* preserve old tupledesc, rules, policies if no logical change */
        if (keep_tupdesc)
            SWAPFIELD(TupleDesc, rd_att);
        if (keep_rules)
@@ -2529,13 +2530,12 @@ RelationClearRelation(Relation relation, bool rebuild)
        SWAPFIELD(Oid, rd_toastoid);
        /* pgstat_info must be preserved */
        SWAPFIELD(struct PgStat_TableStatus *, pgstat_info);
-       /* partition key must be preserved, if we have one */
+       /* preserve old partitioning info if no logical change */
        if (keep_partkey)
        {
            SWAPFIELD(PartitionKey, rd_partkey);
            SWAPFIELD(MemoryContext, rd_partkeycxt);
        }
-       /* preserve old partdesc if no logical change */
        if (keep_partdesc)
        {
            SWAPFIELD(PartitionDesc, rd_partdesc);
index 71d639fe6525ff0d5d59101a11566740314c3922..862bf650602900eb7726caf238be2efb3ca3fd73 100644 (file)
@@ -208,8 +208,9 @@ extern HeapTuple ConvertPartitionTupleSlot(TupleConversionMap *map,
                          TupleTableSlot **p_my_slot);
 extern void ExecCleanupTupleRouting(ModifyTableState *mtstate,
                        PartitionTupleRouting *proute);
-extern PartitionPruneState *ExecSetupPartitionPruneState(PlanState *planstate,
-                            List *partitionpruneinfo);
+extern PartitionPruneState *ExecCreatePartitionPruneState(PlanState *planstate,
+                             List *partitionpruneinfo);
+extern void ExecDestroyPartitionPruneState(PartitionPruneState *prunestate);
 extern Bitmapset *ExecFindMatchingSubPlans(PartitionPruneState *prunestate);
 extern Bitmapset *ExecFindInitialMatchingSubPlans(PartitionPruneState *prunestate,
                                int nsubplans);
index e3b3bfb7c113dd47c8411287a7e574254be40565..09147b532c816dff80976f8844d67134d78afc7a 100644 (file)
 
 /*
  * PartitionPruneContext
+ *     Stores information needed at runtime for pruning computations
+ *     related to a single partitioned table.
  *
- * Information about a partitioned table needed to perform partition pruning.
+ * partrel         Relcache pointer for the partitioned table,
+ *                 if we have it open (else NULL).
+ * strategy            Partition strategy, e.g. LIST, RANGE, HASH.
+ * partnatts       Number of columns in the partition key.
+ * nparts          Number of partitions in this partitioned table.
+ * boundinfo       Partition boundary info for the partitioned table.
+ * partcollation   Array of partnatts elements, storing the collations of the
+ *                 partition key columns.
+ * partsupfunc     Array of FmgrInfos for the comparison or hashing functions
+ *                 associated with the partition keys (partnatts elements).
+ *                 (This points into the partrel's partition key, typically.)
+ * stepcmpfuncs        Array of FmgrInfos for the comparison or hashing function
+ *                 for each pruning step and partition key.
+ * ppccontext      Memory context holding this PartitionPruneContext's
+ *                 subsidiary data, such as the FmgrInfos.
+ * planstate       Points to the parent plan node's PlanState when called
+ *                 during execution; NULL when called from the planner.
+ * exprstates      Array of ExprStates, indexed as per PruneCtxStateIdx; one
+ *                 for each partition key in each pruning step.  Allocated if
+ *                 planstate is non-NULL, otherwise NULL.
+ * exprhasexecparam    Array of bools, each true if corresponding 'exprstate'
+ *                 expression contains any PARAM_EXEC Params.  (Can be NULL
+ *                 if planstate is NULL.)
+ * evalexecparams  True if it's safe to evaluate PARAM_EXEC Params.
  */
 typedef struct PartitionPruneContext
 {
-   /* Partition key information */
+   Relation    partrel;
    char        strategy;
    int         partnatts;
-   Oid        *partopfamily;
-   Oid        *partopcintype;
-   Oid        *partcollation;
-   FmgrInfo   *partsupfunc;
-
-   /* Number of partitions */
    int         nparts;
-
-   /* Partition boundary info */
    PartitionBoundInfo boundinfo;
-
-   /*
-    * This will be set when the context is used from the executor, to allow
-    * Params to be evaluated.
-    */
+   Oid        *partcollation;
+   FmgrInfo   *partsupfunc;
+   FmgrInfo   *stepcmpfuncs;
+   MemoryContext ppccontext;
    PlanState  *planstate;
-
-   /*
-    * Array of ExprStates, indexed as per PruneCtxStateIdx; one for each
-    * partkey in each pruning step.  Allocated if planstate is non-NULL,
-    * otherwise NULL.
-    */
    ExprState **exprstates;
-
-   /*
-    * Similar array of flags, each true if corresponding 'exprstate'
-    * expression contains any PARAM_EXEC Params.  (Can be NULL if planstate
-    * is NULL.)
-    */
    bool       *exprhasexecparam;
-
-   /* true if it's safe to evaluate PARAM_EXEC Params */
    bool        evalexecparams;
 } PartitionPruneContext;
 
+/*
+ * PruneCxtStateIdx() computes the correct index into the stepcmpfuncs[],
+ * exprstates[] and exprhasexecparam[] arrays for step step_id and
+ * partition key column keyno.  (Note: there is code that assumes the
+ * entries for a given step are sequential, so this is not chosen freely.)
+ */
 #define PruneCxtStateIdx(partnatts, step_id, keyno) \
    ((partnatts) * (step_id) + (keyno))