Optimize creation of slots for FDW bulk inserts

tvondra · tvondra · commit b676ac443b6a · 2021-06-11T20:23:33.000+02:00
Commit b663a41 introduced bulk inserts for FDW, but the handling of tuple slots turned out to be problematic for two reasons. Firstly, the slots were re-created for each individual batch. Secondly, all slots referenced the same tuple descriptor - with reasonably small batches this is not an issue, but with large batches this triggers O(N^2) behavior in the resource owner code. These two issues work against each other - to reduce the number of times a slot has to be created/dropped, larger batches are needed. However, the larger the batch, the more expensive the resource owner gets. For practical batch sizes (100 - 1000) this would not be a big problem, as the benefits (latency savings) greatly exceed the resource owner costs. But for extremely large batches it might be much worse, possibly even losing with non-batching mode. Fixed by initializing tuple slots only once (and reusing them across batches) and by using a new tuple descriptor copy for each slot. Discussion: https://postgr.es/m/ebbbcc7d-4286-8c28-0272-61b4753af761%40enterprisedb.com
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
@@ -703,16 +703,31 @@ ExecInsert(ModifyTableState *mtstate,
 													 resultRelInfo->ri_BatchSize);
 			}
 
-			resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots] =
-				MakeSingleTupleTableSlot(slot->tts_tupleDescriptor,
-										 slot->tts_ops);
-			ExecCopySlot(resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots],
-						 slot);
-			resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots] =
-				MakeSingleTupleTableSlot(planSlot->tts_tupleDescriptor,
-										 planSlot->tts_ops);
-			ExecCopySlot(resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots],
-						 planSlot);
+			/*
+			 * Initialize the batch slots. We don't know how many slots will be
+			 * needed, so we initialize them as the batch grows, and we keep
+			 * them across batches. To mitigate an inefficiency in how resource
+			 * owner handles objects with many references (as with many slots
+			 * all referencing the same tuple descriptor) we copy the tuple
+			 * descriptor for each slot.
+			 */
+			if (resultRelInfo->ri_NumSlots >= resultRelInfo->ri_NumSlotsInitialized)
+			{
+				TupleDesc tdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor);
+
+				resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots] =
+					MakeSingleTupleTableSlot(tdesc, slot->tts_ops);
+				ExecCopySlot(resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots],
+							 slot);
+
+				resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots] =
+					MakeSingleTupleTableSlot(tdesc, planSlot->tts_ops);
+				ExecCopySlot(resultRelInfo->ri_PlanSlots[resultRelInfo->ri_NumSlots],
+							 planSlot);
+
+				/* remember how many batch slots we initialized */
+				resultRelInfo->ri_NumSlotsInitialized++;
+			}
 
 			resultRelInfo->ri_NumSlots++;
 
@@ -1034,12 +1049,6 @@ ExecBatchInsert(ModifyTableState *mtstate,
 
 	if (canSetTag && numInserted > 0)
 		estate->es_processed += numInserted;
-
-	for (i = 0; i < numSlots; i++)
-	{
-		ExecDropSingleTupleTableSlot(slots[i]);
-		ExecDropSingleTupleTableSlot(planSlots[i]);
-	}
 }
 
 /* ----------------------------------------------------------------
@@ -3162,13 +3171,24 @@ ExecEndModifyTable(ModifyTableState *node)
 	 */
 	for (i = 0; i < node->mt_nrels; i++)
 	{
+		int j;
 		ResultRelInfo *resultRelInfo = node->resultRelInfo + i;
 
 		if (!resultRelInfo->ri_usesFdwDirectModify &&
 			resultRelInfo->ri_FdwRoutine != NULL &&
 			resultRelInfo->ri_FdwRoutine->EndForeignModify != NULL)
 			resultRelInfo->ri_FdwRoutine->EndForeignModify(node->ps.state,
 														   resultRelInfo);
+
+		/*
+		 * Cleanup the initialized batch slots. This only matters for FDWs with
+		 * batching, but the other cases will have ri_NumSlotsInitialized == 0.
+		 */
+		for (j = 0; j < resultRelInfo->ri_NumSlotsInitialized; j++)
+		{
+			ExecDropSingleTupleTableSlot(resultRelInfo->ri_Slots[j]);
+			ExecDropSingleTupleTableSlot(resultRelInfo->ri_PlanSlots[j]);
+		}
 	}
 
 	/*
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
@@ -462,6 +462,7 @@ typedef struct ResultRelInfo
 
 	/* batch insert stuff */
 	int			ri_NumSlots;	/* number of slots in the array */
+	int			ri_NumSlotsInitialized;	/* number of initialized slots */
 	int			ri_BatchSize;	/* max slots inserted in a single batch */
 	TupleTableSlot **ri_Slots;	/* input tuples for batch insert */
 	TupleTableSlot **ri_PlanSlots;