@@ -1059,6 +1059,148 @@ ldelete:;
1059
1059
return NULL ;
1060
1060
}
1061
1061
1062
+ /*
1063
+ * ExecCrossPartitionUpdate --- Move an updated tuple to another partition.
1064
+ *
1065
+ * This works by first deleting the old tuple from the current partition,
1066
+ * followed by inserting the new tuple into the root parent table, that is,
1067
+ * mtstate->rootResultRelInfo. It will be re-routed from there to the
1068
+ * correct partition.
1069
+ *
1070
+ * Returns true if the tuple has been successfully moved, or if it's found
1071
+ * that the tuple was concurrently deleted so there's nothing more to do
1072
+ * for the caller.
1073
+ *
1074
+ * False is returned if the tuple we're trying to move is found to have been
1075
+ * concurrently updated. In that case, the caller must check if the
1076
+ * updated tuple that's returned in *retry_slot still needs to be re-routed,
1077
+ * and call this function again or perform a regular update accordingly.
1078
+ */
1079
+ static bool
1080
+ ExecCrossPartitionUpdate (ModifyTableState * mtstate ,
1081
+ ResultRelInfo * resultRelInfo ,
1082
+ ItemPointer tupleid , HeapTuple oldtuple ,
1083
+ TupleTableSlot * slot , TupleTableSlot * planSlot ,
1084
+ EPQState * epqstate , bool canSetTag ,
1085
+ TupleTableSlot * * retry_slot ,
1086
+ TupleTableSlot * * inserted_tuple )
1087
+ {
1088
+ EState * estate = mtstate -> ps .state ;
1089
+ PartitionTupleRouting * proute = mtstate -> mt_partition_tuple_routing ;
1090
+ int map_index ;
1091
+ TupleConversionMap * tupconv_map ;
1092
+ TupleConversionMap * saved_tcs_map = NULL ;
1093
+ bool tuple_deleted ;
1094
+ TupleTableSlot * epqslot = NULL ;
1095
+
1096
+ * inserted_tuple = NULL ;
1097
+ * retry_slot = NULL ;
1098
+
1099
+ /*
1100
+ * Disallow an INSERT ON CONFLICT DO UPDATE that causes the original row
1101
+ * to migrate to a different partition. Maybe this can be implemented
1102
+ * some day, but it seems a fringe feature with little redeeming value.
1103
+ */
1104
+ if (((ModifyTable * ) mtstate -> ps .plan )-> onConflictAction == ONCONFLICT_UPDATE )
1105
+ ereport (ERROR ,
1106
+ (errcode (ERRCODE_FEATURE_NOT_SUPPORTED ),
1107
+ errmsg ("invalid ON UPDATE specification" ),
1108
+ errdetail ("The result tuple would appear in a different partition than the original tuple." )));
1109
+
1110
+ /*
1111
+ * When an UPDATE is run on a leaf partition, we will not have partition
1112
+ * tuple routing set up. In that case, fail with partition constraint
1113
+ * violation error.
1114
+ */
1115
+ if (proute == NULL )
1116
+ ExecPartitionCheckEmitError (resultRelInfo , slot , estate );
1117
+
1118
+ /*
1119
+ * Row movement, part 1. Delete the tuple, but skip RETURNING processing.
1120
+ * We want to return rows from INSERT.
1121
+ */
1122
+ ExecDelete (mtstate , resultRelInfo , tupleid , oldtuple , planSlot ,
1123
+ epqstate , estate ,
1124
+ false, /* processReturning */
1125
+ false, /* canSetTag */
1126
+ true, /* changingPart */
1127
+ & tuple_deleted , & epqslot );
1128
+
1129
+ /*
1130
+ * For some reason if DELETE didn't happen (e.g. trigger prevented it, or
1131
+ * it was already deleted by self, or it was concurrently deleted by
1132
+ * another transaction), then we should skip the insert as well;
1133
+ * otherwise, an UPDATE could cause an increase in the total number of
1134
+ * rows across all partitions, which is clearly wrong.
1135
+ *
1136
+ * For a normal UPDATE, the case where the tuple has been the subject of a
1137
+ * concurrent UPDATE or DELETE would be handled by the EvalPlanQual
1138
+ * machinery, but for an UPDATE that we've translated into a DELETE from
1139
+ * this partition and an INSERT into some other partition, that's not
1140
+ * available, because CTID chains can't span relation boundaries. We
1141
+ * mimic the semantics to a limited extent by skipping the INSERT if the
1142
+ * DELETE fails to find a tuple. This ensures that two concurrent
1143
+ * attempts to UPDATE the same tuple at the same time can't turn one tuple
1144
+ * into two, and that an UPDATE of a just-deleted tuple can't resurrect
1145
+ * it.
1146
+ */
1147
+ if (!tuple_deleted )
1148
+ {
1149
+ /*
1150
+ * epqslot will be typically NULL. But when ExecDelete() finds that
1151
+ * another transaction has concurrently updated the same row, it
1152
+ * re-fetches the row, skips the delete, and epqslot is set to the
1153
+ * re-fetched tuple slot. In that case, we need to do all the checks
1154
+ * again.
1155
+ */
1156
+ if (TupIsNull (epqslot ))
1157
+ return true;
1158
+ else
1159
+ {
1160
+ * retry_slot = ExecFilterJunk (resultRelInfo -> ri_junkFilter , epqslot );
1161
+ return false;
1162
+ }
1163
+ }
1164
+
1165
+ /*
1166
+ * resultRelInfo is one of the per-subplan resultRelInfos. So we should
1167
+ * convert the tuple into root's tuple descriptor, since ExecInsert()
1168
+ * starts the search from root. The tuple conversion map list is in the
1169
+ * order of mtstate->resultRelInfo[], so to retrieve the one for this
1170
+ * resultRel, we need to know the position of the resultRel in
1171
+ * mtstate->resultRelInfo[].
1172
+ */
1173
+ map_index = resultRelInfo - mtstate -> resultRelInfo ;
1174
+ Assert (map_index >= 0 && map_index < mtstate -> mt_nplans );
1175
+ tupconv_map = tupconv_map_for_subplan (mtstate , map_index );
1176
+ if (tupconv_map != NULL )
1177
+ slot = execute_attr_map_slot (tupconv_map -> attrMap ,
1178
+ slot ,
1179
+ mtstate -> mt_root_tuple_slot );
1180
+
1181
+ /*
1182
+ * ExecInsert() may scribble on mtstate->mt_transition_capture, so save
1183
+ * the currently active map.
1184
+ */
1185
+ if (mtstate -> mt_transition_capture )
1186
+ saved_tcs_map = mtstate -> mt_transition_capture -> tcs_map ;
1187
+
1188
+ /* Tuple routing starts from the root table. */
1189
+ Assert (mtstate -> rootResultRelInfo != NULL );
1190
+ * inserted_tuple = ExecInsert (mtstate , mtstate -> rootResultRelInfo , slot ,
1191
+ planSlot , estate , canSetTag );
1192
+
1193
+ /* Clear the INSERT's tuple and restore the saved map. */
1194
+ if (mtstate -> mt_transition_capture )
1195
+ {
1196
+ mtstate -> mt_transition_capture -> tcs_original_insert_tuple = NULL ;
1197
+ mtstate -> mt_transition_capture -> tcs_map = saved_tcs_map ;
1198
+ }
1199
+
1200
+ /* We're done moving. */
1201
+ return true;
1202
+ }
1203
+
1062
1204
/* ----------------------------------------------------------------
1063
1205
* ExecUpdate
1064
1206
*
@@ -1212,119 +1354,28 @@ lreplace:;
1212
1354
*/
1213
1355
if (partition_constraint_failed )
1214
1356
{
1215
- bool tuple_deleted ;
1216
- TupleTableSlot * ret_slot ;
1217
- TupleTableSlot * epqslot = NULL ;
1218
- PartitionTupleRouting * proute = mtstate -> mt_partition_tuple_routing ;
1219
- int map_index ;
1220
- TupleConversionMap * tupconv_map ;
1221
- TupleConversionMap * saved_tcs_map = NULL ;
1222
-
1223
- /*
1224
- * Disallow an INSERT ON CONFLICT DO UPDATE that causes the
1225
- * original row to migrate to a different partition. Maybe this
1226
- * can be implemented some day, but it seems a fringe feature with
1227
- * little redeeming value.
1228
- */
1229
- if (((ModifyTable * ) mtstate -> ps .plan )-> onConflictAction == ONCONFLICT_UPDATE )
1230
- ereport (ERROR ,
1231
- (errcode (ERRCODE_FEATURE_NOT_SUPPORTED ),
1232
- errmsg ("invalid ON UPDATE specification" ),
1233
- errdetail ("The result tuple would appear in a different partition than the original tuple." )));
1234
-
1235
- /*
1236
- * When an UPDATE is run on a leaf partition, we will not have
1237
- * partition tuple routing set up. In that case, fail with
1238
- * partition constraint violation error.
1239
- */
1240
- if (proute == NULL )
1241
- ExecPartitionCheckEmitError (resultRelInfo , slot , estate );
1242
-
1243
- /*
1244
- * Row movement, part 1. Delete the tuple, but skip RETURNING
1245
- * processing. We want to return rows from INSERT.
1246
- */
1247
- ExecDelete (mtstate , resultRelInfo , tupleid , oldtuple , planSlot ,
1248
- epqstate , estate ,
1249
- false, /* processReturning */
1250
- false, /* canSetTag */
1251
- true, /* changingPart */
1252
- & tuple_deleted , & epqslot );
1253
-
1254
- /*
1255
- * For some reason if DELETE didn't happen (e.g. trigger prevented
1256
- * it, or it was already deleted by self, or it was concurrently
1257
- * deleted by another transaction), then we should skip the insert
1258
- * as well; otherwise, an UPDATE could cause an increase in the
1259
- * total number of rows across all partitions, which is clearly
1260
- * wrong.
1261
- *
1262
- * For a normal UPDATE, the case where the tuple has been the
1263
- * subject of a concurrent UPDATE or DELETE would be handled by
1264
- * the EvalPlanQual machinery, but for an UPDATE that we've
1265
- * translated into a DELETE from this partition and an INSERT into
1266
- * some other partition, that's not available, because CTID chains
1267
- * can't span relation boundaries. We mimic the semantics to a
1268
- * limited extent by skipping the INSERT if the DELETE fails to
1269
- * find a tuple. This ensures that two concurrent attempts to
1270
- * UPDATE the same tuple at the same time can't turn one tuple
1271
- * into two, and that an UPDATE of a just-deleted tuple can't
1272
- * resurrect it.
1273
- */
1274
- if (!tuple_deleted )
1275
- {
1276
- /*
1277
- * epqslot will be typically NULL. But when ExecDelete()
1278
- * finds that another transaction has concurrently updated the
1279
- * same row, it re-fetches the row, skips the delete, and
1280
- * epqslot is set to the re-fetched tuple slot. In that case,
1281
- * we need to do all the checks again.
1282
- */
1283
- if (TupIsNull (epqslot ))
1284
- return NULL ;
1285
- else
1286
- {
1287
- slot = ExecFilterJunk (resultRelInfo -> ri_junkFilter , epqslot );
1288
- goto lreplace ;
1289
- }
1290
- }
1357
+ TupleTableSlot * inserted_tuple ,
1358
+ * retry_slot ;
1359
+ bool retry ;
1291
1360
1292
1361
/*
1293
- * resultRelInfo is one of the per-subplan resultRelInfos. So we
1294
- * should convert the tuple into root's tuple descriptor, since
1295
- * ExecInsert() starts the search from root. The tuple conversion
1296
- * map list is in the order of mtstate->resultRelInfo[], so to
1297
- * retrieve the one for this resultRel, we need to know the
1298
- * position of the resultRel in mtstate->resultRelInfo[].
1362
+ * ExecCrossPartitionUpdate will first DELETE the row from the
1363
+ * partition it's currently in and then insert it back into the
1364
+ * root table, which will re-route it to the correct partition.
1365
+ * The first part may have to be repeated if it is detected that
1366
+ * the tuple we're trying to move has been concurrently updated.
1299
1367
*/
1300
- map_index = resultRelInfo - mtstate -> resultRelInfo ;
1301
- Assert (map_index >= 0 && map_index < mtstate -> mt_nplans );
1302
- tupconv_map = tupconv_map_for_subplan (mtstate , map_index );
1303
- if (tupconv_map != NULL )
1304
- slot = execute_attr_map_slot (tupconv_map -> attrMap ,
1305
- slot ,
1306
- mtstate -> mt_root_tuple_slot );
1307
-
1308
- /*
1309
- * ExecInsert() may scribble on mtstate->mt_transition_capture, so
1310
- * save the currently active map.
1311
- */
1312
- if (mtstate -> mt_transition_capture )
1313
- saved_tcs_map = mtstate -> mt_transition_capture -> tcs_map ;
1314
-
1315
- /* Tuple routing starts from the root table. */
1316
- Assert (mtstate -> rootResultRelInfo != NULL );
1317
- ret_slot = ExecInsert (mtstate , mtstate -> rootResultRelInfo , slot ,
1318
- planSlot , estate , canSetTag );
1319
-
1320
- /* Clear the INSERT's tuple and restore the saved map. */
1321
- if (mtstate -> mt_transition_capture )
1368
+ retry = !ExecCrossPartitionUpdate (mtstate , resultRelInfo , tupleid ,
1369
+ oldtuple , slot , planSlot ,
1370
+ epqstate , canSetTag ,
1371
+ & retry_slot , & inserted_tuple );
1372
+ if (retry )
1322
1373
{
1323
- mtstate -> mt_transition_capture -> tcs_original_insert_tuple = NULL ;
1324
- mtstate -> mt_transition_capture -> tcs_map = saved_tcs_map ;
1374
+ slot = retry_slot ;
1375
+ goto lreplace ;
1325
1376
}
1326
1377
1327
- return ret_slot ;
1378
+ return inserted_tuple ;
1328
1379
}
1329
1380
1330
1381
/*
0 commit comments