
Commit f21668f

Add "split after new tuple" nbtree optimization.
Add additional heuristics to the algorithm for locating an optimal split
location. New logic identifies localized monotonically increasing values in
indexes with multiple columns. When this insertion pattern is detected, page
splits split just after the new item that provoked a page split (or apply
leaf fillfactor in the style of a rightmost page split). This optimization is
a variation of the long established leaf fillfactor optimization used during
rightmost page splits.

50/50 page splits are only appropriate with a pattern of truly random
insertions, where the average space utilization ends up at 65% - 70%. Without
this patch, affected cases have leaf pages that are no more than about 50%
full on average. Future insertions can never make use of the free space left
behind. With this patch, affected cases have leaf pages that are about 90%
full on average (assuming a fillfactor of 90).

Localized monotonically increasing insertion patterns are presumed to be
fairly common in real-world applications. There is a fair amount of anecdotal
evidence for this. Both pg_depend system catalog indexes
(pg_depend_depender_index and pg_depend_reference_index) are at least 20%
smaller after the regression tests are run when the optimization is
available. Furthermore, many of the indexes created by a fair use
implementation of TPC-C for Postgres are consistently about 40% smaller when
the optimization is available.

Note that even pg_upgrade'd v3 indexes make use of this optimization.

Author: Peter Geoghegan
Reviewed-By: Heikki Linnakangas
Discussion: https://postgr.es/m/CAH2-WzkpKeZJrXvR_p7VSY1b-s85E3gHyTbZQzR0BkJ5LrWF_A@mail.gmail.com
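As a rough illustration of the space-utilization figures in the commit message (not part of the commit itself; the page size, usable fraction, tuple width, and row count below are arbitrary assumptions), the following standalone C sketch compares how many leaf pages a fixed set of equisized tuples needs at about 50%, 67.5%, and 90% average fill:

/*
 * Back-of-the-envelope sketch only: page size, usable fraction, tuple size,
 * and row count are illustrative assumptions, not figures from the commit.
 */
#include <stdio.h>

int
main(void)
{
    const double usable_bytes = 8192 * 0.95;    /* leaf page minus headers, roughly */
    const double tuple_bytes = 20.0;            /* hypothetical fixed-width tuple + line pointer */
    const double ntuples = 10e6;                /* hypothetical 10 million index entries */
    const double utilizations[] = {0.50, 0.675, 0.90};

    for (int i = 0; i < 3; i++)
    {
        double  tuples_per_page = (usable_bytes * utilizations[i]) / tuple_bytes;

        printf("avg utilization %.1f%% -> ~%.0f leaf pages\n",
               utilizations[i] * 100.0, ntuples / tuples_per_page);
    }

    return 0;
}

Going from roughly 50% to roughly 90% average fill cuts the leaf page count by about 45%, which is broadly consistent with the index size reductions reported above.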
1 parent f7ff0ae commit f21668f

File tree

1 file changed: +211 -3 lines changed


src/backend/access/nbtree/nbtsplitloc.c

+211 -3
@@ -70,6 +70,9 @@ static void _bt_recsplitloc(FindSplitData *state,
 static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult,
                 bool usemult);
 static int _bt_splitcmp(const void *arg1, const void *arg2);
+static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
+                int leaffillfactor, bool *usemult);
+static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
 static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
                 bool *newitemonleft);
 static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
@@ -249,9 +252,10 @@ _bt_findsplitloc(Relation rel,
     * Start search for a split point among list of legal split points. Give
     * primary consideration to equalizing available free space in each half
     * of the split initially (start with default strategy), while applying
-    * rightmost optimization where appropriate. Either of the two other
-    * fallback strategies may be required for cases with a large number of
-    * duplicates around the original/space-optimal split point.
+    * rightmost and split-after-new-item optimizations where appropriate.
+    * Either of the two other fallback strategies may be required for cases
+    * with a large number of duplicates around the original/space-optimal
+    * split point.
     *
     * Default strategy gives some weight to suffix truncation in deciding a
     * split point on leaf pages. It attempts to select a split point where a
@@ -273,6 +277,44 @@ _bt_findsplitloc(Relation rel,
         usemult = true;
         fillfactormult = leaffillfactor / 100.0;
     }
+    else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
+    {
+        /*
+         * New item inserted at rightmost point among a localized grouping on
+         * a leaf page -- apply "split after new item" optimization, either by
+         * applying leaf fillfactor multiplier, or by choosing the exact split
+         * point that leaves the new item as last on the left. (usemult is set
+         * for us.)
+         */
+        if (usemult)
+        {
+            /* fillfactormult should be set based on leaf fillfactor */
+            fillfactormult = leaffillfactor / 100.0;
+        }
+        else
+        {
+            /* find precise split point after newitemoff */
+            for (int i = 0; i < state.nsplits; i++)
+            {
+                SplitPoint *split = state.splits + i;
+
+                if (split->newitemonleft &&
+                    newitemoff == split->firstoldonright)
+                {
+                    pfree(state.splits);
+                    *newitemonleft = true;
+                    return newitemoff;
+                }
+            }
+
+            /*
+             * Cannot legally split after newitemoff; proceed with split
+             * without using fillfactor multiplier. This is defensive, and
+             * should never be needed in practice.
+             */
+            fillfactormult = 0.50;
+        }
+    }
     else
     {
         /* Other leaf page. 50:50 page split. */
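The caller-side branch in the hunk above either falls back to the leaf fillfactor multiplier or walks the array of legal candidate split points looking for the one that leaves the new tuple as the very last tuple on the left. Below is a simplified, standalone sketch of that selection step; CandidateSplit and find_split_after_new_item are hypothetical stand-ins for nbtree's SplitPoint array and are not part of the commit:

#include <stdbool.h>
#include <stdio.h>

typedef unsigned short OffsetNumber;    /* stand-in for the Postgres typedef */

/* Hypothetical reduced stand-in for nbtree's SplitPoint */
typedef struct CandidateSplit
{
    bool        newitemonleft;      /* would the new tuple go on the left half? */
    OffsetNumber firstoldonright;   /* first pre-existing tuple on the right half */
} CandidateSplit;

/*
 * Return the index of the candidate that leaves the new tuple as the last
 * tuple on the left, or -1 if no legal candidate does.  (The commit's caller
 * falls back to fillfactormult = 0.50 in the -1 case.)
 */
static int
find_split_after_new_item(const CandidateSplit *splits, int nsplits,
                          OffsetNumber newitemoff)
{
    for (int i = 0; i < nsplits; i++)
    {
        if (splits[i].newitemonleft && splits[i].firstoldonright == newitemoff)
            return i;
    }

    return -1;
}

int
main(void)
{
    CandidateSplit splits[] = {
        {false, 5},             /* new tuple would go on the right half */
        {true, 6},              /* legal, but splits earlier than newitemoff */
        {true, 8},              /* new tuple (newitemoff == 8) is last on the left */
    };

    printf("chosen candidate: %d\n",
           find_split_after_new_item(splits, 3, (OffsetNumber) 8));
    return 0;
}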
@@ -519,6 +561,172 @@ _bt_splitcmp(const void *arg1, const void *arg2)
     return 0;
 }
 
+/*
+ * Subroutine to determine whether or not a non-rightmost leaf page should be
+ * split immediately after the would-be original page offset for the
+ * new/incoming tuple (or should have leaf fillfactor applied when new item is
+ * to the right on original page).  This is appropriate when there is a
+ * pattern of localized monotonically increasing insertions into a composite
+ * index, where leading attribute values form local groupings, and we
+ * anticipate further insertions of the same/current grouping (new item's
+ * grouping) in the near future.  This can be thought of as a variation on
+ * applying leaf fillfactor during rightmost leaf page splits, since cases
+ * that benefit will converge on packing leaf pages leaffillfactor% full over
+ * time.
+ *
+ * We may leave extra free space remaining on the rightmost page of a "most
+ * significant column" grouping of tuples if that grouping never ends up
+ * having future insertions that use the free space.  That effect is
+ * self-limiting; a future grouping that becomes the "nearest on the right"
+ * grouping of the affected grouping usually puts the extra free space to good
+ * use.
+ *
+ * Caller uses optimization when routine returns true, though the exact action
+ * taken by caller varies.  Caller uses original leaf page fillfactor in
+ * standard way rather than using the new item offset directly when *usemult
+ * was also set to true here.  Otherwise, caller applies optimization by
+ * locating the legal split point that makes the new tuple the very last tuple
+ * on the left side of the split.
+ */
+static bool
+_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
+                    int leaffillfactor, bool *usemult)
+{
+    int16       nkeyatts;
+    ItemId      itemid;
+    IndexTuple  tup;
+    int         keepnatts;
+
+    Assert(state->is_leaf && !state->is_rightmost);
+
+    nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);
+
+    /* Single key indexes not considered here */
+    if (nkeyatts == 1)
+        return false;
+
+    /* Ascending insertion pattern never inferred when new item is first */
+    if (state->newitemoff == P_FIRSTKEY)
+        return false;
+
+    /*
+     * Only apply optimization on pages with equisized tuples, since ordinal
+     * keys are likely to be fixed-width.  Testing if the new tuple is
+     * variable width directly might also work, but that fails to apply the
+     * optimization to indexes with a numeric_ops attribute.
+     *
+     * Conclude that page has equisized tuples when the new item is the same
+     * width as the smallest item observed during pass over page, and other
+     * non-pivot tuples must be the same width as well.  (Note that the
+     * possibly-truncated existing high key isn't counted in
+     * olddataitemstotal, and must be subtracted from maxoff.)
+     */
+    if (state->newitemsz != state->minfirstrightsz)
+        return false;
+    if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal)
+        return false;
+
+    /*
+     * Avoid applying optimization when tuples are wider than a tuple
+     * consisting of two non-NULL int8/int64 attributes (or four non-NULL
+     * int4/int32 attributes)
+     */
+    if (state->newitemsz >
+        MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) +
+        sizeof(ItemIdData))
+        return false;
+
+    /*
+     * At least the first attribute's value must be equal to the corresponding
+     * value in previous tuple to apply optimization.  New item cannot be a
+     * duplicate, either.
+     *
+     * Handle case where new item is to the right of all items on the existing
+     * page.  This is suggestive of monotonically increasing insertions in
+     * itself, so the "heap TID adjacency" test is not applied here.
+     */
+    if (state->newitemoff > maxoff)
+    {
+        itemid = PageGetItemId(state->page, maxoff);
+        tup = (IndexTuple) PageGetItem(state->page, itemid);
+        keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
+
+        if (keepnatts > 1 && keepnatts <= nkeyatts)
+        {
+            *usemult = true;
+            return true;
+        }
+
+        return false;
+    }
+
+    /*
+     * "Low cardinality leading column, high cardinality suffix column"
+     * indexes with a random insertion pattern (e.g., an index with a boolean
+     * column, such as an index on '(book_is_in_print, book_isbn)') present us
+     * with a risk of consistently misapplying the optimization.  We're
+     * willing to accept very occasional misapplication of the optimization,
+     * provided the cases where we get it wrong are rare and self-limiting.
+     *
+     * Heap TID adjacency strongly suggests that the item just to the left was
+     * inserted very recently, which limits overapplication of the
+     * optimization.  Besides, all inappropriate cases triggered here will
+     * still split in the middle of the page on average.
+     */
+    itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff));
+    tup = (IndexTuple) PageGetItem(state->page, itemid);
+    /* Do cheaper test first */
+    if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
+        return false;
+    /* Check same conditions as rightmost item case, too */
+    keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);
+
+    if (keepnatts > 1 && keepnatts <= nkeyatts)
+    {
+        double      interp = (double) state->newitemoff / ((double) maxoff + 1);
+        double      leaffillfactormult = (double) leaffillfactor / 100.0;
+
+        /*
+         * Don't allow caller to split after a new item when it will result in
+         * a split point to the right of the point that a leaf fillfactor
+         * split would use -- have caller apply leaf fillfactor instead
+         */
+        *usemult = interp > leaffillfactormult;
+
+        return true;
+    }
+
+    return false;
+}
+
+/*
+ * Subroutine for determining if two heap TIDS are "adjacent".
+ *
+ * Adjacent means that the high TID is very likely to have been inserted into
+ * heap relation immediately after the low TID, probably during the current
+ * transaction.
+ */
+static bool
+_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
+{
+    BlockNumber lowblk,
+                highblk;
+
+    lowblk = ItemPointerGetBlockNumber(lowhtid);
+    highblk = ItemPointerGetBlockNumber(highhtid);
+
+    /* Make optimistic assumption of adjacency when heap blocks match */
+    if (lowblk == highblk)
+        return true;
+
+    /* When heap block one up, second offset should be FirstOffsetNumber */
+    if (lowblk + 1 == highblk &&
+        ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
+        return true;
+
+    return false;
+}
+
 /*
  * Subroutine to find the "best" split point among candidate split points.
  * The best split point is the split point with the lowest penalty among split
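For reference, the heap-TID adjacency rule added above can be exercised in isolation. The sketch below restates it with plain integers; HeapTid, adjacent_htid, and FIRST_OFFSET are illustrative stand-ins for ItemPointerData, _bt_adjacenthtid, and FirstOffsetNumber rather than the real Postgres definitions:

/*
 * Standalone sketch of the heap-TID adjacency rule from _bt_adjacenthtid,
 * using plain integers instead of Postgres' ItemPointerData.  Names here are
 * illustrative only.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct HeapTid
{
    uint32_t    block;          /* heap block number */
    uint16_t    offset;         /* 1-based line pointer offset within block */
} HeapTid;

#define FIRST_OFFSET 1          /* counterpart of FirstOffsetNumber */

static bool
adjacent_htid(HeapTid low, HeapTid high)
{
    /* Same heap block: optimistically treat the TIDs as adjacent */
    if (low.block == high.block)
        return true;

    /* Next heap block, and the high TID is its first line pointer */
    if (low.block + 1 == high.block && high.offset == FIRST_OFFSET)
        return true;

    return false;
}

int
main(void)
{
    HeapTid a = {10, 200};
    HeapTid same_block = {10, 201};
    HeapTid next_block_first = {11, 1};
    HeapTid next_block_later = {11, 2};
    HeapTid skipped_block = {12, 1};

    assert(adjacent_htid(a, same_block));           /* same heap block */
    assert(adjacent_htid(a, next_block_first));     /* next block, first line pointer */
    assert(!adjacent_htid(a, next_block_later));    /* next block, but not its first slot */
    assert(!adjacent_htid(a, skipped_block));       /* a whole block was skipped */

    return 0;
}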
