Refactor nbtree fastpath optimization.

author Peter Geoghegan <pg@bowt.ie>

Wed, 18 Mar 2020 21:42:49 +0000 (14:42 -0700)

committer Peter Geoghegan <pg@bowt.ie>

Wed, 18 Mar 2020 21:42:49 +0000 (14:42 -0700)
author Peter Geoghegan <pg@bowt.ie>
Wed, 18 Mar 2020 21:42:49 +0000 (14:42 -0700)
committer Peter Geoghegan <pg@bowt.ie>
Wed, 18 Mar 2020 21:42:49 +0000 (14:42 -0700)
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c

index 966c0bb532fe65558209f357c09b3fc2c673ebb1..bb19c3d26370fdbc59558a9faf9bce1638a74303 100644 (file)
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -29,6 +29,7 @@
  #define BTREE_FASTPATH_MIN_LEVEL   2
  
  
+static BTStack _bt_search_insert(Relation rel, BTInsertState insertstate);
  static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
                                       Relation heapRel,
                                       IndexUniqueCheck checkUnique, bool *is_unique,
@@ -84,9 +85,7 @@ _bt_doinsert(Relation rel, IndexTuple itup,
     bool        is_unique = false;
     BTInsertStateData insertstate;
     BTScanInsert itup_key;
-   BTStack     stack = NULL;
-   Buffer      buf;
-   bool        fastpath;
+   BTStack     stack;
     bool        checkingunique = (checkUnique != UNIQUE_CHECK_NO);
  
     /* we need an insertion scan key to do our search, so build one */
@@ -137,102 +136,32 @@ _bt_doinsert(Relation rel, IndexTuple itup,
     insertstate.buf = InvalidBuffer;
     insertstate.postingoff = 0;
  
+search:
+
     /*
-    * It's very common to have an index on an auto-incremented or
-    * monotonically increasing value. In such cases, every insertion happens
-    * towards the end of the index. We try to optimize that case by caching
-    * the right-most leaf of the index. If our cached block is still the
-    * rightmost leaf, has enough free space to accommodate a new entry and
-    * the insertion key is strictly greater than the first key in this page,
-    * then we can safely conclude that the new key will be inserted in the
-    * cached block. So we simply search within the cached block and insert
-    * the key at the appropriate location. We call it a fastpath.
-    *
-    * Testing has revealed, though, that the fastpath can result in increased
-    * contention on the exclusive-lock on the rightmost leaf page. So we
-    * conditionally check if the lock is available. If it's not available
-    * then we simply abandon the fastpath and take the regular path. This
-    * makes sense because unavailability of the lock also signals that some
-    * other backend might be concurrently inserting into the page, thus
-    * reducing our chances to finding an insertion place in this page.
+    * Find and lock the leaf page that the tuple should be added to by
+    * searching from the root page.  insertstate.buf will hold a buffer that
+    * is locked in exclusive mode afterwards.
      */
-top:
-   fastpath = false;
-   if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
-   {
-       Page        page;
-       BTPageOpaque lpageop;
-
-       /*
-        * Conditionally acquire exclusive lock on the buffer before doing any
-        * checks. If we don't get the lock, we simply follow slowpath. If we
-        * do get the lock, this ensures that the index state cannot change,
-        * as far as the rightmost part of the index is concerned.
-        */
-       buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
-
-       if (ConditionalLockBuffer(buf))
-       {
-           _bt_checkpage(rel, buf);
-
-           page = BufferGetPage(buf);
-
-           lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
-
-           /*
-            * Check if the page is still the rightmost leaf page, has enough
-            * free space to accommodate the new tuple, and the insertion scan
-            * key is strictly greater than the first key on the page.  Note
-            * that _bt_insert_parent() has an assertion that catches leaf
-            * page splits that somehow follow from a fastpath insert.
-            */
-           if (P_ISLEAF(lpageop) && P_RIGHTMOST(lpageop) &&
-               !P_IGNORE(lpageop) &&
-               PageGetFreeSpace(page) > insertstate.itemsz &&
-               PageGetMaxOffsetNumber(page) >= P_FIRSTDATAKEY(lpageop) &&
-               _bt_compare(rel, itup_key, page, P_FIRSTDATAKEY(lpageop)) > 0)
-           {
-               fastpath = true;
-           }
-           else
-           {
-               _bt_relbuf(rel, buf);
-
-               /*
-                * Something did not work out. Just forget about the cached
-                * block and follow the normal path. It might be set again if
-                * the conditions are favourable.
-                */
-               RelationSetTargetBlock(rel, InvalidBlockNumber);
-           }
-       }
-       else
-       {
-           ReleaseBuffer(buf);
-
-           /*
-            * If someone's holding a lock, it's likely to change anyway, so
-            * don't try again until we get an updated rightmost leaf.
-            */
-           RelationSetTargetBlock(rel, InvalidBlockNumber);
-       }
-   }
-
-   if (!fastpath)
-   {
-       /*
-        * Find the first page containing this key.  Buffer returned by
-        * _bt_search() is locked in exclusive mode.
-        */
-       stack = _bt_search(rel, itup_key, &buf, BT_WRITE, NULL);
-   }
-
-   insertstate.buf = buf;
-   buf = InvalidBuffer;        /* insertstate.buf now owns the buffer */
+   stack = _bt_search_insert(rel, &insertstate);
  
     /*
-    * If we're not allowing duplicates, make sure the key isn't already in
-    * the index.
+    * checkingunique inserts are not allowed to go ahead when two tuples with
+    * equal key attribute values would be visible to new MVCC snapshots once
+    * the xact commits.  Check for conflicts in the locked page/buffer (if
+    * needed) here.
+    *
+    * It might be necessary to check a page to the right in _bt_check_unique,
+    * though that should be very rare.  In practice the first page the value
+    * could be on (with scantid omitted) is almost always also the only page
+    * that a matching tuple might be found on.  This is due to the behavior
+    * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
+    * only be allowed to cross a page boundary when there is no candidate
+    * leaf page split point that avoids it.  Also, _bt_check_unique can use
+    * the leaf page high key to determine that there will be no duplicates on
+    * the right sibling without actually visiting it (it uses the high key in
+    * cases where the new item happens to belong at the far right of the leaf
+    * page).
      *
      * NOTE: obviously, _bt_check_unique can only detect keys that are already
      * in the index; so it cannot defend against concurrent insertions of the
@@ -246,7 +175,7 @@ top:
      * insertion.  (This requires some care in _bt_findinsertloc.)
      *
      * If we must wait for another xact, we release the lock while waiting,
-    * and then must start over completely.
+    * and then must perform a new search.
      *
      * For a partial uniqueness check, we don't wait for the other xact. Just
      * let the tuple in and return false for possibly non-unique, or true for
@@ -260,7 +189,7 @@ top:
         xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
                                  &is_unique, &speculativeToken);
  
-       if (TransactionIdIsValid(xwait))
+       if (unlikely(TransactionIdIsValid(xwait)))
         {
             /* Have to wait for the other guy ... */
             _bt_relbuf(rel, insertstate.buf);
@@ -279,7 +208,7 @@ top:
             /* start over... */
             if (stack)
                 _bt_freestack(stack);
-           goto top;
+           goto search;
         }
  
         /* Uniqueness is established -- restore heap tid as scantid */
@@ -325,6 +254,112 @@ top:
     return is_unique;
  }
  
+/*
+ * _bt_search_insert() -- _bt_search() wrapper for inserts
+ *
+ * Search the tree for a particular scankey, or more precisely for the first
+ * leaf page it could be on.  Try to make use of the fastpath optimization's
+ * rightmost leaf page cache before actually searching the tree from the root
+ * page, though.
+ *
+ * Return value is a stack of parent-page pointers, or NULL when the fastpath
+ * optimization is used (see notes about page splits below).  insertstate->buf
+ * is set to the leaf-page buffer, which is write-locked and pinned in all
+ * cases (if necessary by creating a new empty root page for caller).
+ *
+ * The fastpath optimization avoids most of the work of searching the tree
+ * repeatedly when a single backend inserts successive new tuples on the
+ * rightmost leaf page of an index.  A backend cache of the rightmost leaf
+ * page is maintained within _bt_insertonpg(), and used here.  The cache is
+ * invalidated here when the cached page fails any suitability check below,
+ * or when its buffer lock cannot be acquired without waiting.
+ *
+ * The optimization helps with indexes on an auto-incremented field.  It also
+ * helps with indexes on datetime columns, as well as indexes with lots of
+ * NULL values.  (NULLs usually get inserted in the rightmost page for single
+ * column indexes, since they usually get treated as coming after everything
+ * else in the key space.  Individual NULL tuples will generally be placed on
+ * the rightmost leaf page due to the influence of the heap TID column.)
+ *
+ * Note that we avoid applying the optimization when there is insufficient
+ * space on the rightmost page to fit caller's new item.  This is necessary
+ * because we'll need to return a real descent stack when a page split is
+ * expected (actually, caller can cope with a leaf page split that uses a NULL
+ * stack, but that's very slow and so must be avoided).  Note also that the
+ * fastpath optimization acquires the lock on the page conditionally as a way
+ * of reducing extra contention when there are concurrent insertions into the
+ * rightmost page (we give up if we'd have to wait for the lock).  We assume
+ * that it isn't useful to apply the optimization when there is contention,
+ * since each per-backend cache won't stay valid for long.
+ */
+static BTStack
+_bt_search_insert(Relation rel, BTInsertState insertstate)
+{
+   Assert(insertstate->buf == InvalidBuffer);
+   Assert(!insertstate->bounds_valid);
+   Assert(insertstate->postingoff == 0);
+
+   if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
+   {
+       /* Simulate a _bt_getbuf() call with conditional locking */
+       insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
+       if (ConditionalLockBuffer(insertstate->buf))
+       {
+           Page        page;
+           BTPageOpaque lpageop;
+
+           _bt_checkpage(rel, insertstate->buf);
+           page = BufferGetPage(insertstate->buf);
+           lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+           /*
+            * Check if the page is still the rightmost leaf page and has
+            * enough free space to accommodate the new tuple.  Also check
+            * that the insertion scan key is strictly greater than the first
+            * non-pivot tuple on the page.  (Note that we expect itup_key's
+            * scantid to be unset when our caller is a checkingunique
+            * inserter.)
+            */
+           if (P_RIGHTMOST(lpageop) &&
+               P_ISLEAF(lpageop) &&
+               !P_IGNORE(lpageop) &&
+               PageGetFreeSpace(page) > insertstate->itemsz &&
+               PageGetMaxOffsetNumber(page) >= P_HIKEY &&
+               _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
+           {
+               /*
+                * Caller can use the fastpath optimization because cached
+                * block is still rightmost leaf page, which can fit caller's
+                * new tuple without splitting.  Keep block in local cache for
+                * next insert, and have caller use NULL stack.
+                *
+                * Note that _bt_insert_parent() has an assertion that catches
+                * leaf page splits that somehow follow from a fastpath insert
+                * (it should only be passed a NULL stack when it must deal
+                * with a concurrent root page split, and never because a NULL
+                * stack was returned here).
+                */
+               return NULL;
+           }
+
+           /* Page unsuitable for caller, drop lock and pin */
+           _bt_relbuf(rel, insertstate->buf);
+       }
+       else
+       {
+           /* Lock unavailable, drop pin */
+           ReleaseBuffer(insertstate->buf);
+       }
+
+       /* Forget block, since cache doesn't appear to be useful */
+       RelationSetTargetBlock(rel, InvalidBlockNumber);
+   }
+
+   /* Cannot use optimization -- descend tree, return proper descent stack */
+   return _bt_search(rel, insertstate->itup_key, &insertstate->buf, BT_WRITE,
+                     NULL);
+}
+
  /*
   * _bt_check_unique() -- Check for violation of unique index constraint
   *
@@ -1177,10 +1212,12 @@ _bt_insertonpg(Relation rel,
     }
     else
     {
+       bool        isleaf = P_ISLEAF(lpageop);
+       bool        isrightmost = P_RIGHTMOST(lpageop);
         Buffer      metabuf = InvalidBuffer;
         Page        metapg = NULL;
         BTMetaPageData *metad = NULL;
-       BlockNumber cachedBlock = InvalidBlockNumber;
+       BlockNumber blockcache;
  
         /*
          * If we are doing this insert because we split a page that was the
@@ -1191,7 +1228,8 @@ _bt_insertonpg(Relation rel,
          */
         if (split_only_page)
         {
-           Assert(!P_ISLEAF(lpageop));
+           Assert(!isleaf);
+           Assert(BufferIsValid(cbuf));
  
             metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
             metapg = BufferGetPage(metabuf);
@@ -1238,15 +1276,6 @@ _bt_insertonpg(Relation rel,
             MarkBufferDirty(cbuf);
         }
  
-       /*
-        * Cache the block information if we just inserted into the rightmost
-        * leaf page of the index and it's not the root page.  For very small
-        * index where root is also the leaf, there is no point trying for any
-        * optimization.
-        */
-       if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
-           cachedBlock = BufferGetBlockNumber(buf);
-
         /* XLOG stuff */
         if (RelationNeedsWAL(rel))
         {
@@ -1260,7 +1289,7 @@ _bt_insertonpg(Relation rel,
             XLogBeginInsert();
             XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
  
-           if (P_ISLEAF(lpageop) && postingoff == 0)
+           if (isleaf && postingoff == 0)
             {
                 /* Simple leaf insert */
                 xlinfo = XLOG_BTREE_INSERT_LEAF;
@@ -1329,36 +1358,42 @@ _bt_insertonpg(Relation rel,
             recptr = XLogInsert(RM_BTREE_ID, xlinfo);
  
             if (BufferIsValid(metabuf))
-           {
                 PageSetLSN(metapg, recptr);
-           }
             if (BufferIsValid(cbuf))
-           {
                 PageSetLSN(BufferGetPage(cbuf), recptr);
-           }
  
             PageSetLSN(page, recptr);
         }
  
         END_CRIT_SECTION();
  
-       /* release buffers */
+       /* Release subsidiary buffers */
         if (BufferIsValid(metabuf))
             _bt_relbuf(rel, metabuf);
         if (BufferIsValid(cbuf))
             _bt_relbuf(rel, cbuf);
+
+       /*
+        * Cache the block number if this is the rightmost leaf page.  Cache
+        * may be used by a future inserter within _bt_search_insert().
+        */
+       blockcache = InvalidBlockNumber;
+       if (isrightmost && isleaf && !P_ISROOT(lpageop))
+           blockcache = BufferGetBlockNumber(buf);
+
+       /* Release buffer for insertion target block */
         _bt_relbuf(rel, buf);
  
         /*
-        * If we decided to cache the insertion target block, then set it now.
-        * But before that, check for the height of the tree and don't go for
-        * the optimization for small indexes. We defer that check to this
-        * point to ensure that we don't call _bt_getrootheight while holding
-        * lock on any other block.
+        * If we decided to cache the insertion target block before releasing
+        * its buffer lock, then cache it now.  Check the height of the tree
+        * first, though.  We don't go for the optimization with small
+        * indexes.  Defer final check to this point to ensure that we don't
+        * call _bt_getrootheight while holding a buffer lock.
          */
-       if (BlockNumberIsValid(cachedBlock) &&
+       if (BlockNumberIsValid(blockcache) &&
             _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
-           RelationSetTargetBlock(rel, cachedBlock);
+           RelationSetTargetBlock(rel, blockcache);
     }
  
     /* be tidy */
@@ -2054,9 +2089,9 @@ _bt_insert_parent(Relation rel,
              * This is more of a performance issue than a correctness issue.
              * The fastpath won't have a descent stack.  Using a phony stack
              * here works, but never rely on that.  The fastpath should be
-            * rejected when the rightmost leaf page will split, since it's
-            * faster to go through _bt_search() and get a stack in the usual
-            * way.
+            * rejected within _bt_search_insert() when the rightmost leaf
+            * page will split, since it's faster to go through _bt_search()
+            * and get a stack in the usual way.
              */
             Assert(!(P_ISLEAF(lpageop) &&
                      BlockNumberIsValid(RelationGetTargetBlock(rel))));
author	Peter Geoghegan <pg@bowt.ie>
	Wed, 18 Mar 2020 21:42:49 +0000 (14:42 -0700)
committer	Peter Geoghegan <pg@bowt.ie>
	Wed, 18 Mar 2020 21:42:49 +0000 (14:42 -0700)