8
8
*
9
9
*
10
10
* IDENTIFICATION
11
- * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.149 2007/02/06 14:55:11 tgl Exp $
11
+ * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.150 2007/02/08 05:05:53 momjian Exp $
12
12
*
13
13
*-------------------------------------------------------------------------
14
14
*/
@@ -733,6 +733,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
733
733
rightoff ;
734
734
OffsetNumber maxoff ;
735
735
OffsetNumber i ;
736
+ bool isroot ;
736
737
737
738
rbuf = _bt_getbuf (rel , P_NEW , BT_WRITE );
738
739
origpage = BufferGetPage (buf );
@@ -747,6 +748,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
747
748
lopaque = (BTPageOpaque ) PageGetSpecialPointer (leftpage );
748
749
ropaque = (BTPageOpaque ) PageGetSpecialPointer (rightpage );
749
750
751
+ isroot = P_ISROOT (oopaque );
752
+
750
753
/* if we're splitting this page, it won't be the root when we're done */
751
754
/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
752
755
lopaque -> btpo_flags = oopaque -> btpo_flags ;
@@ -921,61 +924,116 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
921
924
MarkBufferDirty (sbuf );
922
925
}
923
926
927
+ /*
928
+ * By here, the original data page has been split into two new halves, and
929
+ * these are correct. The algorithm requires that the left page never
930
+ * move during a split, so we copy the new left page back on top of the
931
+ * original. Note that this is not a waste of time, since we also require
932
+ * (in the page management code) that the center of a page always be
933
+ * clean, and the most efficient way to guarantee this is just to compact
934
+ * the data by reinserting it into a new left page. (XXX the latter
935
+ * comment is probably obsolete.)
936
+ *
937
+ * We need to do this before writing the WAL record, so that XLogInsert can
938
+ * WAL log an image of the page if necessary.
939
+ */
940
+ PageRestoreTempPage (leftpage , origpage );
941
+
924
942
/* XLOG stuff */
925
943
if (!rel -> rd_istemp )
926
944
{
927
945
xl_btree_split xlrec ;
928
946
uint8 xlinfo ;
929
947
XLogRecPtr recptr ;
930
- XLogRecData rdata [4 ];
948
+ XLogRecData rdata [6 ];
949
+ XLogRecData * lastrdata ;
931
950
932
- xlrec .target .node = rel -> rd_node ;
933
- ItemPointerSet (& (xlrec .target .tid ), itup_blkno , itup_off );
951
+ xlrec .node = rel -> rd_node ;
952
+ xlrec .leftsib = BufferGetBlockNumber (buf );
953
+ xlrec .rightsib = BufferGetBlockNumber (rbuf );
954
+ xlrec .firstright = firstright ;
955
+ xlrec .rnext = ropaque -> btpo_next ;
956
+ xlrec .level = lopaque -> btpo .level ;
957
+
958
+ rdata [0 ].data = (char * ) & xlrec ;
959
+ rdata [0 ].len = SizeOfBtreeSplit ;
960
+ rdata [0 ].buffer = InvalidBuffer ;
961
+
962
+ lastrdata = & rdata [0 ];
963
+
964
+ /* Log downlink on non-leaf pages. */
965
+ if (lopaque -> btpo .level > 0 )
966
+ {
967
+ lastrdata -> next = lastrdata + 1 ;
968
+ lastrdata ++ ;
969
+
970
+ lastrdata -> data = (char * ) & newitem -> t_tid .ip_blkid ;
971
+ lastrdata -> len = sizeof (BlockIdData );
972
+ lastrdata -> buffer = InvalidBuffer ;
973
+ }
974
+
975
+ /* Log the new item, if it was inserted on the left page. If it was
976
+ * put on the right page, we don't need to explicitly WAL log it
977
+ * because it's included with all the other items on the right page.
978
+ */
979
+ lastrdata -> next = lastrdata + 1 ;
980
+ lastrdata ++ ;
934
981
if (newitemonleft )
935
- xlrec .otherblk = BufferGetBlockNumber (rbuf );
982
+ {
983
+ lastrdata -> data = (char * ) & newitemoff ;
984
+ lastrdata -> len = sizeof (OffsetNumber );
985
+ lastrdata -> buffer = buf ; /* backup block 1 */
986
+ lastrdata -> buffer_std = true;
987
+
988
+ lastrdata -> next = lastrdata + 1 ;
989
+ lastrdata ++ ;
990
+ lastrdata -> data = (char * )newitem ;
991
+ lastrdata -> len = newitemsz ;
992
+ lastrdata -> buffer = buf ; /* backup block 1 */
993
+ lastrdata -> buffer_std = true;
994
+ }
936
995
else
937
- xlrec .otherblk = BufferGetBlockNumber (buf );
938
- xlrec .leftblk = lopaque -> btpo_prev ;
939
- xlrec .rightblk = ropaque -> btpo_next ;
940
- xlrec .level = lopaque -> btpo .level ;
996
+ {
997
+ lastrdata -> data = NULL ;
998
+ lastrdata -> len = 0 ;
999
+ lastrdata -> buffer = buf ; /* backup block 1 */
1000
+ lastrdata -> buffer_std = true;
1001
+ }
941
1002
942
- /*
1003
+ /* Log the contents of the right page in the format understood by
1004
+ * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
1005
+ * because we're going to recreate the whole page anyway.
1006
+ *
943
1007
* Direct access to page is not good but faster - we should implement
944
1008
* some new func in page API. Note we only store the tuples
945
1009
* themselves, knowing that the item pointers are in the same order
946
1010
* and can be reconstructed by scanning the tuples. See comments for
947
1011
* _bt_restore_page().
948
1012
*/
949
- xlrec . leftlen = (( PageHeader ) leftpage ) -> pd_special -
950
- (( PageHeader ) leftpage ) -> pd_upper ;
1013
+ lastrdata -> next = lastrdata + 1 ;
1014
+ lastrdata ++ ;
951
1015
952
- rdata [0 ].data = (char * ) & xlrec ;
953
- rdata [0 ].len = SizeOfBtreeSplit ;
954
- rdata [0 ].buffer = InvalidBuffer ;
955
- rdata [0 ].next = & (rdata [1 ]);
956
-
957
- rdata [1 ].data = (char * ) leftpage + ((PageHeader ) leftpage )-> pd_upper ;
958
- rdata [1 ].len = xlrec .leftlen ;
959
- rdata [1 ].buffer = InvalidBuffer ;
960
- rdata [1 ].next = & (rdata [2 ]);
961
-
962
- rdata [2 ].data = (char * ) rightpage + ((PageHeader ) rightpage )-> pd_upper ;
963
- rdata [2 ].len = ((PageHeader ) rightpage )-> pd_special -
1016
+ lastrdata -> data = (char * ) rightpage +
964
1017
((PageHeader ) rightpage )-> pd_upper ;
965
- rdata [2 ].buffer = InvalidBuffer ;
966
- rdata [2 ].next = NULL ;
1018
+ lastrdata -> len = ((PageHeader ) rightpage )-> pd_special -
1019
+ ((PageHeader ) rightpage )-> pd_upper ;
1020
+ lastrdata -> buffer = InvalidBuffer ;
967
1021
1022
+ /* Log the right sibling, because we've changed its prev-pointer. */
968
1023
if (!P_RIGHTMOST (ropaque ))
969
1024
{
970
- rdata [2 ].next = & (rdata [3 ]);
971
- rdata [3 ].data = NULL ;
972
- rdata [3 ].len = 0 ;
973
- rdata [3 ].buffer = sbuf ;
974
- rdata [3 ].buffer_std = true;
975
- rdata [3 ].next = NULL ;
1025
+ lastrdata -> next = lastrdata + 1 ;
1026
+ lastrdata ++ ;
1027
+
1028
+ lastrdata -> data = NULL ;
1029
+ lastrdata -> len = 0 ;
1030
+ lastrdata -> buffer = sbuf ; /* backup block 2 */
1031
+ lastrdata -> buffer_std = true;
976
1032
}
977
1033
978
- if (P_ISROOT (oopaque ))
1034
+ lastrdata -> next = NULL ;
1035
+
1036
+ if (isroot )
979
1037
xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L_ROOT : XLOG_BTREE_SPLIT_R_ROOT ;
980
1038
else
981
1039
xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R ;
@@ -993,24 +1051,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
993
1051
}
994
1052
}
995
1053
996
- /*
997
- * By here, the original data page has been split into two new halves, and
998
- * these are correct. The algorithm requires that the left page never
999
- * move during a split, so we copy the new left page back on top of the
1000
- * original. Note that this is not a waste of time, since we also require
1001
- * (in the page management code) that the center of a page always be
1002
- * clean, and the most efficient way to guarantee this is just to compact
1003
- * the data by reinserting it into a new left page. (XXX the latter
1004
- * comment is probably obsolete.)
1005
- *
1006
- * It's a bit weird that we don't fill in the left page till after writing
1007
- * the XLOG entry, but not really worth changing. Note that we use the
1008
- * origpage data (specifically its BTP_ROOT bit) while preparing the XLOG
1009
- * entry, so simply reshuffling the code won't do.
1010
- */
1011
-
1012
- PageRestoreTempPage (leftpage , origpage );
1013
-
1014
1054
END_CRIT_SECTION ();
1015
1055
1016
1056
/* release the old right sibling */
0 commit comments