26
26
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
27
27
* Portions Copyright (c) 1994, Regents of the University of California
28
28
*
29
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.47 2008/08/01 13:16:08 alvherre Exp $
29
+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.48 2008/10/20 19:18:18 alvherre Exp $
30
30
*
31
31
*-------------------------------------------------------------------------
32
32
*/
@@ -80,32 +80,182 @@ static int ZeroCLOGPage(int pageno, bool writeXlog);
80
80
static bool CLOGPagePrecedes (int page1 , int page2 );
81
81
static void WriteZeroPageXlogRec (int pageno );
82
82
static void WriteTruncateXlogRec (int pageno );
83
+ static void TransactionIdSetPageStatus (TransactionId xid , int nsubxids ,
84
+ TransactionId * subxids , XidStatus status ,
85
+ XLogRecPtr lsn , int pageno );
86
+ static void TransactionIdSetStatusBit (TransactionId xid , XidStatus status ,
87
+ XLogRecPtr lsn , int slotno );
88
+ static void set_status_by_pages (int nsubxids , TransactionId * subxids ,
89
+ XidStatus status , XLogRecPtr lsn );
83
90
84
91
85
92
/*
86
- * Record the final state of a transaction in the commit log.
93
+ * TransactionIdSetTreeStatus
94
+ *
95
+ * Record the final state of transaction entries in the commit log for
96
+ * a transaction and its subtransaction tree. Take care to ensure this is
97
+ * efficient, and as atomic as possible.
98
+ *
99
+ * xid is a single xid to set status for. This will typically be
100
+ * the top level transactionid for a top level commit or abort. It can
101
+ * also be a subtransaction when we record transaction aborts.
102
+ *
103
+ * subxids is an array of xids of length nsubxids, representing subtransactions
104
+ * in the tree of xid. In various cases nsubxids may be zero.
87
105
*
88
106
* lsn must be the WAL location of the commit record when recording an async
89
107
* commit. For a synchronous commit it can be InvalidXLogRecPtr, since the
90
108
* caller guarantees the commit record is already flushed in that case. It
91
109
* should be InvalidXLogRecPtr for abort cases, too.
92
110
*
111
+ * In the commit case, atomicity is limited by whether all the subxids are in
112
+ * the same CLOG page as xid. If they all are, then the lock will be grabbed
113
+ * only once, and the status will be set to committed directly. Otherwise
114
+ * we must
115
+ * 1. set sub-committed all subxids that are not on the same page as the
116
+ * main xid
117
+ * 2. atomically set committed the main xid and the subxids on the same page
118
+ * 3. go over the first bunch again and set them committed
119
+ * Note that as far as concurrent checkers are concerned, main transaction
120
+ * commit as a whole is still atomic.
121
+ *
122
+ * Example:
123
+ * TransactionId t commits and has subxids t1, t2, t3, t4
124
+ * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3
125
+ * 1. update pages2-3:
126
+ * page2: set t2,t3 as sub-committed
127
+ * page3: set t4 as sub-committed
128
+ * 2. update page1:
129
+ * set t1 as sub-committed,
130
+ * then set t as committed,
131
+ * then set t1 as committed
132
+ * 3. update pages2-3:
133
+ * page2: set t2,t3 as committed
134
+ * page3: set t4 as committed
135
+ *
93
136
* NB: this is a low-level routine and is NOT the preferred entry point
94
- * for most uses; TransactionLogUpdate() in transam.c is the intended caller.
137
+ * for most uses; functions in transam.c are the intended callers.
138
+ *
139
+ * XXX Think about issuing FADVISE_WILLNEED on pages that we will need,
140
+ * but aren't yet in cache, as well as hinting pages not to fall out of
141
+ * cache yet.
95
142
*/
96
143
void
97
- TransactionIdSetStatus (TransactionId xid , XidStatus status , XLogRecPtr lsn )
144
+ TransactionIdSetTreeStatus (TransactionId xid , int nsubxids ,
145
+ TransactionId * subxids , XidStatus status , XLogRecPtr lsn )
146
+ {
147
+ int pageno = TransactionIdToPage (xid ); /* get page of parent */
148
+ int i ;
149
+
150
+ Assert (status == TRANSACTION_STATUS_COMMITTED ||
151
+ status == TRANSACTION_STATUS_ABORTED );
152
+
153
+ /*
154
+ * See how many subxids, if any, are on the same page as the parent, if any.
155
+ */
156
+ for (i = 0 ; i < nsubxids ; i ++ )
157
+ {
158
+ if (TransactionIdToPage (subxids [i ]) != pageno )
159
+ break ;
160
+ }
161
+
162
+ /*
163
+ * Do all items fit on a single page?
164
+ */
165
+ if (i == nsubxids )
166
+ {
167
+ /*
168
+ * Set the parent and all subtransactions in a single call
169
+ */
170
+ TransactionIdSetPageStatus (xid , nsubxids , subxids , status , lsn ,
171
+ pageno );
172
+ }
173
+ else
174
+ {
175
+ int nsubxids_on_first_page = i ;
176
+
177
+ /*
178
+ * If this is a commit then we care about doing this correctly (i.e.
179
+ * using the subcommitted intermediate status). By here, we know we're
180
+ * updating more than one page of clog, so we must mark entries that
181
+ * are *not* on the first page so that they show as subcommitted before
182
+ * we then return to update the status to fully committed.
183
+ *
184
+ * To avoid touching the first page twice, skip marking subcommitted
185
+ * for the subxids on that first page.
186
+ */
187
+ if (status == TRANSACTION_STATUS_COMMITTED )
188
+ set_status_by_pages (nsubxids - nsubxids_on_first_page ,
189
+ subxids + nsubxids_on_first_page ,
190
+ TRANSACTION_STATUS_SUB_COMMITTED , lsn );
191
+
192
+ /*
193
+ * Now set the parent and subtransactions on same page as the parent,
194
+ * if any
195
+ */
196
+ pageno = TransactionIdToPage (xid );
197
+ TransactionIdSetPageStatus (xid , nsubxids_on_first_page , subxids , status ,
198
+ lsn , pageno );
199
+
200
+ /*
201
+ * Now work through the rest of the subxids one clog page at a time,
202
+ * starting from the second page onwards, like we did above.
203
+ */
204
+ set_status_by_pages (nsubxids - nsubxids_on_first_page ,
205
+ subxids + nsubxids_on_first_page ,
206
+ status , lsn );
207
+ }
208
+ }
209
+
210
+ /*
211
+ * Helper for TransactionIdSetTreeStatus: set the status for a bunch of
212
+ * transactions, chunking in the separate CLOG pages involved. We never
213
+ * pass the whole transaction tree to this function, only subtransactions
214
+ * that are on different pages to the top level transaction id.
215
+ */
216
+ static void
217
+ set_status_by_pages (int nsubxids , TransactionId * subxids ,
218
+ XidStatus status , XLogRecPtr lsn )
219
+ {
220
+ int pageno = TransactionIdToPage (subxids [0 ]);
221
+ int offset = 0 ;
222
+ int i = 0 ;
223
+
224
+ while (i < nsubxids )
225
+ {
226
+ int num_on_page = 0 ;
227
+
228
+ while (TransactionIdToPage (subxids [i ]) == pageno && i < nsubxids )
229
+ {
230
+ num_on_page ++ ;
231
+ i ++ ;
232
+ }
233
+
234
+ TransactionIdSetPageStatus (InvalidTransactionId ,
235
+ num_on_page , subxids + offset ,
236
+ status , lsn , pageno );
237
+ offset = i ;
238
+ pageno = TransactionIdToPage (subxids [offset ]);
239
+ }
240
+ }
241
+
242
+ /*
243
+ * Record the final state of transaction entries in the commit log for
244
+ * all entries on a single page. Atomic only on this page.
245
+ *
246
+ * Otherwise API is same as TransactionIdSetTreeStatus()
247
+ */
248
+ static void
249
+ TransactionIdSetPageStatus (TransactionId xid , int nsubxids ,
250
+ TransactionId * subxids , XidStatus status ,
251
+ XLogRecPtr lsn , int pageno )
98
252
{
99
- int pageno = TransactionIdToPage (xid );
100
- int byteno = TransactionIdToByte (xid );
101
- int bshift = TransactionIdToBIndex (xid ) * CLOG_BITS_PER_XACT ;
102
253
int slotno ;
103
- char * byteptr ;
104
- char byteval ;
254
+ int i ;
105
255
106
256
Assert (status == TRANSACTION_STATUS_COMMITTED ||
107
257
status == TRANSACTION_STATUS_ABORTED ||
108
- status == TRANSACTION_STATUS_SUB_COMMITTED );
258
+ ( status == TRANSACTION_STATUS_SUB_COMMITTED && ! TransactionIdIsValid ( xid )) );
109
259
110
260
LWLockAcquire (CLogControlLock , LW_EXCLUSIVE );
111
261
@@ -116,9 +266,62 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
116
266
* mustn't let it reach disk until we've done the appropriate WAL flush.
117
267
* But when lsn is invalid, it's OK to scribble on a page while it is
118
268
* write-busy, since we don't care if the update reaches disk sooner than
119
- * we think. Hence, pass write_ok = XLogRecPtrIsInvalid(lsn).
269
+ * we think.
120
270
*/
121
271
slotno = SimpleLruReadPage (ClogCtl , pageno , XLogRecPtrIsInvalid (lsn ), xid );
272
+
273
+ /*
274
+ * Set the main transaction id, if any.
275
+ *
276
+ * If we update more than one xid on this page while it is being written
277
+ * out, we might find that some of the bits go to disk and others don't.
278
+ * If we are updating commits on the page with the top-level xid, that could
279
+ * break atomicity, so we subcommit the subxids first before we mark the
280
+ * top-level commit.
281
+ */
282
+ if (TransactionIdIsValid (xid ))
283
+ {
284
+ /* Subtransactions first, if needed ... */
285
+ if (status == TRANSACTION_STATUS_COMMITTED )
286
+ {
287
+ for (i = 0 ; i < nsubxids ; i ++ )
288
+ {
289
+ Assert (ClogCtl -> shared -> page_number [slotno ] == TransactionIdToPage (subxids [i ]));
290
+ TransactionIdSetStatusBit (subxids [i ],
291
+ TRANSACTION_STATUS_SUB_COMMITTED ,
292
+ lsn , slotno );
293
+ }
294
+ }
295
+
296
+ /* ... then the main transaction */
297
+ TransactionIdSetStatusBit (xid , status , lsn , slotno );
298
+ }
299
+
300
+ /* Set the subtransactions */
301
+ for (i = 0 ; i < nsubxids ; i ++ )
302
+ {
303
+ Assert (ClogCtl -> shared -> page_number [slotno ] == TransactionIdToPage (subxids [i ]));
304
+ TransactionIdSetStatusBit (subxids [i ], status , lsn , slotno );
305
+ }
306
+
307
+ ClogCtl -> shared -> page_dirty [slotno ] = true;
308
+
309
+ LWLockRelease (CLogControlLock );
310
+ }
311
+
312
+ /*
313
+ * Sets the commit status of a single transaction.
314
+ *
315
+ * Must be called with CLogControlLock held
316
+ */
317
+ static void
318
+ TransactionIdSetStatusBit (TransactionId xid , XidStatus status , XLogRecPtr lsn , int slotno )
319
+ {
320
+ int byteno = TransactionIdToByte (xid );
321
+ int bshift = TransactionIdToBIndex (xid ) * CLOG_BITS_PER_XACT ;
322
+ char * byteptr ;
323
+ char byteval ;
324
+
122
325
byteptr = ClogCtl -> shared -> page_buffer [slotno ] + byteno ;
123
326
124
327
/* Current state should be 0, subcommitted or target state */
@@ -132,8 +335,6 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
132
335
byteval |= (status << bshift );
133
336
* byteptr = byteval ;
134
337
135
- ClogCtl -> shared -> page_dirty [slotno ] = true;
136
-
137
338
/*
138
339
* Update the group LSN if the transaction completion LSN is higher.
139
340
*
@@ -149,8 +350,6 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
149
350
if (XLByteLT (ClogCtl -> shared -> group_lsn [lsnindex ], lsn ))
150
351
ClogCtl -> shared -> group_lsn [lsnindex ] = lsn ;
151
352
}
152
-
153
- LWLockRelease (CLogControlLock );
154
353
}
155
354
156
355
/*
0 commit comments