Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 6db9928

Browse files
committed
Prevent excess SimpleLruTruncate() deletion.
Every core SLRU wraps around. With the exception of pg_notify, the wrap point can fall in the middle of a page. Account for this in the PagePrecedes callback specification and in SimpleLruTruncate()'s use of said callback. Update each callback implementation to fit the new specification. This changes SerialPagePrecedesLogically() from the style of asyncQueuePagePrecedes() to the style of CLOGPagePrecedes(). (Whereas pg_clog and pg_serial share a key space, pg_serial is nothing like pg_notify.) The bug fixed here has the same symptoms and user followup steps as 592a589. Back-patch to 9.5 (all supported versions). Reviewed by Andrey Borodin and (in earlier versions) by Tom Lane. Discussion: https://postgr.es/m/20190202083822.GC32531@gust.leadboat.com
1 parent c95765f commit 6db9928

File tree

8 files changed

+314
-80
lines changed

8 files changed

+314
-80
lines changed

src/backend/access/transam/clog.c

+19-8
Original file line numberDiff line numberDiff line change
@@ -694,6 +694,7 @@ CLOGShmemInit(void)
694694
SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
695695
XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
696696
SYNC_HANDLER_CLOG);
697+
SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
697698
}
698699

699700
/*
@@ -912,13 +913,22 @@ TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid)
912913

913914

914915
/*
915-
* Decide which of two CLOG page numbers is "older" for truncation purposes.
916+
* Decide whether a CLOG page number is "older" for truncation purposes.
916917
*
917918
* We need to use comparison of TransactionIds here in order to do the right
918-
* thing with wraparound XID arithmetic. However, if we are asked about
919-
* page number zero, we don't want to hand InvalidTransactionId to
920-
* TransactionIdPrecedes: it'll get weird about permanent xact IDs. So,
921-
* offset both xids by FirstNormalTransactionId to avoid that.
919+
* thing with wraparound XID arithmetic. However, TransactionIdPrecedes()
920+
* would get weird about permanent xact IDs. So, offset both such that xid1,
921+
* xid2, and xid2 + CLOG_XACTS_PER_PAGE - 1 are all normal XIDs; this offset
922+
* is relevant to page 0 and to the page preceding page 0.
923+
*
924+
* The page containing oldestXact-2^31 is the important edge case. The
925+
* portion of that page equaling or following oldestXact-2^31 is expendable,
926+
* but the portion preceding oldestXact-2^31 is not. When oldestXact-2^31 is
927+
* the first XID of a page and segment, the entire page and segment is
928+
* expendable, and we could truncate the segment. Recognizing that case would
929+
* require making oldestXact, not just the page containing oldestXact,
930+
* available to this callback. The benefit would be rare and small, so we
931+
* don't optimize that edge case.
922932
*/
923933
static bool
924934
CLOGPagePrecedes(int page1, int page2)
@@ -927,11 +937,12 @@ CLOGPagePrecedes(int page1, int page2)
927937
TransactionId xid2;
928938

929939
xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE;
930-
xid1 += FirstNormalTransactionId;
940+
xid1 += FirstNormalTransactionId + 1;
931941
xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE;
932-
xid2 += FirstNormalTransactionId;
942+
xid2 += FirstNormalTransactionId + 1;
933943

934-
return TransactionIdPrecedes(xid1, xid2);
944+
return (TransactionIdPrecedes(xid1, xid2) &&
945+
TransactionIdPrecedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1));
935946
}
936947

937948

src/backend/access/transam/commit_ts.c

+25-10
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,7 @@ CommitTsShmemInit(void)
557557
CommitTsSLRULock, "pg_commit_ts",
558558
LWTRANCHE_COMMITTS_BUFFER,
559559
SYNC_HANDLER_COMMIT_TS);
560+
SlruPagePrecedesUnitTests(CommitTsCtl, COMMIT_TS_XACTS_PER_PAGE);
560561

561562
commitTsShared = ShmemInitStruct("CommitTs shared",
562563
sizeof(CommitTimestampShared),
@@ -927,14 +928,27 @@ AdvanceOldestCommitTsXid(TransactionId oldestXact)
927928

928929

929930
/*
930-
* Decide which of two commitTS page numbers is "older" for truncation
931-
* purposes.
931+
* Decide whether a commitTS page number is "older" for truncation purposes.
932+
* Analogous to CLOGPagePrecedes().
932933
*
933-
* We need to use comparison of TransactionIds here in order to do the right
934-
* thing with wraparound XID arithmetic. However, if we are asked about
935-
* page number zero, we don't want to hand InvalidTransactionId to
936-
* TransactionIdPrecedes: it'll get weird about permanent xact IDs. So,
937-
* offset both xids by FirstNormalTransactionId to avoid that.
934+
* At default BLCKSZ, (1 << 31) % COMMIT_TS_XACTS_PER_PAGE == 128. This
935+
* introduces differences compared to CLOG and the other SLRUs having (1 <<
936+
* 31) % per_page == 0. This function never tests exactly
937+
* TransactionIdPrecedes(x-2^31, x). When the system reaches xidStopLimit,
938+
* there are two possible counts of page boundaries between oldestXact and the
939+
* latest XID assigned, depending on whether oldestXact is within the first
940+
* 128 entries of its page. Since this function doesn't know the location of
941+
* oldestXact within page2, it returns false for one page that actually is
942+
* expendable. This is a wider (yet still negligible) version of the
943+
* truncation opportunity that CLOGPagePrecedes() cannot recognize.
944+
*
945+
* For the sake of a worked example, number entries with decimal values such
946+
* that page1==1 entries range from 1.0 to 1.999. Let N+0.15 be the number of
947+
* pages that 2^31 entries will span (N is an integer). If oldestXact=N+2.1,
948+
* then the final safe XID assignment leaves newestXact=1.95. We keep page 2,
949+
* because entry=2.85 is the border that toggles whether entries precede the
950+
* last entry of the oldestXact page. While page 2 is expendable at
951+
* oldestXact=N+2.1, it would be precious at oldestXact=N+2.9.
938952
*/
939953
static bool
940954
CommitTsPagePrecedes(int page1, int page2)
@@ -943,11 +957,12 @@ CommitTsPagePrecedes(int page1, int page2)
943957
TransactionId xid2;
944958

945959
xid1 = ((TransactionId) page1) * COMMIT_TS_XACTS_PER_PAGE;
946-
xid1 += FirstNormalTransactionId;
960+
xid1 += FirstNormalTransactionId + 1;
947961
xid2 = ((TransactionId) page2) * COMMIT_TS_XACTS_PER_PAGE;
948-
xid2 += FirstNormalTransactionId;
962+
xid2 += FirstNormalTransactionId + 1;
949963

950-
return TransactionIdPrecedes(xid1, xid2);
964+
return (TransactionIdPrecedes(xid1, xid2) &&
965+
TransactionIdPrecedes(xid1, xid2 + COMMIT_TS_XACTS_PER_PAGE - 1));
951966
}
952967

953968

src/backend/access/transam/multixact.c

+23-13
Original file line numberDiff line numberDiff line change
@@ -1852,11 +1852,13 @@ MultiXactShmemInit(void)
18521852
MultiXactOffsetSLRULock, "pg_multixact/offsets",
18531853
LWTRANCHE_MULTIXACTOFFSET_BUFFER,
18541854
SYNC_HANDLER_MULTIXACT_OFFSET);
1855+
SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
18551856
SimpleLruInit(MultiXactMemberCtl,
18561857
"MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
18571858
MultiXactMemberSLRULock, "pg_multixact/members",
18581859
LWTRANCHE_MULTIXACTMEMBER_BUFFER,
18591860
SYNC_HANDLER_MULTIXACT_MEMBER);
1861+
/* doesn't call SimpleLruTruncate() or meet criteria for unit tests */
18601862

18611863
/* Initialize our shared state struct */
18621864
MultiXactState = ShmemInitStruct("Shared MultiXact State",
@@ -2982,6 +2984,14 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
29822984
* truncate the members SLRU. So we first scan the directory to determine
29832985
* the earliest offsets page number that we can read without error.
29842986
*
2987+
* When nextMXact is less than one segment away from multiWrapLimit,
2988+
* SlruScanDirCbFindEarliest can find some early segment other than the
2989+
* actual earliest. (MultiXactOffsetPagePrecedes(EARLIEST, LATEST)
2990+
* returns false, because not all pairs of entries have the same answer.)
2991+
* That can also arise when an earlier truncation attempt failed unlink()
2992+
* or returned early from this function. The only consequence is
2993+
* returning early, which wastes space that we could have liberated.
2994+
*
29852995
* NB: It's also possible that the page that oldestMulti is on has already
29862996
* been truncated away, and we crashed before updating oldestMulti.
29872997
*/
@@ -3096,15 +3106,11 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
30963106
}
30973107

30983108
/*
3099-
* Decide which of two MultiXactOffset page numbers is "older" for truncation
3100-
* purposes.
3109+
* Decide whether a MultiXactOffset page number is "older" for truncation
3110+
* purposes. Analogous to CLOGPagePrecedes().
31013111
*
3102-
* We need to use comparison of MultiXactId here in order to do the right
3103-
* thing with wraparound. However, if we are asked about page number zero, we
3104-
* don't want to hand InvalidMultiXactId to MultiXactIdPrecedes: it'll get
3105-
* weird. So, offset both multis by FirstMultiXactId to avoid that.
3106-
* (Actually, the current implementation doesn't do anything weird with
3107-
* InvalidMultiXactId, but there's no harm in leaving this code like this.)
3112+
* Offsetting the values is optional, because MultiXactIdPrecedes() has
3113+
* translational symmetry.
31083114
*/
31093115
static bool
31103116
MultiXactOffsetPagePrecedes(int page1, int page2)
@@ -3113,15 +3119,17 @@ MultiXactOffsetPagePrecedes(int page1, int page2)
31133119
MultiXactId multi2;
31143120

31153121
multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
3116-
multi1 += FirstMultiXactId;
3122+
multi1 += FirstMultiXactId + 1;
31173123
multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
3118-
multi2 += FirstMultiXactId;
3124+
multi2 += FirstMultiXactId + 1;
31193125

3120-
return MultiXactIdPrecedes(multi1, multi2);
3126+
return (MultiXactIdPrecedes(multi1, multi2) &&
3127+
MultiXactIdPrecedes(multi1,
3128+
multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1));
31213129
}
31223130

31233131
/*
3124-
* Decide which of two MultiXactMember page numbers is "older" for truncation
3132+
* Decide whether a MultiXactMember page number is "older" for truncation
31253133
* purposes. There is no "invalid offset number" so use the numbers verbatim.
31263134
*/
31273135
static bool
@@ -3133,7 +3141,9 @@ MultiXactMemberPagePrecedes(int page1, int page2)
31333141
offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
31343142
offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
31353143

3136-
return MultiXactOffsetPrecedes(offset1, offset2);
3144+
return (MultiXactOffsetPrecedes(offset1, offset2) &&
3145+
MultiXactOffsetPrecedes(offset1,
3146+
offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1));
31373147
}
31383148

31393149
/*

src/backend/access/transam/slru.c

+127-16
Original file line numberDiff line numberDiff line change
@@ -1230,11 +1230,6 @@ SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
12301230
/* update the stats counter of truncates */
12311231
pgstat_count_slru_truncate(shared->slru_stats_idx);
12321232

1233-
/*
1234-
* The cutoff point is the start of the segment containing cutoffPage.
1235-
*/
1236-
cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT;
1237-
12381233
/*
12391234
* Scan shared memory and remove any pages preceding the cutoff page, to
12401235
* ensure we won't rewrite them later. (Since this is normally called in
@@ -1247,9 +1242,7 @@ restart:;
12471242

12481243
/*
12491244
* While we are holding the lock, make an important safety check: the
1250-
* planned cutoff point must be <= the current endpoint page. Otherwise we
1251-
* have already wrapped around, and proceeding with the truncation would
1252-
* risk removing the current segment.
1245+
* current endpoint page must not be eligible for removal.
12531246
*/
12541247
if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage))
12551248
{
@@ -1281,8 +1274,11 @@ restart:;
12811274
* Hmm, we have (or may have) I/O operations acting on the page, so
12821275
* we've got to wait for them to finish and then start again. This is
12831276
* the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
1284-
* wouldn't it be OK to just discard it without writing it? For now,
1285-
* keep the logic the same as it was.)
1277+
* wouldn't it be OK to just discard it without writing it?
1278+
* SlruMayDeleteSegment() uses a stricter qualification, so we might
1279+
* not delete this page in the end; even if we don't delete it, we
1280+
* won't have cause to read its data again. For now, keep the logic
1281+
* the same as it was.)
12861282
*/
12871283
if (shared->page_status[slotno] == SLRU_PAGE_VALID)
12881284
SlruInternalWritePage(ctl, slotno, NULL);
@@ -1377,19 +1373,134 @@ SlruDeleteSegment(SlruCtl ctl, int segno)
13771373
LWLockRelease(shared->ControlLock);
13781374
}
13791375

1376+
/*
1377+
* Determine whether a segment is okay to delete.
1378+
*
1379+
* segpage is the first page of the segment, and cutoffPage is the oldest (in
1380+
* PagePrecedes order) page in the SLRU containing still-useful data. Since
1381+
* every core PagePrecedes callback implements "wrap around", check the
1382+
* segment's first and last pages:
1383+
*
1384+
* first<cutoff && last<cutoff: yes
1385+
* first<cutoff && last>=cutoff: no; cutoff falls inside this segment
1386+
* first>=cutoff && last<cutoff: no; wrap point falls inside this segment
1387+
* first>=cutoff && last>=cutoff: no; every page of this segment is too young
1388+
*/
1389+
static bool
1390+
SlruMayDeleteSegment(SlruCtl ctl, int segpage, int cutoffPage)
1391+
{
1392+
int seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1;
1393+
1394+
Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0);
1395+
1396+
return (ctl->PagePrecedes(segpage, cutoffPage) &&
1397+
ctl->PagePrecedes(seg_last_page, cutoffPage));
1398+
}
1399+
1400+
#ifdef USE_ASSERT_CHECKING
1401+
static void
1402+
SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset)
1403+
{
1404+
TransactionId lhs,
1405+
rhs;
1406+
int newestPage,
1407+
oldestPage;
1408+
TransactionId newestXact,
1409+
oldestXact;
1410+
1411+
/*
1412+
* Compare an XID pair having undefined order (see RFC 1982), a pair at
1413+
* "opposite ends" of the XID space. TransactionIdPrecedes() treats each
1414+
* as preceding the other. If RHS is oldestXact, LHS is the first XID we
1415+
* must not assign.
1416+
*/
1417+
lhs = per_page + offset; /* skip first page to avoid non-normal XIDs */
1418+
rhs = lhs + (1U << 31);
1419+
Assert(TransactionIdPrecedes(lhs, rhs));
1420+
Assert(TransactionIdPrecedes(rhs, lhs));
1421+
Assert(!TransactionIdPrecedes(lhs - 1, rhs));
1422+
Assert(TransactionIdPrecedes(rhs, lhs - 1));
1423+
Assert(TransactionIdPrecedes(lhs + 1, rhs));
1424+
Assert(!TransactionIdPrecedes(rhs, lhs + 1));
1425+
Assert(!TransactionIdFollowsOrEquals(lhs, rhs));
1426+
Assert(!TransactionIdFollowsOrEquals(rhs, lhs));
1427+
Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page));
1428+
Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page));
1429+
Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page));
1430+
Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page));
1431+
Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page));
1432+
Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page));
1433+
Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page)
1434+
|| (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */
1435+
Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page)
1436+
|| (1U << 31) % per_page != 0);
1437+
Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page));
1438+
Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page));
1439+
Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page));
1440+
1441+
/*
1442+
* GetNewTransactionId() has assigned the last XID it can safely use, and
1443+
* that XID is in the *LAST* page of the second segment. We must not
1444+
* delete that segment.
1445+
*/
1446+
newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1;
1447+
newestXact = newestPage * per_page + offset;
1448+
Assert(newestXact / per_page == newestPage);
1449+
oldestXact = newestXact + 1;
1450+
oldestXact -= 1U << 31;
1451+
oldestPage = oldestXact / per_page;
1452+
Assert(!SlruMayDeleteSegment(ctl,
1453+
(newestPage -
1454+
newestPage % SLRU_PAGES_PER_SEGMENT),
1455+
oldestPage));
1456+
1457+
/*
1458+
* GetNewTransactionId() has assigned the last XID it can safely use, and
1459+
* that XID is in the *FIRST* page of the second segment. We must not
1460+
* delete that segment.
1461+
*/
1462+
newestPage = SLRU_PAGES_PER_SEGMENT;
1463+
newestXact = newestPage * per_page + offset;
1464+
Assert(newestXact / per_page == newestPage);
1465+
oldestXact = newestXact + 1;
1466+
oldestXact -= 1U << 31;
1467+
oldestPage = oldestXact / per_page;
1468+
Assert(!SlruMayDeleteSegment(ctl,
1469+
(newestPage -
1470+
newestPage % SLRU_PAGES_PER_SEGMENT),
1471+
oldestPage));
1472+
}
1473+
1474+
/*
1475+
* Unit-test a PagePrecedes function.
1476+
*
1477+
* This assumes every uint32 >= FirstNormalTransactionId is a valid key. It
1478+
* assumes each value occupies a contiguous, fixed-size region of SLRU bytes.
1479+
* (MultiXactMemberCtl separates flags from XIDs. AsyncCtl has
1480+
* variable-length entries, no keys, and no random access. These unit tests
1481+
* do not apply to them.)
1482+
*/
1483+
void
1484+
SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page)
1485+
{
1486+
/* Test first, middle and last entries of a page. */
1487+
SlruPagePrecedesTestOffset(ctl, per_page, 0);
1488+
SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2);
1489+
SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1);
1490+
}
1491+
#endif
1492+
13801493
/*
13811494
* SlruScanDirectory callback
1382-
* This callback reports true if there's any segment prior to the one
1383-
* containing the page passed as "data".
1495+
* This callback reports true if there's any segment wholly prior to the
1496+
* one containing the page passed as "data".
13841497
*/
13851498
bool
13861499
SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data)
13871500
{
13881501
int cutoffPage = *(int *) data;
13891502

1390-
cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT;
1391-
1392-
if (ctl->PagePrecedes(segpage, cutoffPage))
1503+
if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
13931504
return true; /* found one; don't iterate any more */
13941505

13951506
return false; /* keep going */
@@ -1404,7 +1515,7 @@ SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data)
14041515
{
14051516
int cutoffPage = *(int *) data;
14061517

1407-
if (ctl->PagePrecedes(segpage, cutoffPage))
1518+
if (SlruMayDeleteSegment(ctl, segpage, cutoffPage))
14081519
SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT);
14091520

14101521
return false; /* keep going */

0 commit comments

Comments
 (0)