Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 16b1fe0

Browse files
author
Amit Kapila
committed
Fix assertion failures while processing NEW_CID record in logical decoding.
When the logical decoding restarts from NEW_CID, since there is no association between the top transaction and its subtransaction, both are created as top transactions and have the same LSN. This caused the assertion failure in AssertTXNLsnOrder(). This patch skips the assertion check until we reach the LSN at which we start decoding the contents of the transaction, specifically start_decoding_at LSN in SnapBuild. This is okay because we don't guarantee to make the association between top transaction and subtransaction until we try to decode the actual contents of transaction. The ordering of the records prior to the start_decoding_at LSN should have been checked before the restart. The other assertion failure is due to the reason that we forgot to track that we have considered top-level transaction id in the list of catalog changing transactions that were committed when one of its subtransactions is marked as containing catalog change. Reported-by: Tomas Vondra, Osumi Takamichi Author: Masahiko Sawada, Kuroda Hayato Reviewed-by: Amit Kapila, Dilip Kumar, Kuroda Hayato, Kyotaro Horiguchi, Masahiko Sawada Backpatch-through: 10 Discussion: https://postgr.es/m/a89b46b6-0239-2fd5-71a9-b19b1f7a7145%40enterprisedb.com Discussion: https://postgr.es/m/TYCPR01MB83733C6CEAE47D0280814D5AED7A9%40TYCPR01MB8373.jpnprd01.prod.outlook.com
1 parent 460c007 commit 16b1fe0

File tree

4 files changed

+78
-0
lines changed

4 files changed

+78
-0
lines changed

contrib/test_decoding/expected/catalog_change_snapshot.out

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,48 @@ COMMIT
9696
stop
9797
(1 row)
9898

99+
100+
starting permutation: s0_init s0_begin s0_savepoint s0_insert s1_checkpoint s1_get_changes s0_insert2 s0_commit s0_begin s0_insert s1_checkpoint s1_get_changes s0_commit s1_get_changes
101+
step s0_init: SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding');
102+
?column?
103+
--------
104+
init
105+
(1 row)
106+
107+
step s0_begin: BEGIN;
108+
step s0_savepoint: SAVEPOINT sp1;
109+
step s0_insert: INSERT INTO tbl1 VALUES (1);
110+
step s1_checkpoint: CHECKPOINT;
111+
step s1_get_changes: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'skip-empty-xacts', '1', 'include-xids', '0');
112+
data
113+
----
114+
(0 rows)
115+
116+
step s0_insert2: INSERT INTO user_cat VALUES (1);
117+
step s0_commit: COMMIT;
118+
step s0_begin: BEGIN;
119+
step s0_insert: INSERT INTO tbl1 VALUES (1);
120+
step s1_checkpoint: CHECKPOINT;
121+
step s1_get_changes: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'skip-empty-xacts', '1', 'include-xids', '0');
122+
data
123+
-------------------------------------------------------------
124+
BEGIN
125+
table public.tbl1: INSERT: val1[integer]:1 val2[integer]:null
126+
table public.user_cat: INSERT: val1[integer]:1
127+
COMMIT
128+
(4 rows)
129+
130+
step s0_commit: COMMIT;
131+
step s1_get_changes: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'skip-empty-xacts', '1', 'include-xids', '0');
132+
data
133+
-------------------------------------------------------------
134+
BEGIN
135+
table public.tbl1: INSERT: val1[integer]:1 val2[integer]:null
136+
COMMIT
137+
(3 rows)
138+
139+
?column?
140+
--------
141+
stop
142+
(1 row)
143+

contrib/test_decoding/specs/catalog_change_snapshot.spec

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@ setup
66
DROP TABLE IF EXISTS tbl2;
77
CREATE TABLE tbl1 (val1 integer, val2 integer);
88
CREATE TABLE tbl2 (val1 integer, val2 integer);
9+
CREATE TABLE user_cat (val1 integer) WITH (user_catalog_table = true);
910
}
1011

1112
teardown
1213
{
1314
DROP TABLE tbl1;
1415
DROP TABLE tbl2;
16+
DROP TABLE user_cat;
1517
SELECT 'stop' FROM pg_drop_replication_slot('isolation_slot');
1618
}
1719

@@ -22,6 +24,7 @@ step "s0_begin" { BEGIN; }
2224
step "s0_savepoint" { SAVEPOINT sp1; }
2325
step "s0_truncate" { TRUNCATE tbl1; }
2426
step "s0_insert" { INSERT INTO tbl1 VALUES (1); }
27+
step "s0_insert2" { INSERT INTO user_cat VALUES (1); }
2528
step "s0_commit" { COMMIT; }
2629

2730
session "s1"
@@ -57,3 +60,16 @@ permutation "s0_init" "s0_begin" "s0_savepoint" "s0_truncate" "s1_checkpoint" "s
5760
# checkpoint record it prunes one of the xacts in that list and when decoding the
5861
# next checkpoint, it will completely prune that list.
5962
permutation "s0_init" "s0_begin" "s0_truncate" "s2_begin" "s2_truncate" "s1_checkpoint" "s1_get_changes" "s0_commit" "s0_begin" "s0_insert" "s1_checkpoint" "s1_get_changes" "s2_commit" "s1_checkpoint" "s1_get_changes" "s0_commit" "s1_get_changes"
63+
64+
# Test that we can handle the case where there is no association between top-level
65+
# transaction and its subtransactions. The last decoding restarts from the first
66+
# checkpoint, decodes NEW_CID generated by "s0_insert2", and marks the subtransaction
67+
# as containing catalog changes while adding tuple cids to its top-level transaction.
68+
# During that, both transaction entries are created in ReorderBuffer as top-level
69+
# transactions and have the same LSN. We check if the assertion check for the order
70+
# of transaction LSNs in AssertTXNLsnOrder() is skipped since we are still before the
71+
# LSN at which we start replaying the contents of transactions. Besides, when decoding
72+
# the commit record of the top-level transaction, we must force the top-level
73+
# transaction to do timetravel since one of its subtransactions has been marked as
74+
# containing catalog changes.
75+
permutation "s0_init" "s0_begin" "s0_savepoint" "s0_insert" "s1_checkpoint" "s1_get_changes" "s0_insert2" "s0_commit" "s0_begin" "s0_insert" "s1_checkpoint" "s1_get_changes" "s0_commit" "s1_get_changes"

src/backend/replication/logical/reorderbuffer.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -881,10 +881,24 @@ static void
881881
AssertTXNLsnOrder(ReorderBuffer *rb)
882882
{
883883
#ifdef USE_ASSERT_CHECKING
884+
LogicalDecodingContext *ctx = rb->private_data;
884885
dlist_iter iter;
885886
XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
886887
XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
887888

889+
/*
890+
* Skip the verification if we don't reach the LSN at which we start
891+
* decoding the contents of transactions yet because until we reach the
892+
* LSN, we could have transactions that don't have the association between
893+
* the top-level transaction and subtransaction yet and consequently have
894+
* the same LSN. We don't guarantee this association until we try to
895+
* decode the actual contents of transaction. The ordering of the records
896+
* prior to the start_decoding_at LSN should have been checked before the
897+
* restart.
898+
*/
899+
if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr))
900+
return;
901+
888902
dlist_foreach(iter, &rb->toplevel_by_lsn)
889903
{
890904
ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,

src/backend/replication/logical/snapbuild.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,6 +1099,9 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid,
10991099
else if (sub_needs_timetravel)
11001100
{
11011101
/* track toplevel txn as well, subxact alone isn't meaningful */
1102+
elog(DEBUG2, "forced transaction %u to do timetravel due to one of its subtransactions",
1103+
xid);
1104+
needs_timetravel = true;
11021105
SnapBuildAddCommittedTxn(builder, xid);
11031106
}
11041107
else if (needs_timetravel)

0 commit comments

Comments
 (0)