Commit a4ccc1c

Generational memory allocator
Add a new style of memory allocator, known as Generational, appropriate for use in cases where memory is allocated and then freed in roughly oldest-first order (FIFO).

Use the new allocator for logical decoding's reorderbuffer to significantly reduce memory usage and improve performance.

Author: Tomas Vondra
Reviewed-by: Simon Riggs
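Below, a minimal sketch of the allocation pattern the new context targets, using the GenerationContextCreate() call that also appears in the reorderbuffer.c hunk further down; the demo function, chunk count, and chunk sizes are illustrative assumptions, not part of the commit:

    #include "postgres.h"
    #include "utils/memutils.h"

    /*
     * Illustrative only: chunks are allocated and later freed in roughly the
     * same (oldest-first) order, which is the pattern that lets generation.c
     * recycle whole blocks once every chunk on them has been pfree'd.
     */
    static void
    generation_fifo_demo(MemoryContext parent)
    {
        MemoryContext demo_ctx;
        void       *chunks[128];
        int         i;

        demo_ctx = GenerationContextCreate(parent,
                                           "generation demo",
                                           SLAB_LARGE_BLOCK_SIZE);

        for (i = 0; i < 128; i++)
            chunks[i] = MemoryContextAlloc(demo_ctx, 1024);  /* allocate oldest first */

        for (i = 0; i < 128; i++)
            pfree(chunks[i]);       /* free in the same (FIFO) order */

        MemoryContextDelete(demo_ctx);
    }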
1 parent 3bae43c commit a4ccc1c

File tree

8 files changed: +819 -79 lines changed


src/backend/replication/logical/reorderbuffer.c (+17 -63)

@@ -43,6 +43,12 @@
  * transaction there will be no other data carrying records between a row's
  * toast chunks and the row data itself. See ReorderBufferToast* for
  * details.
+ *
+ * ReorderBuffer uses two special memory context types - SlabContext for
+ * allocations of fixed-length structures (changes and transactions), and
+ * GenerationContext for the variable-length transaction data (allocated
+ * and freed in groups with similar lifespan).
+ *
  * -------------------------------------------------------------------------
  */
 #include "postgres.h"
@@ -150,15 +156,6 @@ typedef struct ReorderBufferDiskChange
  */
 static const Size max_changes_in_memory = 4096;
 
-/*
- * We use a very simple form of a slab allocator for frequently allocated
- * objects, simply keeping a fixed number in a linked list when unused,
- * instead pfree()ing them. Without that in many workloads aset.c becomes a
- * major bottleneck, especially when spilling to disk while decoding batch
- * workloads.
- */
-static const Size max_cached_tuplebufs = 4096 * 2;    /* ~8MB */
-
 /* ---------------------------------------
  * primary reorderbuffer support routines
  * ---------------------------------------
@@ -248,6 +245,10 @@ ReorderBufferAllocate(void)
                                             SLAB_DEFAULT_BLOCK_SIZE,
                                             sizeof(ReorderBufferTXN));
 
+    buffer->tup_context = GenerationContextCreate(new_ctx,
+                                                  "Tuples",
+                                                  SLAB_LARGE_BLOCK_SIZE);
+
     hash_ctl.keysize = sizeof(TransactionId);
     hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
     hash_ctl.hcxt = buffer->context;
@@ -258,15 +259,12 @@ ReorderBufferAllocate(void)
     buffer->by_txn_last_xid = InvalidTransactionId;
     buffer->by_txn_last_txn = NULL;
 
-    buffer->nr_cached_tuplebufs = 0;
-
     buffer->outbuf = NULL;
     buffer->outbufsize = 0;
 
     buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
 
     dlist_init(&buffer->toplevel_by_lsn);
-    slist_init(&buffer->cached_tuplebufs);
 
     return buffer;
 }
@@ -419,42 +417,12 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 
     alloc_len = tuple_len + SizeofHeapTupleHeader;
 
-    /*
-     * Most tuples are below MaxHeapTupleSize, so we use a slab allocator for
-     * those. Thus always allocate at least MaxHeapTupleSize. Note that tuples
-     * generated for oldtuples can be bigger, as they don't have out-of-line
-     * toast columns.
-     */
-    if (alloc_len < MaxHeapTupleSize)
-        alloc_len = MaxHeapTupleSize;
-
-
-    /* if small enough, check the slab cache */
-    if (alloc_len <= MaxHeapTupleSize && rb->nr_cached_tuplebufs)
-    {
-        rb->nr_cached_tuplebufs--;
-        tuple = slist_container(ReorderBufferTupleBuf, node,
-                                slist_pop_head_node(&rb->cached_tuplebufs));
-        Assert(tuple->alloc_tuple_size == MaxHeapTupleSize);
-#ifdef USE_ASSERT_CHECKING
-        memset(&tuple->tuple, 0xa9, sizeof(HeapTupleData));
-        VALGRIND_MAKE_MEM_UNDEFINED(&tuple->tuple, sizeof(HeapTupleData));
-#endif
-        tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-#ifdef USE_ASSERT_CHECKING
-        memset(tuple->tuple.t_data, 0xa8, tuple->alloc_tuple_size);
-        VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-#endif
-    }
-    else
-    {
-        tuple = (ReorderBufferTupleBuf *)
-            MemoryContextAlloc(rb->context,
-                               sizeof(ReorderBufferTupleBuf) +
-                               MAXIMUM_ALIGNOF + alloc_len);
-        tuple->alloc_tuple_size = alloc_len;
-        tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
-    }
+    tuple = (ReorderBufferTupleBuf *)
+        MemoryContextAlloc(rb->tup_context,
+                           sizeof(ReorderBufferTupleBuf) +
+                           MAXIMUM_ALIGNOF + alloc_len);
+    tuple->alloc_tuple_size = alloc_len;
+    tuple->tuple.t_data = ReorderBufferTupleBufData(tuple);
 
     return tuple;
 }
@@ -468,21 +436,7 @@ ReorderBufferGetTupleBuf(ReorderBuffer *rb, Size tuple_len)
 void
 ReorderBufferReturnTupleBuf(ReorderBuffer *rb, ReorderBufferTupleBuf *tuple)
 {
-    /* check whether to put into the slab cache, oversized tuples never are */
-    if (tuple->alloc_tuple_size == MaxHeapTupleSize &&
-        rb->nr_cached_tuplebufs < max_cached_tuplebufs)
-    {
-        rb->nr_cached_tuplebufs++;
-        slist_push_head(&rb->cached_tuplebufs, &tuple->node);
-        VALGRIND_MAKE_MEM_UNDEFINED(tuple->tuple.t_data, tuple->alloc_tuple_size);
-        VALGRIND_MAKE_MEM_UNDEFINED(tuple, sizeof(ReorderBufferTupleBuf));
-        VALGRIND_MAKE_MEM_DEFINED(&tuple->node, sizeof(tuple->node));
-        VALGRIND_MAKE_MEM_DEFINED(&tuple->alloc_tuple_size, sizeof(tuple->alloc_tuple_size));
-    }
-    else
-    {
-        pfree(tuple);
-    }
+    pfree(tuple);
 }
 
 /*
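For orientation, a short usage sketch of the simplified path above; only ReorderBufferGetTupleBuf(), ReorderBufferReturnTupleBuf() and the rb->tup_context field come from this diff, the surrounding caller code is assumed:

    ReorderBufferTupleBuf *tuple;

    /* the buffer now comes straight from the "Tuples" GenerationContext */
    tuple = ReorderBufferGetTupleBuf(rb, tuple_len);

    /* ... fill tuple->tuple.t_data with the decoded tuple and queue it ... */

    /* returning it is now just a pfree() into rb->tup_context */
    ReorderBufferReturnTupleBuf(rb, tuple);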

src/backend/utils/mmgr/Makefile (+1 -1)

@@ -12,6 +12,6 @@ subdir = src/backend/utils/mmgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = aset.o dsa.o freepage.o mcxt.o memdebug.o portalmem.o slab.o
+OBJS = aset.o dsa.o freepage.o generation.o mcxt.o memdebug.o portalmem.o slab.o
 
 include $(top_srcdir)/src/backend/common.mk

src/backend/utils/mmgr/README (+23)

@@ -431,3 +431,26 @@ will not allocate very much space per tuple cycle. To make this usage
 pattern cheap, the first block allocated in a context is not given
 back to malloc() during reset, but just cleared. This avoids malloc
 thrashing.
+
+
+Alternative Memory Context Implementations
+------------------------------------------
+
+aset.c is our default general-purpose implementation, working fine
+in most situations. We also have two implementations optimized for
+special use cases, providing either better performance or lower memory
+usage compared to aset.c (or both).
+
+* slab.c (SlabContext) is designed for allocations of fixed-length
+  chunks, and does not allow allocations of chunks with different size.
+
+* generation.c (GenerationContext) is designed for cases when chunks
+  are allocated in groups with similar lifespan (generations), or
+  roughly in FIFO order.
+
+Both memory contexts aim to free memory back to the operating system
+(unlike aset.c, which keeps the freed chunks in a freelist, and only
+returns the memory when reset/deleted).
+
+These memory contexts were initially developed for ReorderBuffer, but
+may be useful elsewhere as long as the allocation patterns match.
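A brief sketch of creating each of the specialized contexts described above, using the same creation calls this commit uses in reorderbuffer.c; the parent context, the context names, and the MyFixedStruct type are illustrative assumptions:

    /* fixed-length chunks only, e.g. one chunk per MyFixedStruct */
    MemoryContext slab_ctx = SlabContextCreate(parent,
                                               "slab demo",
                                               SLAB_DEFAULT_BLOCK_SIZE,
                                               sizeof(MyFixedStruct));

    /* variable-length chunks with similar lifespans, freed roughly in FIFO order */
    MemoryContext gen_ctx = GenerationContextCreate(parent,
                                                    "generation demo",
                                                    SLAB_LARGE_BLOCK_SIZE);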
