Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit ac88d29

Browse files
author
Amit Kapila
committed
Avoid creation of the free space map for small heap relations.
Previously, all heaps had FSMs. For very small tables, this means that the FSM took up more space than the heap did. This is wasteful, so now we refrain from creating the FSM for heaps with 4 pages or fewer. If the last known target block has insufficient space, we still try to insert into some other page before giving up and extending the relation, since doing otherwise leads to table bloat. Testing showed that trying every page penalized performance slightly, so we compromise and try every other page. This way, we visit at most two pages. Any pages with wasted free space become visible at next relation extension, so we still control table bloat. As a bonus, directly attempting one or two pages can even be faster than consulting the FSM would have been. Once the FSM is created for a heap we don't remove it even if somebody deletes all the rows from the corresponding relation. We don't think it is a useful optimization as it is quite likely that relation will again grow to the same size. Author: John Naylor with design inputs and some code contribution by Amit Kapila Reviewed-by: Amit Kapila Tested-by: Mithun C Y Discussion: https://www.postgresql.org/message-id/CAJVSVGWvB13PzpbLEecFuGFc5V2fsO736BsdTakPiPAcdMM5tQ@mail.gmail.com
1 parent d66e366 commit ac88d29

File tree

16 files changed

+576
-102
lines changed

16 files changed

+576
-102
lines changed

contrib/pageinspect/expected/page.out

+39-38
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,69 @@
11
CREATE EXTENSION pageinspect;
2-
CREATE TABLE test1 (a int, b int);
3-
INSERT INTO test1 VALUES (16777217, 131584);
4-
VACUUM test1; -- set up FSM
2+
CREATE TABLE test_rel_forks (a int);
3+
-- Make sure there are enough blocks in the heap for the FSM to be created.
4+
INSERT INTO test_rel_forks SELECT i from generate_series(1,1000) i;
5+
-- set up FSM and VM
6+
VACUUM test_rel_forks;
57
-- The page contents can vary, so just test that it can be read
68
-- successfully, but don't keep the output.
7-
SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0;
9+
SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0;
810
main_0
911
--------
1012
8192
1113
(1 row)
1214

13-
SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1;
14-
ERROR: block number 1 is out of range for relation "test1"
15-
SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0;
15+
SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100;
16+
ERROR: block number 100 is out of range for relation "test_rel_forks"
17+
SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0;
1618
fsm_0
1719
-------
1820
8192
1921
(1 row)
2022

21-
SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1;
22-
fsm_1
23-
-------
24-
8192
25-
(1 row)
26-
27-
SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0;
23+
SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10;
24+
ERROR: block number 10 is out of range for relation "test_rel_forks"
25+
SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0;
2826
vm_0
2927
------
3028
8192
3129
(1 row)
3230

33-
SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1;
34-
ERROR: block number 1 is out of range for relation "test1"
31+
SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1;
32+
ERROR: block number 1 is out of range for relation "test_rel_forks"
3533
SELECT octet_length(get_raw_page('xxx', 'main', 0));
3634
ERROR: relation "xxx" does not exist
37-
SELECT octet_length(get_raw_page('test1', 'xxx', 0));
35+
SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0));
3836
ERROR: invalid fork name
3937
HINT: Valid fork names are "main", "fsm", "vm", and "init".
40-
SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0);
38+
SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0));
39+
fsm_page_contents
40+
-------------------
41+
0: 147 +
42+
1: 147 +
43+
3: 147 +
44+
7: 147 +
45+
15: 147 +
46+
31: 147 +
47+
63: 147 +
48+
127: 147 +
49+
255: 147 +
50+
511: 147 +
51+
1023: 147 +
52+
2047: 147 +
53+
4095: 147 +
54+
fp_next_slot: 0 +
55+
56+
(1 row)
57+
58+
SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0);
4159
?column?
4260
----------
4361
t
4462
(1 row)
4563

64+
DROP TABLE test_rel_forks;
65+
CREATE TABLE test1 (a int, b int);
66+
INSERT INTO test1 VALUES (16777217, 131584);
4667
SELECT pagesize, version FROM page_header(get_raw_page('test1', 0));
4768
pagesize | version
4869
----------+---------
@@ -62,26 +83,6 @@ SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bi
6283
{"\\x01000001","\\x00020200"}
6384
(1 row)
6485

65-
SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0));
66-
fsm_page_contents
67-
-------------------
68-
0: 254 +
69-
1: 254 +
70-
3: 254 +
71-
7: 254 +
72-
15: 254 +
73-
31: 254 +
74-
63: 254 +
75-
127: 254 +
76-
255: 254 +
77-
511: 254 +
78-
1023: 254 +
79-
2047: 254 +
80-
4095: 254 +
81-
fp_next_slot: 0 +
82-
83-
(1 row)
84-
8586
DROP TABLE test1;
8687
-- check that using any of these functions with a partitioned table or index
8788
-- would fail

contrib/pageinspect/sql/page.sql

+20-13
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,35 @@
11
CREATE EXTENSION pageinspect;
22

3-
CREATE TABLE test1 (a int, b int);
4-
INSERT INTO test1 VALUES (16777217, 131584);
3+
CREATE TABLE test_rel_forks (a int);
4+
-- Make sure there are enough blocks in the heap for the FSM to be created.
5+
INSERT INTO test_rel_forks SELECT i from generate_series(1,1000) i;
56

6-
VACUUM test1; -- set up FSM
7+
-- set up FSM and VM
8+
VACUUM test_rel_forks;
79

810
-- The page contents can vary, so just test that it can be read
911
-- successfully, but don't keep the output.
1012

11-
SELECT octet_length(get_raw_page('test1', 'main', 0)) AS main_0;
12-
SELECT octet_length(get_raw_page('test1', 'main', 1)) AS main_1;
13+
SELECT octet_length(get_raw_page('test_rel_forks', 'main', 0)) AS main_0;
14+
SELECT octet_length(get_raw_page('test_rel_forks', 'main', 100)) AS main_100;
1315

14-
SELECT octet_length(get_raw_page('test1', 'fsm', 0)) AS fsm_0;
15-
SELECT octet_length(get_raw_page('test1', 'fsm', 1)) AS fsm_1;
16+
SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 0)) AS fsm_0;
17+
SELECT octet_length(get_raw_page('test_rel_forks', 'fsm', 10)) AS fsm_10;
1618

17-
SELECT octet_length(get_raw_page('test1', 'vm', 0)) AS vm_0;
18-
SELECT octet_length(get_raw_page('test1', 'vm', 1)) AS vm_1;
19+
SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 0)) AS vm_0;
20+
SELECT octet_length(get_raw_page('test_rel_forks', 'vm', 1)) AS vm_1;
1921

2022
SELECT octet_length(get_raw_page('xxx', 'main', 0));
21-
SELECT octet_length(get_raw_page('test1', 'xxx', 0));
23+
SELECT octet_length(get_raw_page('test_rel_forks', 'xxx', 0));
24+
25+
SELECT * FROM fsm_page_contents(get_raw_page('test_rel_forks', 'fsm', 0));
26+
27+
SELECT get_raw_page('test_rel_forks', 0) = get_raw_page('test_rel_forks', 'main', 0);
2228

23-
SELECT get_raw_page('test1', 0) = get_raw_page('test1', 'main', 0);
29+
DROP TABLE test_rel_forks;
30+
31+
CREATE TABLE test1 (a int, b int);
32+
INSERT INTO test1 VALUES (16777217, 131584);
2433

2534
SELECT pagesize, version FROM page_header(get_raw_page('test1', 0));
2635

@@ -29,8 +38,6 @@ SELECT page_checksum(get_raw_page('test1', 0), 0) IS NOT NULL AS silly_checksum_
2938
SELECT tuple_data_split('test1'::regclass, t_data, t_infomask, t_infomask2, t_bits)
3039
FROM heap_page_items(get_raw_page('test1', 0));
3140

32-
SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0));
33-
3441
DROP TABLE test1;
3542

3643
-- check that using any of these functions with a partitioned table or index

doc/src/sgml/storage.sgml

+7-6
Original file line numberDiff line numberDiff line change
@@ -590,12 +590,13 @@ tuple would otherwise be too big.
590590
<indexterm><primary>FSM</primary><see>Free Space Map</see></indexterm>
591591

592592
<para>
593-
Each heap and index relation, except for hash indexes, has a Free Space Map
594-
(FSM) to keep track of available space in the relation. It's stored
595-
alongside the main relation data in a separate relation fork, named after the
596-
filenode number of the relation, plus a <literal>_fsm</literal> suffix. For example,
597-
if the filenode of a relation is 12345, the FSM is stored in a file called
598-
<filename>12345_fsm</filename>, in the same directory as the main relation file.
593+
Each heap relation, unless it is very small, and each index relation, except
594+
for hash indexes, has a Free Space Map (FSM) to keep track of available
595+
space in the relation. It's stored alongside the main relation data in a
596+
separate relation fork, named after the filenode number of the relation, plus
597+
a <literal>_fsm</literal> suffix. For example, if the filenode of a relation
598+
is 12345, the FSM is stored in a file called <filename>12345_fsm</filename>,
599+
in the same directory as the main relation file.
599600
</para>
600601

601602
<para>

src/backend/access/brin/brin.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1150,7 +1150,7 @@ terminate_brin_buildstate(BrinBuildState *state)
11501150
freespace = PageGetFreeSpace(page);
11511151
blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
11521152
ReleaseBuffer(state->bs_currentInsertBuf);
1153-
RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
1153+
RecordPageWithFreeSpace(state->bs_irel, blk, freespace, InvalidBlockNumber);
11541154
FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
11551155
}
11561156

src/backend/access/brin/brin_pageops.c

+5-5
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
310310

311311
if (extended)
312312
{
313-
RecordPageWithFreeSpace(idxrel, newblk, freespace);
313+
RecordPageWithFreeSpace(idxrel, newblk, freespace, InvalidBlockNumber);
314314
FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
315315
}
316316

@@ -461,7 +461,7 @@ brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
461461

462462
if (extended)
463463
{
464-
RecordPageWithFreeSpace(idxrel, blk, freespace);
464+
RecordPageWithFreeSpace(idxrel, blk, freespace, InvalidBlockNumber);
465465
FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
466466
}
467467

@@ -654,7 +654,7 @@ brin_page_cleanup(Relation idxrel, Buffer buf)
654654

655655
/* Measure free space and record it */
656656
RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
657-
br_page_get_freespace(page));
657+
br_page_get_freespace(page), InvalidBlockNumber);
658658
}
659659

660660
/*
@@ -703,7 +703,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
703703
/* Choose initial target page, re-using existing target if known */
704704
newblk = RelationGetTargetBlock(irel);
705705
if (newblk == InvalidBlockNumber)
706-
newblk = GetPageWithFreeSpace(irel, itemsz);
706+
newblk = GetPageWithFreeSpace(irel, itemsz, true);
707707

708708
/*
709709
* Loop until we find a page with sufficient free space. By the time we
@@ -895,7 +895,7 @@ brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
895895
* pages whose FSM records were forgotten in a crash.
896896
*/
897897
RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
898-
br_page_get_freespace(page));
898+
br_page_get_freespace(page), InvalidBlockNumber);
899899
}
900900

901901

src/backend/access/heap/hio.c

+30-17
Original file line numberDiff line numberDiff line change
@@ -239,8 +239,14 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate)
239239
* Immediately update the bottom level of the FSM. This has a good
240240
* chance of making this page visible to other concurrently inserting
241241
* backends, and we want that to happen without delay.
242+
*
243+
* Since we know the table will end up with extraBlocks additional
244+
* pages, we pass the final number to avoid possible unnecessary
245+
* system calls and to make sure the FSM is created when we add the
246+
* first new page.
242247
*/
243-
RecordPageWithFreeSpace(relation, blockNum, freespace);
248+
RecordPageWithFreeSpace(relation, blockNum, freespace,
249+
firstBlock + extraBlocks);
244250
}
245251
while (--extraBlocks > 0);
246252

@@ -377,20 +383,9 @@ RelationGetBufferForTuple(Relation relation, Size len,
377383
* We have no cached target page, so ask the FSM for an initial
378384
* target.
379385
*/
380-
targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
381-
382-
/*
383-
* If the FSM knows nothing of the rel, try the last page before we
384-
* give up and extend. This avoids one-tuple-per-page syndrome during
385-
* bootstrapping or in a recently-started system.
386-
*/
387-
if (targetBlock == InvalidBlockNumber)
388-
{
389-
BlockNumber nblocks = RelationGetNumberOfBlocks(relation);
390-
391-
if (nblocks > 0)
392-
targetBlock = nblocks - 1;
393-
}
386+
targetBlock = GetPageWithFreeSpace(relation,
387+
len + saveFreeSpace,
388+
false);
394389
}
395390

396391
loop:
@@ -484,6 +479,14 @@ RelationGetBufferForTuple(Relation relation, Size len,
484479
{
485480
/* use this page as future insert target, too */
486481
RelationSetTargetBlock(relation, targetBlock);
482+
483+
/*
484+
* In case we used an in-memory map of available blocks, reset it
485+
* for next use.
486+
*/
487+
if (targetBlock < HEAP_FSM_CREATION_THRESHOLD)
488+
FSMClearLocalMap();
489+
487490
return buffer;
488491
}
489492

@@ -543,9 +546,12 @@ RelationGetBufferForTuple(Relation relation, Size len,
543546

544547
/*
545548
* Check if some other backend has extended a block for us while
546-
* we were waiting on the lock.
549+
* we were waiting on the lock. We only check the FSM -- if there
550+
* isn't one we don't recheck the number of blocks.
547551
*/
548-
targetBlock = GetPageWithFreeSpace(relation, len + saveFreeSpace);
552+
targetBlock = GetPageWithFreeSpace(relation,
553+
len + saveFreeSpace,
554+
true);
549555

550556
/*
551557
* If some other waiter has already extended the relation, we
@@ -625,5 +631,12 @@ RelationGetBufferForTuple(Relation relation, Size len,
625631
*/
626632
RelationSetTargetBlock(relation, BufferGetBlockNumber(buffer));
627633

634+
/*
635+
* In case we used an in-memory map of available blocks, reset it for next
636+
* use. We do this unconditionally since after relation extension we
637+
* can't skip this based on the targetBlock.
638+
*/
639+
FSMClearLocalMap();
640+
628641
return buffer;
629642
}

0 commit comments

Comments
 (0)