Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 43e7a66

Browse files
Introduce new page checksum algorithm and module.
Isolate checksum calculation to its own module, so that bufpage knows little if anything about the details of the calculation. This implementation is a modified FNV-1a hash checksum, details of which are given in the new checksum.c header comments. Basic implementation only, so we fix the output value. Later related commits will add version numbers to pg_control, compiler optimization flags and memory barriers. Ants Aasma, reviewed by Jeff Davis and Simon Riggs
1 parent f8db76e commit 43e7a66

File tree

4 files changed

+201
-20
lines changed

4 files changed

+201
-20
lines changed

src/backend/storage/page/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,6 @@ subdir = src/backend/storage/page
1212
top_builddir = ../../../..
1313
include $(top_builddir)/src/Makefile.global
1414

15-
OBJS = bufpage.o itemptr.o
15+
OBJS = bufpage.o checksum.o itemptr.o
1616

1717
include $(top_srcdir)/src/backend/common.mk

src/backend/storage/page/bufpage.c

+17-19
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
#include "access/htup_details.h"
1818
#include "access/xlog.h"
19+
#include "storage/checksum.h"
1920

2021
bool ignore_checksum_failure = false;
2122

@@ -948,33 +949,30 @@ PageSetChecksumInplace(Page page, BlockNumber blkno)
948949
static uint16
949950
PageCalcChecksum16(Page page, BlockNumber blkno)
950951
{
951-
pg_crc32 crc;
952-
PageHeader p = (PageHeader) page;
952+
PageHeader phdr = (PageHeader) page;
953+
uint16 save_checksum;
954+
uint32 checksum;
953955

954956
/* only calculate the checksum for properly-initialized pages */
955957
Assert(!PageIsNew(page));
956958

957-
INIT_CRC32(crc);
958-
959959
/*
960-
* Initialize the checksum calculation with the block number. This helps
961-
* catch corruption from whole blocks being transposed with other whole
962-
* blocks.
960+
* Save pd_checksum and set it to zero, so that the checksum calculation
961+
* isn't affected by the checksum stored on the page. We do this to
962+
* allow optimization of the checksum calculation on the whole block
963+
* in one go.
963964
*/
964-
COMP_CRC32(crc, &blkno, sizeof(blkno));
965+
save_checksum = phdr->pd_checksum;
966+
phdr->pd_checksum = 0;
967+
checksum = checksum_block(page, BLCKSZ);
968+
phdr->pd_checksum = save_checksum;
965969

966-
/*
967-
* Now add in the LSN, which is always the first field on the page.
968-
*/
969-
COMP_CRC32(crc, page, sizeof(p->pd_lsn));
970+
/* mix in the block number to detect transposed pages */
971+
checksum ^= blkno;
970972

971973
/*
972-
* Now add the rest of the page, skipping the pd_checksum field.
974+
* Reduce to a uint16 (to fit in the pd_checksum field) with an offset of
975+
* one. That avoids checksums of zero, which seems like a good idea.
973976
*/
974-
COMP_CRC32(crc, page + sizeof(p->pd_lsn) + sizeof(p->pd_checksum),
975-
BLCKSZ - sizeof(p->pd_lsn) - sizeof(p->pd_checksum));
976-
977-
FIN_CRC32(crc);
978-
979-
return (uint16) crc;
977+
return (checksum % 65535) + 1;
980978
}

src/backend/storage/page/checksum.c

+160
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* checksum.c
4+
* Checksum implementation for data pages.
5+
*
6+
* Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
7+
* Portions Copyright (c) 1994, Regents of the University of California
8+
*
9+
*
10+
* IDENTIFICATION
11+
* src/backend/storage/page/checksum.c
12+
*
13+
*-------------------------------------------------------------------------
14+
*
15+
* Checksum algorithm
16+
*
17+
* The algorithm used to checksum pages is chosen for very fast calculation.
18+
* Workloads where the database working set fits into OS file cache but not
19+
* into shared buffers can read in pages at a very fast pace and the checksum
20+
* algorithm itself can become the largest bottleneck.
21+
*
22+
* The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand
23+
* for Fowler/Noll/Vo). The primitive of a plain FNV-1a hash folds in data 1
24+
* byte at a time according to the formula:
25+
*
26+
* hash = (hash ^ value) * FNV_PRIME
27+
*
28+
* FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/
29+
*
30+
* PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of
31+
* high bits - high order bits in input data only affect high order bits in
32+
* output data. To resolve this we xor in the value prior to multiplication
33+
* shifted right by 17 bits. The number 17 was chosen because it doesn't
34+
* have a common denominator with set bit positions in FNV_PRIME and empirically
35+
* provides the fastest mixing, so high order bits of final iterations quickly
36+
* avalanche into lower positions. For performance reasons we choose to combine
37+
* 4 bytes at a time. The actual hash formula used as the basis is:
38+
*
39+
* hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17)
40+
*
41+
* The main bottleneck in this calculation is the multiplication latency. To
42+
* hide the latency and to make use of SIMD parallelism multiple hash values
43+
* are calculated in parallel. The page is treated as a 32 column two
44+
* dimensional array of 32 bit values. Each column is aggregated separately
45+
* into a partial checksum. Each partial checksum uses a different initial
46+
* value (offset basis in FNV terminology). The initial values actually used
47+
* were chosen randomly, as the values themselves don't matter as much as that
48+
* they are different and don't match anything in real data. After initializing
49+
* partial checksums each value in the column is aggregated according to the
50+
* above formula. Finally two more iterations of the formula are performed with
51+
* value 0 to mix the bits of the last value added.
52+
*
53+
* The partial checksums are then folded together using xor to form a single
54+
* 32-bit checksum. The caller can safely reduce the value to 16 bits
55+
* using modulo 2^16-1. That will cause a very slight bias towards lower
56+
* values but this is not significant for the performance of the
57+
* checksum.
58+
*
59+
* The algorithm choice was based on what instructions are available in SIMD
60+
* instruction sets. This meant that a fast and good algorithm needed to use
61+
* multiplication as the main mixing operator. The simplest multiplication
62+
* based checksum primitive is the one used by FNV. The prime used is chosen
63+
* for good dispersion of values. It has no known simple patterns that result
64+
* in collisions. Test of 5-bit differentials of the primitive over 64bit keys
65+
* reveals no differentials with 3 or more values out of 100000 random keys
66+
* colliding. Avalanche test shows that only high order bits of the last word
67+
* have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes,
68+
* overwriting page from random position to end with 0 bytes, and overwriting
69+
* random segments of page with 0x00, 0xFF and random data all show optimal
70+
* 2^-16 false positive rate within margin of error.
71+
*
72+
* Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer
73+
* multiplication instruction. As of 2013 the corresponding instruction is
74+
* available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32).
75+
* Vectorization requires a compiler to do the vectorization for us. For recent
76+
* GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough
77+
* to achieve vectorization.
78+
*
79+
* The optimal amount of parallelism to use depends on CPU specific instruction
80+
* latency, SIMD instruction width, throughput and the amount of registers
81+
* available to hold intermediate state. Generally, more parallelism is better
82+
* up to the point that state doesn't fit in registers and extra load-store
83+
* instructions are needed to swap values in/out. The number chosen is a fixed
84+
* part of the algorithm because changing the parallelism changes the checksum
85+
* result.
86+
*
87+
* The parallelism number 32 was chosen based on the fact that it is the
88+
* largest state that fits into architecturally visible x86 SSE registers while
89+
* leaving some free registers for intermediate values. For future processors
90+
* with 256bit vector registers this will leave some performance on the table.
91+
* When vectorization is not available it might be beneficial to restructure
92+
* the computation to calculate a subset of the columns at a time and perform
93+
* multiple passes to avoid register spilling. This optimization opportunity
94+
* is not used. Current coding also assumes that the compiler has the ability
95+
* to unroll the inner loop to avoid loop overhead and minimize register
96+
* spilling. For less sophisticated compilers it might be beneficial to manually
97+
* unroll the inner loop.
98+
*/
99+
#include "postgres.h"
100+
101+
#include "storage/checksum.h"
102+
103+
/* number of checksums to calculate in parallel */
104+
#define N_SUMS 32
105+
/* prime multiplier of FNV-1a hash */
106+
#define FNV_PRIME 16777619
107+
108+
/*
109+
* Base offsets to initialize each of the parallel FNV hashes into a
110+
* different initial state.
111+
*/
112+
static const uint32 checksumBaseOffsets[N_SUMS] = {
113+
0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A,
114+
0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C,
115+
0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA,
116+
0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB,
117+
0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE,
118+
0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4,
119+
0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E,
120+
0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756
121+
};
122+
123+
/*
124+
* Calculate one round of the checksum.
125+
*/
126+
#define CHECKSUM_COMP(checksum, value) do {\
127+
uint32 __tmp = (checksum) ^ (value);\
128+
(checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17);\
129+
} while (0)
130+
131+
uint32
132+
checksum_block(char *data, uint32 size)
133+
{
134+
uint32 sums[N_SUMS];
135+
uint32 (*dataArr)[N_SUMS] = (uint32 (*)[N_SUMS]) data;
136+
uint32 result = 0;
137+
int i, j;
138+
139+
/* ensure that the size is a whole number of rows of N_SUMS uint32 words */
140+
Assert((size % (sizeof(uint32)*N_SUMS)) == 0);
141+
142+
/* seed each of the N_SUMS partial checksums with its own base offset */
143+
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
144+
145+
/* main checksum calculation: fold word j of every row into partial sum j */
146+
for (i = 0; i < size/sizeof(uint32)/N_SUMS; i++)
147+
for (j = 0; j < N_SUMS; j++)
148+
CHECKSUM_COMP(sums[j], dataArr[i][j]);
149+
150+
/* finally add in two rounds of value 0 so the last row's bits get mixed */
151+
for (i = 0; i < 2; i++)
152+
for (j = 0; j < N_SUMS; j++)
153+
CHECKSUM_COMP(sums[j], 0);
154+
155+
/* xor fold the N_SUMS partial checksums into the single 32-bit result */
156+
for (i = 0; i < N_SUMS; i++)
157+
result ^= sums[i];
158+
159+
return result;
160+
}

src/include/storage/checksum.h

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* checksum.h
4+
* Checksum implementation for data pages.
5+
*
6+
*
7+
* Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
8+
* Portions Copyright (c) 1994, Regents of the University of California
9+
*
10+
* src/include/storage/checksum.h
11+
*
12+
*-------------------------------------------------------------------------
13+
*/
14+
#ifndef CHECKSUM_H
15+
#define CHECKSUM_H
16+
17+
/*
18+
* Fowler-Noll-Vo 1a block checksum algorithm. The data argument should be
19+
* aligned on a 4-byte boundary.
20+
*/
21+
extern uint32 checksum_block(char *data, uint32 size);
22+
23+
#endif /* CHECKSUM_H */

0 commit comments

Comments
 (0)