|
| 1 | +/*------------------------------------------------------------------------- |
| 2 | + * |
| 3 | + * checksum.c |
| 4 | + * Checksum implementation for data pages. |
| 5 | + * |
| 6 | + * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group |
| 7 | + * Portions Copyright (c) 1994, Regents of the University of California |
| 8 | + * |
| 9 | + * |
| 10 | + * IDENTIFICATION |
| 11 | + * src/backend/storage/page/checksum.c |
| 12 | + * |
| 13 | + *------------------------------------------------------------------------- |
| 14 | + * |
| 15 | + * Checksum algorithm |
| 16 | + * |
| 17 | + * The algorithm used to checksum pages is chosen for very fast calculation. |
| 18 | + * Workloads where the database working set fits into OS file cache but not |
| 19 | + * into shared buffers can read in pages at a very fast pace and the checksum |
| 20 | + * algorithm itself can become the largest bottleneck. |
| 21 | + * |
| 22 | + * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand |
| 23 | + * for Fowler/Noll/Vo). The primitive of a plain FNV-1a hash folds in data 1 |
| 24 | + * byte at a time according to the formula: |
| 25 | + * |
| 26 | + * hash = (hash ^ value) * FNV_PRIME |
| 27 | + * |
| 28 | + * FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/ |
| 29 | + * |
| 30 | + * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of |
| 31 | + * high bits - high order bits in input data only affect high order bits in |
| 32 | + * output data. To resolve this we xor in the value prior to multiplication |
| 33 | + * shifted right by 17 bits. The number 17 was chosen because it doesn't |
| 34 | + * have a common denominator with the set bit positions in FNV_PRIME and |
| 35 | + * empirically provides the fastest mixing: high order bits of the final |
| 36 | + * iterations quickly avalanche into lower positions. For performance reasons |
| 37 | + * we combine 4 bytes at a time. The actual hash formula used as the basis is: |
| 38 | + * |
| 39 | + * hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17) |
| 40 | + * |
| 41 | + * The main bottleneck in this calculation is the multiplication latency. To |
| 42 | + * hide the latency and to make use of SIMD parallelism multiple hash values |
| 43 | + * are calculated in parallel. The page is treated as a 32 column two |
| 44 | + * dimensional array of 32 bit values. Each column is aggregated separately |
| 45 | + * into a partial checksum. Each partial checksum uses a different initial |
| 46 | + * value (offset basis in FNV terminology). The initial values actually used |
| 47 | + * were chosen randomly, as the values themselves don't matter as much as that |
| 48 | + * they are different and don't match anything in real data. After initializing |
| 49 | + * partial checksums each value in the column is aggregated according to the |
| 50 | + * above formula. Finally two more iterations of the formula are performed with |
| 51 | + * value 0 to mix the bits of the last value added. |
| 52 | + * |
| 53 | + * The partial checksums are then folded together using xor to form a single |
| 54 | + * 32-bit checksum. The caller can safely reduce the value to 16 bits |
| 55 | + * using modulo 2^16-1. That will cause a very slight bias towards lower |
| 56 | + * values but this is not significant for the performance of the |
| 57 | + * checksum. |
| 58 | + * |
| 59 | + * The algorithm choice was based on what instructions are available in SIMD |
| 60 | + * instruction sets. This meant that a fast and good algorithm needed to use |
| 61 | + * multiplication as the main mixing operator. The simplest multiplication |
| 62 | + * based checksum primitive is the one used by FNV. The prime used is chosen |
| 63 | + * for good dispersion of values. It has no known simple patterns that result |
| 64 | + * in collisions. Test of 5-bit differentials of the primitive over 64bit keys |
| 65 | + * reveals no differentials with 3 or more values out of 100000 random keys |
| 66 | + * colliding. Avalanche test shows that only high order bits of the last word |
| 67 | + * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes, |
| 68 | + * overwriting page from random position to end with 0 bytes, and overwriting |
| 69 | + * random segments of page with 0x00, 0xFF and random data all show optimal |
| 70 | + * 2^-16 false positive rate within margin of error. |
| 71 | + * |
| 72 | + * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer |
| 73 | + * multiplication instruction. As of 2013 the corresponding instruction is |
| 74 | + * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). |
| 75 | + * Vectorization requires a compiler to do the vectorization for us. For recent |
| 76 | + * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough |
| 77 | + * to achieve vectorization. |
| 78 | + * |
| 79 | + * The optimal amount of parallelism to use depends on CPU specific instruction |
| 80 | + * latency, SIMD instruction width, throughput and the amount of registers |
| 81 | + * available to hold intermediate state. Generally, more parallelism is better |
| 82 | + * up to the point that state doesn't fit in registers and extra load-store |
| 83 | + * instructions are needed to swap values in/out. The number chosen is a fixed |
| 84 | + * part of the algorithm because changing the parallelism changes the checksum |
| 85 | + * result. |
| 86 | + * |
| 87 | + * The parallelism number 32 was chosen based on the fact that it is the |
| 88 | + * largest state that fits into architecturally visible x86 SSE registers while |
| 89 | + * leaving some free registers for intermediate values. For future processors |
| 90 | + * with 256bit vector registers this will leave some performance on the table. |
| 91 | + * When vectorization is not available it might be beneficial to restructure |
| 92 | + * the computation to calculate a subset of the columns at a time and perform |
| 93 | + * multiple passes to avoid register spilling. This optimization opportunity |
| 94 | + * is not used. Current coding also assumes that the compiler has the ability |
| 95 | + * to unroll the inner loop to avoid loop overhead and minimize register |
| 96 | + * spilling. For less sophisticated compilers it might be beneficial to manually |
| 97 | + * unroll the inner loop. |
| 98 | + */ |
| 99 | +#include "postgres.h" |
| 100 | + |
| 101 | +#include "storage/checksum.h" |
| 102 | + |
| 103 | +/* number of checksums to calculate in parallel; also the row width, in 32-bit words, used to walk the data, so changing it changes the checksum result */ |
| 104 | +#define N_SUMS 32 |
| 105 | +/* prime multiplier of FNV-1a hash: the 32-bit FNV prime, 0x01000193 */ |
| 106 | +#define FNV_PRIME 16777619 |
| 107 | + |
| 108 | +/* |
| 109 | + * Base offsets to initialize each of the parallel FNV hashes into a |
| 110 | + * different initial state. |
|  | + * |
|  | + * Entry j is the starting value of partial checksum j in checksum_block(), |
|  | + * which copies this whole table into its working array before reading any |
|  | + * data. The values feed directly into the result, so altering any entry |
|  | + * alters every checksum computed afterwards. |
| 111 | + */ |
| 112 | +static const uint32 checksumBaseOffsets[N_SUMS] = { |
| 113 | + 0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, |
| 114 | + 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C, |
| 115 | + 0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, |
| 116 | + 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB, |
| 117 | + 0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE, |
| 118 | + 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4, |
| 119 | + 0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, |
| 120 | + 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756 |
| 121 | +}; |
| 122 | + |
| 123 | +/* |
| 124 | + * Calculate one round of the checksum. |
|  | + * |
|  | + * The new value is xor'ed into the running checksum, and the result is then |
|  | + * mixed by xor'ing its product with FNV_PRIME against the same result |
|  | + * shifted right by 17 bits. Since '*' binds tighter than '^', the update is |
|  | + * (tmp * FNV_PRIME) ^ (tmp >> 17). |
|  | + * |
|  | + * 'checksum' must be a modifiable lvalue; it is evaluated twice (once read, |
|  | + * once assigned), so it must not have side effects. 'value' is evaluated |
|  | + * once. |
|  | + * |
|  | + * NOTE(review): identifiers beginning with a double underscore, such as |
|  | + * __tmp, are reserved for the implementation in ISO C; consider renaming if |
|  | + * this ever clashes with a compiler or system header. |
| 125 | + */ |
| 126 | +#define CHECKSUM_COMP(checksum, value) do {\ |
| 127 | + uint32 __tmp = (checksum) ^ (value);\ |
| 128 | + (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17);\ |
| 129 | +} while (0) |
| 130 | +/* |
|  | + * checksum_block |
|  | + *		Compute a 32-bit checksum over a block of data. |
|  | + * |
|  | + * data: start of the block. It is only read, never written. The bytes are |
|  | + *		accessed through a pointer to uint32 arrays, with no byte-order |
|  | + *		conversion. NOTE(review): this presumably requires 'data' to be |
|  | + *		suitably aligned for uint32 access; confirm that callers ensure it. |
|  | + * size: length of the block in bytes. Must be a multiple of |
|  | + *		sizeof(uint32) * N_SUMS; this is checked only by the Assert() below. |
|  | + * |
|  | + * Returns the xor of the N_SUMS partial checksums. Reducing the value to |
|  | + * fewer bits, if wanted, is left to the caller. |
|  | + */ |
| 131 | +uint32 |
| 132 | +checksum_block(char *data, uint32 size) |
| 133 | +{ |
| 134 | + uint32 sums[N_SUMS]; /* one running partial checksum per column */ |
| 135 | + uint32 (*dataArr)[N_SUMS] = (uint32 (*)[N_SUMS]) data; /* view the block as rows of N_SUMS words */ |
| 136 | + uint32 result = 0; |
| 137 | + int i, j; |
| 138 | + |
| 139 | + /* ensure that the size is compatible with the algorithm: the loop below only visits whole rows, so any trailing partial row would be left out of the checksum */ |
| 140 | + Assert((size % (sizeof(uint32)*N_SUMS)) == 0); |
| 141 | + |
| 142 | + /* initialize partial checksums to their corresponding offsets */ |
| 143 | + memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); |
| 144 | + |
| 145 | + /* main checksum calculation: for each row, fold word j into sums[j]; the updates within one row do not depend on each other */ |
| 146 | + for (i = 0; i < size/sizeof(uint32)/N_SUMS; i++) |
| 147 | + for (j = 0; j < N_SUMS; j++) |
| 148 | + CHECKSUM_COMP(sums[j], dataArr[i][j]); |
| 149 | + |
| 150 | + /* finally add in two rounds of zeroes for additional mixing */ |
| 151 | + for (i = 0; i < 2; i++) |
| 152 | + for (j = 0; j < N_SUMS; j++) |
| 153 | + CHECKSUM_COMP(sums[j], 0); |
| 154 | + |
| 155 | + /* xor fold partial checksums together */ |
| 156 | + for (i = 0; i < N_SUMS; i++) |
| 157 | + result ^= sums[i]; |
| 158 | + |
| 159 | + return result; |
| 160 | +} |
0 commit comments