53
53
* fasthash as implemented here has two interfaces:
54
54
*
55
55
* 1) Standalone functions, e.g. fasthash32() for a single value with a
56
- * known length.
56
+ * known length. These return the same hash code as the original, at
57
+ * least on little-endian machines.
57
58
*
58
59
* 2) Incremental interface. This can be used for incorporating multiple
59
- * inputs. The standalone functions use this internally, so see fasthash64()
60
- * for an an example of how this works.
61
- *
62
- * The incremental interface is especially useful if any of the inputs
63
- * are NUL-terminated C strings, since the length is not needed ahead
64
- * of time. This avoids needing to call strlen(). This case is optimized
65
- * in fasthash_accum_cstring() :
60
+ * inputs. First, initialize the hash state (here with a zero seed):
66
61
*
67
62
* fasthash_state hs;
68
63
* fasthash_init(&hs, 0);
69
- * len = fasthash_accum_cstring(&hs, *str);
64
+ *
65
+ * If the inputs are of types that can be trivially cast to uint64, it's
66
+ * sufficient to do:
67
+ *
68
+ * hs.accum = value1;
69
+ * fasthash_combine(&hs);
70
+ * hs.accum = value2;
71
+ * fasthash_combine(&hs);
70
72
* ...
71
- * return fasthash_final32(&hs, len);
72
73
*
73
- * The length is computed on-the-fly. Experimentation has found that
74
+ * For longer or variable-length input, fasthash_accum() is a more
75
+ * flexible, but more verbose method. The standalone functions use this
76
+ * internally, so see fasthash64() for an example of this.
77
+ *
78
+ * After all inputs have been mixed in, finalize the hash:
79
+ *
80
+ * hashcode = fasthash_final32(&hs, 0);
81
+ *
82
+ * The incremental interface allows an optimization for NUL-terminated
83
+ * C strings:
84
+ *
85
+ * len = fasthash_accum_cstring(&hs, str);
86
+ * hashcode = fasthash_final32(&hs, len);
87
+ *
88
+ * By handling the terminator on-the-fly, we can avoid needing a strlen()
89
+ * call to tell us how many bytes to hash. Experimentation has found that
74
90
* SMHasher fails unless we incorporate the length, so it is passed to
75
91
* the finalizer as a tweak.
76
92
*/
@@ -204,26 +220,33 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
204
220
{
205
221
const char * const start = str ;
206
222
int remainder ;
207
- uint64 zero_bytes_le ;
223
+ uint64 zero_byte_low ;
208
224
209
225
Assert (PointerIsAligned (start , uint64 ));
226
+
227
+ /*
228
+ * For every chunk of input, check for zero bytes before mixing into the
229
+ * hash. The chunk with zeros must contain the NUL terminator. We arrange it
230
+ * so that zero_byte_low tells us not only that a zero exists, but also
231
+ * where it is, so we can hash the remainder of the string.
232
+ *
233
+ * The haszero64 calculation will set bits corresponding to the lowest
234
+ * byte where a zero exists, so that suffices for little-endian machines.
235
+ * For big-endian machines, we would need bits set for the highest zero
236
+ * byte in the chunk, since the trailing junk past the terminator could
237
+ * contain additional zeros. haszero64 does not give us that, so we
238
+ * byteswap the chunk first.
239
+ */
210
240
for (;;)
211
241
{
212
242
uint64 chunk = * (uint64 * ) str ;
213
243
214
- /*
215
- * With little-endian representation, we can use this calculation,
216
- * which sets bits in the first byte in the result word that
217
- * corresponds to a zero byte in the original word. The rest of the
218
- * bytes are indeterminate, so cannot be used on big-endian machines
219
- * without either swapping or a bytewise check.
220
- */
221
244
#ifdef WORDS_BIGENDIAN
222
- zero_bytes_le = haszero64 (pg_bswap64 (chunk ));
245
+ zero_byte_low = haszero64 (pg_bswap64 (chunk ));
223
246
#else
224
- zero_bytes_le = haszero64 (chunk );
247
+ zero_byte_low = haszero64 (chunk );
225
248
#endif
226
- if (zero_bytes_le )
249
+ if (zero_byte_low )
227
250
break ;
228
251
229
252
hs -> accum = chunk ;
@@ -232,12 +255,11 @@ fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
232
255
}
233
256
234
257
/*
235
- * For the last word, only use bytes up to the NUL for the hash. Bytes
236
- * with set bits will be 0x80, so calculate the first occurrence of a zero
237
- * byte within the input word by counting the number of trailing (because
238
- * little-endian) zeros and dividing the result by 8.
258
+ * The byte corresponding to the NUL will be 0x80, so the rightmost bit
259
+ * position will be in the range 7, 15, ..., 63. Turn this into byte
260
+ * position by dividing by 8.
239
261
*/
240
- remainder = pg_rightmost_one_pos64 (zero_bytes_le ) / BITS_PER_BYTE ;
262
+ remainder = pg_rightmost_one_pos64 (zero_byte_low ) / BITS_PER_BYTE ;
241
263
fasthash_accum (hs , str , remainder );
242
264
str += remainder ;
243
265
0 commit comments