#include "access/htup_details.h"
#include "nodes/bitmapset.h"
#include "nodes/tidbitmap.h"
- #include "utils/hsearch.h"

/*
 * The maximum number of tuples per page is not large (typically 256 with
 * [...]
 * for that page in the page table.
 *
 * We actually store both exact pages and lossy chunks in the same hash
- * table, using identical data structures. (This is because dynahash.c's
- * memory management doesn't allow space to be transferred easily from one
- * hashtable to another.) Therefore it's best if PAGES_PER_CHUNK is the
- * same as MAX_TUPLES_PER_PAGE, or at least not too different. But we
- * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
- * remainder operations. So, define it like this:
+ * table, using identical data structures. (This is because the memory
+ * management for hashtables doesn't easily allow space to be transferred
+ * from one hashtable to another.) Therefore it's best if PAGES_PER_CHUNK
+ * is the same as MAX_TUPLES_PER_PAGE, or at least not too different. But
+ * we also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive
+ * integer remainder operations. So, define it like this:
 */
#define PAGES_PER_CHUNK (BLCKSZ / 32)
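
Since PAGES_PER_CHUNK is a compile-time power of two (8192 / 32 = 256 with the
default 8K BLCKSZ), the remainder and round-down arithmetic used for chunk
addressing later in the file reduces to bit masking. A standalone illustration,
not part of the patch (the constant is hard-coded here for the 8K case):

    #include <assert.h>
    #include <stdint.h>

    #define PAGES_PER_CHUNK 256     /* BLCKSZ / 32 with 8K blocks */

    int
    main(void)
    {
        uint32_t    pageno = 1000;

        /* for a power-of-two divisor, the remainder is just a mask */
        assert(pageno % PAGES_PER_CHUNK == (pageno & (PAGES_PER_CHUNK - 1)));

        /* chunk-header page: round down to a multiple of PAGES_PER_CHUNK */
        assert(pageno - pageno % PAGES_PER_CHUNK ==
               (pageno & ~(uint32_t) (PAGES_PER_CHUNK - 1)));

        return 0;
    }
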

typedef struct PagetableEntry
{
    BlockNumber blockno;        /* page number (hashtable key) */
+   char        status;         /* hash entry status */
    bool        ischunk;        /* T = lossy storage, F = exact */
    bool        recheck;        /* should the tuples be rechecked? */
    bitmapword  words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
} PagetableEntry;

/*
- * dynahash.c is optimized for relatively large, long-lived hash tables.
- * This is not ideal for TIDBitMap, particularly when we are using a bitmap
- * scan on the inside of a nestloop join: a bitmap may well live only long
- * enough to accumulate one entry in such cases. We therefore avoid creating
- * an actual hashtable until we need two pagetable entries. When just one
- * pagetable entry is needed, we store it in a fixed field of TIDBitMap.
- * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down
- * to zero or one page again. So, status can be TBM_HASH even when nentries
- * is zero or one.)
+ * We want to avoid the comparatively large overhead of creating the
+ * hashtable when it is not necessary, particularly when we are using a
+ * bitmap scan on the inside of a nestloop join: a bitmap may well live only
+ * long enough to accumulate one entry in such cases. We therefore avoid
+ * creating an actual hashtable until we need two pagetable entries. When
+ * just one pagetable entry is needed, we store it in a fixed field of
+ * TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
+ * shrinks down to zero or one page again. So, status can be TBM_HASH even
+ * when nentries is zero or one.)
 */
typedef enum
{
@@ -128,12 +128,13 @@ struct TIDBitmap
    NodeTag     type;           /* to make it a valid Node */
    MemoryContext mcxt;         /* memory context containing me */
    TBMStatus   status;         /* see codes above */
-   HTAB       *pagetable;      /* hash table of PagetableEntry's */
+   struct pagetable_hash *pagetable;   /* hash table of PagetableEntry's */
    int         nentries;       /* number of entries in pagetable */
    int         maxentries;     /* limit on same to meet maxbytes */
    int         npages;         /* number of exact entries in pagetable */
    int         nchunks;        /* number of lossy entries in pagetable */
    bool        iterating;      /* tbm_begin_iterate called? */
+   uint32      lossify_start;  /* offset to start lossifying hashtable at */
    PagetableEntry entry1;      /* used when status == TBM_ONE_PAGE */
    /* these are valid when iterating is true: */
    PagetableEntry **spages;    /* sorted exact-page list, or NULL */
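
For orientation, this is roughly how callers choose between the fixed entry1
slot and the pagetable; a condensed sketch based on tbm_find_pageentry further
down, assuming the declarations in this file (it is not code from the patch):

    /* Sketch only: fixed-slot fast path vs. hashtable lookup. */
    static const PagetableEntry *
    sketch_find_entry(const TIDBitmap *tbm, BlockNumber pageno)
    {
        if (tbm->nentries == 0)             /* nothing stored at all */
            return NULL;
        if (tbm->status == TBM_ONE_PAGE)    /* the single entry lives in entry1 */
            return (tbm->entry1.blockno == pageno) ? &tbm->entry1 : NULL;
        /* TBM_HASH: consult the simplehash table */
        return pagetable_lookup(tbm->pagetable, pageno);
    }
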
@@ -168,6 +169,35 @@ static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
static void tbm_lossify(TIDBitmap *tbm);
static int  tbm_comparator(const void *left, const void *right);

+ /*
+  * Simple inline murmur hash implementation for the exact width required, for
+  * performance.
+  */
+ static inline uint32
+ hash_blockno(BlockNumber b)
+ {
+     uint32 h = b;
+
+     h ^= h >> 16;
+     h *= 0x85ebca6b;
+     h ^= h >> 13;
+     h *= 0xc2b2ae35;
+     h ^= h >> 16;
+     return h;
+ }
+
+ /* define hashtable mapping block numbers to PagetableEntry's */
+ #define SH_PREFIX pagetable
+ #define SH_ELEMENT_TYPE PagetableEntry
+ #define SH_KEY_TYPE BlockNumber
+ #define SH_KEY blockno
+ #define SH_HASH_KEY(tb, key) hash_blockno(key)
+ #define SH_EQUAL(tb, a, b) a == b
+ #define SH_SCOPE static inline
+ #define SH_DEFINE
+ #define SH_DECLARE
+ #include "lib/simplehash.h"
+

/*
 * tbm_create - create an initially-empty bitmap
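
The SH_* block above is a template instantiation: lib/simplehash.h emits a
complete open-addressing hashtable whose type and functions are named after
SH_PREFIX, storing PagetableEntry elements inline and keying them by blockno
(hashed with hash_blockno, the 32-bit murmur3 finalizer). The summary below is
paraphrased from how the generated functions are called elsewhere in this
patch, not taken from simplehash.h itself:

    /*
     * Generated API for SH_PREFIX = pagetable, as used in this patch:
     *
     *   struct pagetable_hash *pagetable_create(MemoryContext ctx, uint32 nelements);
     *   void pagetable_destroy(struct pagetable_hash *tb);
     *   PagetableEntry *pagetable_insert(struct pagetable_hash *tb, BlockNumber key, bool *found);
     *   PagetableEntry *pagetable_lookup(struct pagetable_hash *tb, BlockNumber key);
     *   bool pagetable_delete(struct pagetable_hash *tb, BlockNumber key);
     *   void pagetable_start_iterate(struct pagetable_hash *tb, pagetable_iterator *i);
     *   void pagetable_start_iterate_at(struct pagetable_hash *tb, pagetable_iterator *i, uint32 at);
     *   PagetableEntry *pagetable_iterate(struct pagetable_hash *tb, pagetable_iterator *i);
     */
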
@@ -190,17 +220,16 @@ tbm_create(long maxbytes)

    /*
     * Estimate number of hashtable entries we can have within maxbytes. This
-    * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a
-    * pointer per hash entry, which is crude but good enough for our purpose.
-    * Also count an extra Pointer per entry for the arrays created during
-    * iteration readout.
+    * estimates the hash cost as sizeof(PagetableEntry), which is good enough
+    * for our purpose. Also count an extra Pointer per entry for the arrays
+    * created during iteration readout.
     */
    nbuckets = maxbytes /
-       (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
-        + sizeof(Pointer) + sizeof(Pointer));
+       (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
    nbuckets = Min(nbuckets, INT_MAX - 1);  /* safety limit */
    nbuckets = Max(nbuckets, 16);           /* sanity limit */
    tbm->maxentries = (int) nbuckets;
+   tbm->lossify_start = 0;

    return tbm;
}
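
To put rough numbers on the estimate (illustrative only; the real value of
sizeof(PagetableEntry) depends on BLCKSZ and the width of bitmapword): if
sizeof(PagetableEntry) were 48 bytes and sizeof(Pointer) 8 bytes, the per-entry
charge is 48 + 8 + 8 = 64 bytes, so maxbytes = 4 MB (4194304 bytes) gives
maxentries = 65536 before tbm_lossify has to start converting exact pages into
lossy chunks.
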
@@ -212,32 +241,25 @@ tbm_create(long maxbytes)
static void
tbm_create_pagetable(TIDBitmap *tbm)
{
-   HASHCTL     hash_ctl;
-
    Assert(tbm->status != TBM_HASH);
    Assert(tbm->pagetable == NULL);

-   /* Create the hashtable proper */
-   MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-   hash_ctl.keysize = sizeof(BlockNumber);
-   hash_ctl.entrysize = sizeof(PagetableEntry);
-   hash_ctl.hcxt = tbm->mcxt;
-   tbm->pagetable = hash_create("TIDBitmap",
-                                128,   /* start small and extend */
-                                &hash_ctl,
-                                HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+   tbm->pagetable = pagetable_create(tbm->mcxt, 128);

    /* If entry1 is valid, push it into the hashtable */
    if (tbm->status == TBM_ONE_PAGE)
    {
        PagetableEntry *page;
        bool        found;
+       char        oldstatus;

-       page = (PagetableEntry *) hash_search(tbm->pagetable,
-                                             (void *) &tbm->entry1.blockno,
-                                             HASH_ENTER, &found);
+       page = pagetable_insert(tbm->pagetable,
+                               tbm->entry1.blockno,
+                               &found);
        Assert(!found);
+       oldstatus = page->status;
        memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
+       page->status = oldstatus;
    }

    tbm->status = TBM_HASH;
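
The oldstatus save/restore above is needed because simplehash keeps each
entry's bookkeeping byte inside the element itself (the new PagetableEntry.status
field), so a wholesale memcpy or MemSet of an entry would corrupt the table's
state. The pattern in isolation, as a sketch assuming the declarations in this
file (not a helper added by the patch):

    /* Sketch: replace an entry's payload without touching simplehash state. */
    static void
    sketch_overwrite_entry(PagetableEntry *dst, const PagetableEntry *src)
    {
        char    oldstatus = dst->status;    /* owned by simplehash */

        memcpy(dst, src, sizeof(PagetableEntry));
        dst->status = oldstatus;            /* put the bookkeeping byte back */
    }
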
tbm_free(TIDBitmap *tbm)
{
    if (tbm->pagetable)
-       hash_destroy(tbm->pagetable);
+       pagetable_destroy(tbm->pagetable);
    if (tbm->spages)
        pfree(tbm->spages);
    if (tbm->schunks)
@@ -357,12 +379,12 @@ tbm_union(TIDBitmap *a, const TIDBitmap *b)
        tbm_union_page(a, &b->entry1);
    else
    {
-       HASH_SEQ_STATUS status;
+       pagetable_iterator i;
        PagetableEntry *bpage;

        Assert(b->status == TBM_HASH);
-       hash_seq_init(&status, b->pagetable);
-       while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+       pagetable_start_iterate(b->pagetable, &i);
+       while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
            tbm_union_page(a, bpage);
    }
}
@@ -449,12 +471,12 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
    }
    else
    {
-       HASH_SEQ_STATUS status;
+       pagetable_iterator i;
        PagetableEntry *apage;

        Assert(a->status == TBM_HASH);
-       hash_seq_init(&status, a->pagetable);
-       while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+       pagetable_start_iterate(a->pagetable, &i);
+       while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
        {
            if (tbm_intersect_page(a, apage, b))
            {
@@ -464,9 +486,7 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
                else
                    a->npages--;
                a->nentries--;
-               if (hash_search(a->pagetable,
-                               (void *) &apage->blockno,
-                               HASH_REMOVE, NULL) == NULL)
+               if (!pagetable_delete(a->pagetable, apage->blockno))
                    elog(ERROR, "hash table corrupted");
            }
        }
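
The same mechanical substitution is applied throughout the rest of the file;
the correspondence, read off the hunks in this patch, is:

    /*
     * dynahash call                                simplehash replacement
     * -------------------------------------------  -------------------------------------------
     * hash_search(pt, &key, HASH_FIND, NULL)       pagetable_lookup(pt, key)
     * hash_search(pt, &key, HASH_ENTER, &found)    pagetable_insert(pt, key, &found)
     * hash_search(pt, &key, HASH_REMOVE, NULL)     pagetable_delete(pt, key)  (returns bool)
     * hash_seq_init / hash_seq_search              pagetable_start_iterate / pagetable_iterate
     * hash_seq_term                                no equivalent call appears in this patch
     */
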
@@ -606,7 +626,7 @@ tbm_begin_iterate(TIDBitmap *tbm)
     */
    if (tbm->status == TBM_HASH && !tbm->iterating)
    {
-       HASH_SEQ_STATUS status;
+       pagetable_iterator i;
        PagetableEntry *page;
        int         npages;
        int         nchunks;
@@ -620,9 +640,9 @@ tbm_begin_iterate(TIDBitmap *tbm)
                MemoryContextAlloc(tbm->mcxt,
                                   tbm->nchunks * sizeof(PagetableEntry *));

-       hash_seq_init(&status, tbm->pagetable);
        npages = nchunks = 0;
-       while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+       pagetable_start_iterate(tbm->pagetable, &i);
+       while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
        {
            if (page->ischunk)
                tbm->schunks[nchunks++] = page;
@@ -791,9 +811,7 @@ tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
        return page;
    }

-   page = (PagetableEntry *) hash_search(tbm->pagetable,
-                                         (void *) &pageno,
-                                         HASH_FIND, NULL);
+   page = pagetable_lookup(tbm->pagetable, pageno);
    if (page == NULL)
        return NULL;
    if (page->ischunk)
@@ -834,15 +852,16 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
        }

        /* Look up or create an entry */
-       page = (PagetableEntry *) hash_search(tbm->pagetable,
-                                             (void *) &pageno,
-                                             HASH_ENTER, &found);
+       page = pagetable_insert(tbm->pagetable, pageno, &found);
    }

    /* Initialize it if not present before */
    if (!found)
    {
+       char        oldstatus = page->status;
+
        MemSet(page, 0, sizeof(PagetableEntry));
+       page->status = oldstatus;
        page->blockno = pageno;
        /* must count it too */
        tbm->nentries++;
@@ -869,9 +888,9 @@ tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)

    bitno = pageno % PAGES_PER_CHUNK;
    chunk_pageno = pageno - bitno;
-   page = (PagetableEntry *) hash_search(tbm->pagetable,
-                                         (void *) &chunk_pageno,
-                                         HASH_FIND, NULL);
+
+   page = pagetable_lookup(tbm->pagetable, chunk_pageno);
+
    if (page != NULL && page->ischunk)
    {
        int         wordnum = WORDNUM(bitno);
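
A concrete example of the chunk addressing above, with PAGES_PER_CHUNK = 256
(the 8K-block case): for pageno = 1000, bitno = 1000 % 256 = 232 and
chunk_pageno = 1000 - 232 = 768, so the lossy bit for page 1000 is bit 232 of
the chunk entry whose hashtable key is block 768.
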
@@ -912,9 +931,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
     */
    if (bitno != 0)
    {
-       if (hash_search(tbm->pagetable,
-                       (void *) &pageno,
-                       HASH_REMOVE, NULL) != NULL)
+       if (pagetable_delete(tbm->pagetable, pageno))
        {
            /* It was present, so adjust counts */
            tbm->nentries--;
@@ -923,14 +940,15 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
    }

    /* Look up or create entry for chunk-header page */
-   page = (PagetableEntry *) hash_search(tbm->pagetable,
-                                         (void *) &chunk_pageno,
-                                         HASH_ENTER, &found);
+   page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);

    /* Initialize it if not present before */
    if (!found)
    {
+       char        oldstatus = page->status;
+
        MemSet(page, 0, sizeof(PagetableEntry));
+       page->status = oldstatus;
        page->blockno = chunk_pageno;
        page->ischunk = true;
        /* must count it too */
@@ -939,8 +957,11 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
    }
    else if (!page->ischunk)
    {
+       char        oldstatus = page->status;
+
        /* chunk header page was formerly non-lossy, make it lossy */
        MemSet(page, 0, sizeof(PagetableEntry));
+       page->status = oldstatus;
        page->blockno = chunk_pageno;
        page->ischunk = true;
        /* we assume it had some tuple bit(s) set, so mark it lossy */
@@ -962,7 +983,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
static void
tbm_lossify(TIDBitmap *tbm)
{
-   HASH_SEQ_STATUS status;
+   pagetable_iterator i;
    PagetableEntry *page;

    /*
@@ -977,8 +998,8 @@ tbm_lossify(TIDBitmap *tbm)
    Assert(!tbm->iterating);
    Assert(tbm->status == TBM_HASH);

-   hash_seq_init(&status, tbm->pagetable);
-   while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+   pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
+   while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
    {
        if (page->ischunk)
            continue;           /* already a chunk header */
@@ -995,15 +1016,19 @@ tbm_lossify(TIDBitmap *tbm)

        if (tbm->nentries <= tbm->maxentries / 2)
        {
-           /* we have done enough */
-           hash_seq_term(&status);
+           /*
+            * We have made enough room. Remember where to start lossifying
+            * next round, so we evenly iterate over the hashtable.
+            */
+           tbm->lossify_start = i.cur;
            break;
        }

        /*
         * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
-        * hashtable. We can continue the same seq_search scan since we do
-        * not care whether we visit lossy chunks or not.
+        * hashtable and may have deleted the non-lossy chunk. We can
+        * continue the same hash table scan, since failure to visit one
+        * element or visiting the newly inserted element isn't fatal.
         */
    }
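
The effect of the new lossify_start bookkeeping is that successive tbm_lossify
calls sweep different parts of the table instead of repeatedly lossifying
whatever happens to sit in the first buckets. Reduced to its two essential
lines, a sketch of the loop above (assuming i.cur is the iterator's current
bucket position, as its use here suggests):

    /* Sketch: resume each lossification pass where the previous one stopped. */
    pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
    /* ... convert exact page entries to lossy chunks until under the limit ... */
    tbm->lossify_start = i.cur;     /* the next call starts from here */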