Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 75ae538

Browse files
committed
Use more efficient hashtable for tidbitmap.c to speed up bitmap scans.
Use the new simplehash.h to speed up tidbitmap.c uses. For bitmap scan heavy queries speedups of over 100% have been measured. Both lossy and exact scans benefit, but the wins are bigger for mostly exact scans. The conversion is mostly trivial, except that tbm_lossify() now restarts lossifying at the point it previously stopped. Otherwise the hash table becomes unbalanced because the scan in done in hash-order, leaving the end of the hashtable more densely filled then the beginning. That caused performance issues with dynahash as well, but due to the open chaining they were less pronounced than with the linear adressing from simplehash.h. Reviewed-By: Tomas Vondra Discussion: <20160727004333.r3e2k2y6fvk2ntup@alap3.anarazel.de>
1 parent b30d3ea commit 75ae538

File tree

1 file changed

+97
-72
lines changed

1 file changed

+97
-72
lines changed

src/backend/nodes/tidbitmap.c

Lines changed: 97 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@
4343
#include "access/htup_details.h"
4444
#include "nodes/bitmapset.h"
4545
#include "nodes/tidbitmap.h"
46-
#include "utils/hsearch.h"
4746

4847
/*
4948
* The maximum number of tuples per page is not large (typically 256 with
@@ -61,12 +60,12 @@
6160
* for that page in the page table.
6261
*
6362
* We actually store both exact pages and lossy chunks in the same hash
64-
* table, using identical data structures. (This is because dynahash.c's
65-
* memory management doesn't allow space to be transferred easily from one
66-
* hashtable to another.) Therefore it's best if PAGES_PER_CHUNK is the
67-
* same as MAX_TUPLES_PER_PAGE, or at least not too different. But we
68-
* also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
69-
* remainder operations. So, define it like this:
63+
* table, using identical data structures. (This is because the memory
64+
* management for hashtables doesn't easily/efficiently allow space to be
65+
* transferred easily from one hashtable to another.) Therefore it's best
66+
* if PAGES_PER_CHUNK is the same as MAX_TUPLES_PER_PAGE, or at least not
67+
* too different. But we also want PAGES_PER_CHUNK to be a power of 2 to
68+
* avoid expensive integer remainder operations. So, define it like this:
7069
*/
7170
#define PAGES_PER_CHUNK (BLCKSZ / 32)
7271

@@ -97,21 +96,22 @@
9796
typedef struct PagetableEntry
9897
{
9998
BlockNumber blockno; /* page number (hashtable key) */
99+
char status; /* hash entry status */
100100
bool ischunk; /* T = lossy storage, F = exact */
101101
bool recheck; /* should the tuples be rechecked? */
102102
bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
103103
} PagetableEntry;
104104

105105
/*
106-
* dynahash.c is optimized for relatively large, long-lived hash tables.
107-
* This is not ideal for TIDBitMap, particularly when we are using a bitmap
108-
* scan on the inside of a nestloop join: a bitmap may well live only long
109-
* enough to accumulate one entry in such cases. We therefore avoid creating
110-
* an actual hashtable until we need two pagetable entries. When just one
111-
* pagetable entry is needed, we store it in a fixed field of TIDBitMap.
112-
* (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down
113-
* to zero or one page again. So, status can be TBM_HASH even when nentries
114-
* is zero or one.)
106+
* We want to avoid the overhead of creating the hashtable, which is
107+
* comparatively large, when not necessary. Particularly when we are using a
108+
* bitmap scan on the inside of a nestloop join: a bitmap may well live only
109+
* long enough to accumulate one entry in such cases. We therefore avoid
110+
* creating an actual hashtable until we need two pagetable entries. When
111+
* just one pagetable entry is needed, we store it in a fixed field of
112+
* TIDBitMap. (NOTE: we don't get rid of the hashtable if the bitmap later
113+
* shrinks down to zero or one page again. So, status can be TBM_HASH even
114+
* when nentries is zero or one.)
115115
*/
116116
typedef enum
117117
{
@@ -128,12 +128,13 @@ struct TIDBitmap
128128
NodeTag type; /* to make it a valid Node */
129129
MemoryContext mcxt; /* memory context containing me */
130130
TBMStatus status; /* see codes above */
131-
HTAB *pagetable; /* hash table of PagetableEntry's */
131+
struct pagetable_hash *pagetable; /* hash table of PagetableEntry's */
132132
int nentries; /* number of entries in pagetable */
133133
int maxentries; /* limit on same to meet maxbytes */
134134
int npages; /* number of exact entries in pagetable */
135135
int nchunks; /* number of lossy entries in pagetable */
136136
bool iterating; /* tbm_begin_iterate called? */
137+
uint32 lossify_start; /* offset to start lossifying hashtable at */
137138
PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
138139
/* these are valid when iterating is true: */
139140
PagetableEntry **spages; /* sorted exact-page list, or NULL */
@@ -168,6 +169,35 @@ static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
168169
static void tbm_lossify(TIDBitmap *tbm);
169170
static int tbm_comparator(const void *left, const void *right);
170171

172+
/*
173+
* Simple inline murmur hash implementation for the exact width required, for
174+
* performance.
175+
*/
176+
static inline uint32
177+
hash_blockno(BlockNumber b)
178+
{
179+
uint32 h = b;
180+
181+
h ^= h >> 16;
182+
h *= 0x85ebca6b;
183+
h ^= h >> 13;
184+
h *= 0xc2b2ae35;
185+
h ^= h >> 16;
186+
return h;
187+
}
188+
189+
/* define hashtable mapping block numbers to PagetableEntry's */
190+
#define SH_PREFIX pagetable
191+
#define SH_ELEMENT_TYPE PagetableEntry
192+
#define SH_KEY_TYPE BlockNumber
193+
#define SH_KEY blockno
194+
#define SH_HASH_KEY(tb, key) hash_blockno(key)
195+
#define SH_EQUAL(tb, a, b) a == b
196+
#define SH_SCOPE static inline
197+
#define SH_DEFINE
198+
#define SH_DECLARE
199+
#include "lib/simplehash.h"
200+
171201

172202
/*
173203
* tbm_create - create an initially-empty bitmap
@@ -190,17 +220,16 @@ tbm_create(long maxbytes)
190220

191221
/*
192222
* Estimate number of hashtable entries we can have within maxbytes. This
193-
* estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a
194-
* pointer per hash entry, which is crude but good enough for our purpose.
195-
* Also count an extra Pointer per entry for the arrays created during
196-
* iteration readout.
223+
* estimates the hash cost as sizeof(PagetableEntry), which is good enough
224+
* for our purpose. Also count an extra Pointer per entry for the arrays
225+
* created during iteration readout.
197226
*/
198227
nbuckets = maxbytes /
199-
(MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
200-
+ sizeof(Pointer) + sizeof(Pointer));
228+
(sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer));
201229
nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */
202230
nbuckets = Max(nbuckets, 16); /* sanity limit */
203231
tbm->maxentries = (int) nbuckets;
232+
tbm->lossify_start = 0;
204233

205234
return tbm;
206235
}
@@ -212,32 +241,25 @@ tbm_create(long maxbytes)
212241
static void
213242
tbm_create_pagetable(TIDBitmap *tbm)
214243
{
215-
HASHCTL hash_ctl;
216-
217244
Assert(tbm->status != TBM_HASH);
218245
Assert(tbm->pagetable == NULL);
219246

220-
/* Create the hashtable proper */
221-
MemSet(&hash_ctl, 0, sizeof(hash_ctl));
222-
hash_ctl.keysize = sizeof(BlockNumber);
223-
hash_ctl.entrysize = sizeof(PagetableEntry);
224-
hash_ctl.hcxt = tbm->mcxt;
225-
tbm->pagetable = hash_create("TIDBitmap",
226-
128, /* start small and extend */
227-
&hash_ctl,
228-
HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
247+
tbm->pagetable = pagetable_create(tbm->mcxt, 128);
229248

230249
/* If entry1 is valid, push it into the hashtable */
231250
if (tbm->status == TBM_ONE_PAGE)
232251
{
233252
PagetableEntry *page;
234253
bool found;
254+
char oldstatus;
235255

236-
page = (PagetableEntry *) hash_search(tbm->pagetable,
237-
(void *) &tbm->entry1.blockno,
238-
HASH_ENTER, &found);
256+
page = pagetable_insert(tbm->pagetable,
257+
tbm->entry1.blockno,
258+
&found);
239259
Assert(!found);
260+
oldstatus = page->status;
240261
memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
262+
page->status = oldstatus;
241263
}
242264

243265
tbm->status = TBM_HASH;
@@ -250,7 +272,7 @@ void
250272
tbm_free(TIDBitmap *tbm)
251273
{
252274
if (tbm->pagetable)
253-
hash_destroy(tbm->pagetable);
275+
pagetable_destroy(tbm->pagetable);
254276
if (tbm->spages)
255277
pfree(tbm->spages);
256278
if (tbm->schunks)
@@ -357,12 +379,12 @@ tbm_union(TIDBitmap *a, const TIDBitmap *b)
357379
tbm_union_page(a, &b->entry1);
358380
else
359381
{
360-
HASH_SEQ_STATUS status;
382+
pagetable_iterator i;
361383
PagetableEntry *bpage;
362384

363385
Assert(b->status == TBM_HASH);
364-
hash_seq_init(&status, b->pagetable);
365-
while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
386+
pagetable_start_iterate(b->pagetable, &i);
387+
while ((bpage = pagetable_iterate(b->pagetable, &i)) != NULL)
366388
tbm_union_page(a, bpage);
367389
}
368390
}
@@ -449,12 +471,12 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
449471
}
450472
else
451473
{
452-
HASH_SEQ_STATUS status;
474+
pagetable_iterator i;
453475
PagetableEntry *apage;
454476

455477
Assert(a->status == TBM_HASH);
456-
hash_seq_init(&status, a->pagetable);
457-
while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
478+
pagetable_start_iterate(a->pagetable, &i);
479+
while ((apage = pagetable_iterate(a->pagetable, &i)) != NULL)
458480
{
459481
if (tbm_intersect_page(a, apage, b))
460482
{
@@ -464,9 +486,7 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
464486
else
465487
a->npages--;
466488
a->nentries--;
467-
if (hash_search(a->pagetable,
468-
(void *) &apage->blockno,
469-
HASH_REMOVE, NULL) == NULL)
489+
if (!pagetable_delete(a->pagetable, apage->blockno))
470490
elog(ERROR, "hash table corrupted");
471491
}
472492
}
@@ -606,7 +626,7 @@ tbm_begin_iterate(TIDBitmap *tbm)
606626
*/
607627
if (tbm->status == TBM_HASH && !tbm->iterating)
608628
{
609-
HASH_SEQ_STATUS status;
629+
pagetable_iterator i;
610630
PagetableEntry *page;
611631
int npages;
612632
int nchunks;
@@ -620,9 +640,9 @@ tbm_begin_iterate(TIDBitmap *tbm)
620640
MemoryContextAlloc(tbm->mcxt,
621641
tbm->nchunks * sizeof(PagetableEntry *));
622642

623-
hash_seq_init(&status, tbm->pagetable);
624643
npages = nchunks = 0;
625-
while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
644+
pagetable_start_iterate(tbm->pagetable, &i);
645+
while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
626646
{
627647
if (page->ischunk)
628648
tbm->schunks[nchunks++] = page;
@@ -791,9 +811,7 @@ tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
791811
return page;
792812
}
793813

794-
page = (PagetableEntry *) hash_search(tbm->pagetable,
795-
(void *) &pageno,
796-
HASH_FIND, NULL);
814+
page = pagetable_lookup(tbm->pagetable, pageno);
797815
if (page == NULL)
798816
return NULL;
799817
if (page->ischunk)
@@ -834,15 +852,16 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
834852
}
835853

836854
/* Look up or create an entry */
837-
page = (PagetableEntry *) hash_search(tbm->pagetable,
838-
(void *) &pageno,
839-
HASH_ENTER, &found);
855+
page = pagetable_insert(tbm->pagetable, pageno, &found);
840856
}
841857

842858
/* Initialize it if not present before */
843859
if (!found)
844860
{
861+
char oldstatus = page->status;
862+
845863
MemSet(page, 0, sizeof(PagetableEntry));
864+
page->status = oldstatus;
846865
page->blockno = pageno;
847866
/* must count it too */
848867
tbm->nentries++;
@@ -869,9 +888,9 @@ tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
869888

870889
bitno = pageno % PAGES_PER_CHUNK;
871890
chunk_pageno = pageno - bitno;
872-
page = (PagetableEntry *) hash_search(tbm->pagetable,
873-
(void *) &chunk_pageno,
874-
HASH_FIND, NULL);
891+
892+
page = pagetable_lookup(tbm->pagetable, chunk_pageno);
893+
875894
if (page != NULL && page->ischunk)
876895
{
877896
int wordnum = WORDNUM(bitno);
@@ -912,9 +931,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
912931
*/
913932
if (bitno != 0)
914933
{
915-
if (hash_search(tbm->pagetable,
916-
(void *) &pageno,
917-
HASH_REMOVE, NULL) != NULL)
934+
if (pagetable_delete(tbm->pagetable, pageno))
918935
{
919936
/* It was present, so adjust counts */
920937
tbm->nentries--;
@@ -923,14 +940,15 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
923940
}
924941

925942
/* Look up or create entry for chunk-header page */
926-
page = (PagetableEntry *) hash_search(tbm->pagetable,
927-
(void *) &chunk_pageno,
928-
HASH_ENTER, &found);
943+
page = pagetable_insert(tbm->pagetable, chunk_pageno, &found);
929944

930945
/* Initialize it if not present before */
931946
if (!found)
932947
{
948+
char oldstatus = page->status;
949+
933950
MemSet(page, 0, sizeof(PagetableEntry));
951+
page->status = oldstatus;
934952
page->blockno = chunk_pageno;
935953
page->ischunk = true;
936954
/* must count it too */
@@ -939,8 +957,11 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
939957
}
940958
else if (!page->ischunk)
941959
{
960+
char oldstatus = page->status;
961+
942962
/* chunk header page was formerly non-lossy, make it lossy */
943963
MemSet(page, 0, sizeof(PagetableEntry));
964+
page->status = oldstatus;
944965
page->blockno = chunk_pageno;
945966
page->ischunk = true;
946967
/* we assume it had some tuple bit(s) set, so mark it lossy */
@@ -962,7 +983,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
962983
static void
963984
tbm_lossify(TIDBitmap *tbm)
964985
{
965-
HASH_SEQ_STATUS status;
986+
pagetable_iterator i;
966987
PagetableEntry *page;
967988

968989
/*
@@ -977,8 +998,8 @@ tbm_lossify(TIDBitmap *tbm)
977998
Assert(!tbm->iterating);
978999
Assert(tbm->status == TBM_HASH);
9791000

980-
hash_seq_init(&status, tbm->pagetable);
981-
while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
1001+
pagetable_start_iterate_at(tbm->pagetable, &i, tbm->lossify_start);
1002+
while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
9821003
{
9831004
if (page->ischunk)
9841005
continue; /* already a chunk header */
@@ -995,15 +1016,19 @@ tbm_lossify(TIDBitmap *tbm)
9951016

9961017
if (tbm->nentries <= tbm->maxentries / 2)
9971018
{
998-
/* we have done enough */
999-
hash_seq_term(&status);
1019+
/*
1020+
* We have made enough room. Remember where to start lossifying
1021+
* next round, so we evenly iterate over the hashtable.
1022+
*/
1023+
tbm->lossify_start = i.cur;
10001024
break;
10011025
}
10021026

10031027
/*
10041028
* Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
1005-
* hashtable. We can continue the same seq_search scan since we do
1006-
* not care whether we visit lossy chunks or not.
1029+
* hashtable and may have deleted the non-lossy chunk. We can
1030+
* continue the same hash table scan, since failure to visit one
1031+
* element or visiting the newly inserted element, isn't fatal.
10071032
*/
10081033
}
10091034

0 commit comments

Comments
 (0)