
Commit fe46124

Rahila authored and Commitfest Bot committed
Improve accounting for memory used by shared hash tables
pg_shmem_allocations tracks the memory allocated by ShmemInitStruct(), but
for shared hash tables that covered only the header and hash directory. The
remaining parts (segments and buckets) were allocated later using
ShmemAlloc(), which does not update the shmem accounting. Thus, these
allocations were not shown in pg_shmem_allocations.

This commit improves the situation by allocating all the hash table parts
at once, using a single ShmemInitStruct() call. This way the ShmemIndex
entries (and thus pg_shmem_allocations) better reflect the proper size of
the hash table.

This does not change anything for non-shared hash tables.

This changes the alignment a bit. ShmemAlloc() aligns the chunks using
CACHELINEALIGN(), which means some parts (header, directory, segments) were
aligned this way. Allocating all parts as a single chunk removes this
(implicit) alignment. We've considered adding explicit alignment, but we've
decided not to - it seems to be merely a coincidence due to using the
ShmemAlloc() API, not due to necessity.

Author: Rahila Syed <rahilasyed90@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Reviewed-by: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/CAH2L28vHzRankszhqz7deXURxKncxfirnuW68zD7+hVAqaS5GQ@mail.gmail.com
1 parent 3c4d755 commit fe46124
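
For orientation, a sketch of the resulting layout, derived from the offset macros and the new hash_get_shared_size() in the diff below (not part of the commit itself): a single ShmemInitStruct() chunk now covers, in order,

    [ HASHHDR header                                 ]
    [ directory: dsize * sizeof(HASHSEGMENT)         ]
    [ segments:  nsegs * ssize * sizeof(HASHBUCKET)  ]
    [ elements:  nelem * HASH_ELEMENT_SIZE()         ]

whereas previously only the header and directory came from ShmemInitStruct(), with the remaining parts coming from later, unaccounted ShmemAlloc() calls.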

File tree

3 files changed, +198 −51 lines changed

src/backend/storage/ipc/shmem.c
Lines changed: 3 additions & 1 deletion

@@ -74,6 +74,7 @@
 #include "storage/shmem.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
+#include "utils/dynahash.h"
 
 static void *ShmemAllocRaw(Size size, Size *allocated_size);
 
@@ -351,7 +352,8 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
 
 	/* look it up in the shmem index */
 	location = ShmemInitStruct(name,
-							   hash_get_shared_size(infoP, hash_flags),
+							   hash_get_shared_size(infoP, hash_flags,
+													init_size),
 							   &found);
 
 	/*

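For context, a minimal sketch of a typical ShmemInitHash() caller; the calling convention is unchanged by this commit, only the size requested from ShmemInitStruct() grows. The subsystem, type and table names below are hypothetical, not from this patch:

typedef struct MyEntry
{
	Oid			key;			/* hash key */
	int			counter;		/* payload */
} MyEntry;

static HTAB *MySharedTable;

static void
MySharedTableShmemInit(void)
{
	HASHCTL		info;

	info.keysize = sizeof(Oid);
	info.entrysize = sizeof(MyEntry);

	/* init_size (128) is what ShmemInitHash() now forwards to hash_get_shared_size() */
	MySharedTable = ShmemInitHash("My Shared Table",
								  128,	/* init_size */
								  1024, /* max_size */
								  &info,
								  HASH_ELEM | HASH_BLOBS);
}

With this commit, the single "My Shared Table" entry in pg_shmem_allocations covers the directory, segments and preallocated elements as well, not just the header and directory.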
src/backend/utils/hash/dynahash.c
Lines changed: 193 additions & 49 deletions

@@ -260,12 +260,39 @@ static long hash_accesses,
 			hash_expansions;
 #endif
 
+/* access to parts of the hash table, allocated as a single chunk */
+#define HASH_DIRECTORY_PTR(hashp) \
+	(((char *) (hashp)->hctl) + sizeof(HASHHDR))
+
+#define HASH_SEGMENT_OFFSET(hctl, idx) \
+	(sizeof(HASHHDR) + \
+	 ((hctl)->dsize * sizeof(HASHSEGMENT)) + \
+	 ((hctl)->ssize * (idx) * sizeof(HASHBUCKET)))
+
+#define HASH_SEGMENT_PTR(hashp, idx) \
+	((char *) (hashp)->hctl + HASH_SEGMENT_OFFSET((hashp)->hctl, (idx)))
+
+#define HASH_SEGMENT_SIZE(hashp) \
+	((hashp)->ssize * sizeof(HASHBUCKET))
+
+#define HASH_ELEMENTS_PTR(hashp, nsegs) \
+	((char *) (hashp)->hctl + HASH_SEGMENT_OFFSET((hashp)->hctl, nsegs))
+
+/* Each element has a HASHELEMENT header plus user data. */
+#define HASH_ELEMENT_SIZE(hctl) \
+	(MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN((hctl)->entrysize))
+
+#define HASH_ELEMENT_NEXT(hctl, num, ptr) \
+	((char *) (ptr) + ((num) * HASH_ELEMENT_SIZE(hctl)))
+
 /*
  * Private function prototypes
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
+static HASHELEMENT *element_alloc(HTAB *hashp, int nelem);
+static void element_add(HTAB *hashp, HASHELEMENT *firstElement,
+						int nelem, int freelist_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
 static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
@@ -280,6 +307,9 @@ static int next_pow2_int(long num);
 static void register_seq_scan(HTAB *hashp);
 static void deregister_seq_scan(HTAB *hashp);
 static bool has_seq_scans(HTAB *hashp);
+static void compute_buckets_and_segs(long nelem, long num_partitions,
+									 long ssize,
+									 int *nbuckets, int *nsegments);
 
 
 /*
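
To make the offset arithmetic of the new macros concrete, here is a small standalone toy model (not the real dynahash types; the header size and counts are invented for illustration):

#include <stdio.h>
#include <stddef.h>

/* Stand-ins sized like the real pointer types (illustration only). */
#define FAKE_HASHHDR_SIZE	512					/* pretend sizeof(HASHHDR) */
#define SEGMENT_PTR_SIZE	sizeof(void *)		/* stands in for sizeof(HASHSEGMENT) */
#define BUCKET_PTR_SIZE		sizeof(void *)		/* stands in for sizeof(HASHBUCKET) */

/* Mirrors HASH_SEGMENT_OFFSET(hctl, idx) from the diff above. */
static size_t
segment_offset(long dsize, long ssize, int idx)
{
	return FAKE_HASHHDR_SIZE
		+ (size_t) dsize * SEGMENT_PTR_SIZE
		+ (size_t) ssize * idx * BUCKET_PTR_SIZE;
}

int
main(void)
{
	long		dsize = 256;	/* directory entries */
	long		ssize = 256;	/* buckets per segment */
	int			nsegs = 4;		/* segments in use */

	printf("directory starts at offset %d\n", FAKE_HASHHDR_SIZE);
	for (int i = 0; i < nsegs; i++)
		printf("segment %d starts at offset %zu\n", i,
			   segment_offset(dsize, ssize, i));
	printf("elements start at offset %zu\n",
		   segment_offset(dsize, ssize, nsegs));
	return 0;
}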
@@ -569,12 +599,12 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
 		elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
 
 	/*
+	 * For a private hash table, preallocate the requested number of elements
+	 * if it's less than our chosen nelem_alloc. This avoids wasting space if
+	 * the caller correctly estimates a small table size.
+	 *
 	 * For a shared hash table, preallocate the requested number of elements.
 	 * This reduces problems with run-time out-of-shared-memory conditions.
-	 *
-	 * For a non-shared hash table, preallocate the requested number of
-	 * elements if it's less than our chosen nelem_alloc. This avoids wasting
-	 * space if the caller correctly estimates a small table size.
 	 */
 	if ((flags & HASH_SHARED_MEM) ||
 		nelem < hctl->nelem_alloc)
@@ -583,6 +613,7 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
 					freelist_partitions,
 					nelem_alloc,
 					nelem_alloc_first;
+		void	   *ptr = NULL;
 
 		/*
 		 * If hash table is partitioned, give each freelist an equal share of
@@ -607,14 +638,42 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
 		else
 			nelem_alloc_first = nelem_alloc;
 
+		/*
+		 * For a shared hash table, calculate the offset at which to find the
+		 * first partition of elements. We have to skip space for the header,
+		 * segments and buckets.
+		 */
+		if (hashp->isshared)
+			ptr = HASH_ELEMENTS_PTR(hashp, hctl->nsegs);
+
 		for (i = 0; i < freelist_partitions; i++)
 		{
 			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
 
-			if (!element_alloc(hashp, temp, i))
-				ereport(ERROR,
-						(errcode(ERRCODE_OUT_OF_MEMORY),
-						 errmsg("out of memory")));
+			/*
+			 * Assign the correct location of each partition within a
+			 * pre-allocated buffer.
+			 *
+			 * Actual memory allocation happens in ShmemInitHash for shared
+			 * hash tables.
+			 *
+			 * We just need to split that allocation into per-batch freelists.
+			 */
+			if (hashp->isshared)
+			{
+				element_add(hashp, (HASHELEMENT *) ptr, temp, i);
+				ptr = HASH_ELEMENT_NEXT(hctl, temp, ptr);
+			}
+			else
+			{
+				HASHELEMENT *firstElement = element_alloc(hashp, temp);
+
+				if (!firstElement)
+					ereport(ERROR,
+							(errcode(ERRCODE_OUT_OF_MEMORY),
+							 errmsg("out of memory")));
+				element_add(hashp, firstElement, temp, i);
+			}
 		}
 	}
@@ -703,29 +762,16 @@ init_htab(HTAB *hashp, long nelem)
 			SpinLockInit(&(hctl->freeList[i].mutex));
 
 	/*
-	 * Allocate space for the next greater power of two number of buckets,
-	 * assuming a desired maximum load factor of 1.
+	 * We've already calculated these parameters when we calculated how much
+	 * space to allocate in hash_get_shared_size(). Be careful to keep these
+	 * two places in sync, so that we get the same parameters.
 	 */
-	nbuckets = next_pow2_int(nelem);
-
-	/*
-	 * In a partitioned table, nbuckets must be at least equal to
-	 * num_partitions; were it less, keys with apparently different partition
-	 * numbers would map to the same bucket, breaking partition independence.
-	 * (Normally nbuckets will be much bigger; this is just a safety check.)
-	 */
-	while (nbuckets < hctl->num_partitions)
-		nbuckets <<= 1;
+	compute_buckets_and_segs(nelem, hctl->num_partitions, hctl->ssize,
+							 &nbuckets, &nsegs);
 
 	hctl->max_bucket = hctl->low_mask = nbuckets - 1;
 	hctl->high_mask = (nbuckets << 1) - 1;
 
-	/*
-	 * Figure number of directory segments needed, round up to a power of 2
-	 */
-	nsegs = (nbuckets - 1) / hctl->ssize + 1;
-	nsegs = next_pow2_int(nsegs);
-
 	/*
 	 * Make sure directory is big enough. If pre-allocated directory is too
 	 * small, choke (caller screwed up).
@@ -749,12 +795,22 @@ init_htab(HTAB *hashp, long nelem)
 	}
 
 	/* Allocate initial segments */
+	i = 0;
 	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
 	{
-		*segp = seg_alloc(hashp);
-		if (*segp == NULL)
-			return false;
+		/* Assign initial segments, which are also pre-allocated */
+		if (hashp->isshared)
+		{
+			*segp = (HASHSEGMENT) HASH_SEGMENT_PTR(hashp, i++);
+			MemSet(*segp, 0, HASH_SEGMENT_SIZE(hashp));
+		}
+		else
+		{
+			*segp = seg_alloc(hashp);
+			i++;
+		}
 	}
+	Assert(i == nsegs);
 
 	/* Choose number of entries to allocate at a time */
 	hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
@@ -847,16 +903,60 @@ hash_select_dirsize(long num_entries)
 }
 
 /*
- * Compute the required initial memory allocation for a shared-memory
- * hashtable with the given parameters. We need space for the HASHHDR
- * and for the (non expansible) directory.
+ * hash_get_shared_size -- determine memory needed for a new shared dynamic hash table
+ *
+ * info: hash table parameters
+ * flags: bitmask indicating which parameters to take from *info
+ * nelem: maximum number of elements expected
+ *
+ * Compute the required initial memory allocation for a hashtable with the given
+ * parameters. We need space for the HASHHDR, for the directory, segments and
+ * preallocated elements.
+ *
+ * For shared hash tables the directory size is non-expansible, and we
+ * preallocate all elements (nelem).
  */
 Size
-hash_get_shared_size(HASHCTL *info, int flags)
+hash_get_shared_size(const HASHCTL *info, int flags, long nelem)
 {
+	int			nbuckets;
+	int			nsegs;
+	int			num_partitions;
+	long		ssize;
+	long		dsize;
+	Size		elementSize = HASH_ELEMENT_SIZE(info);
+
+#ifdef USE_ASSERT_CHECKING
+	/* shared hash tables have a non-expansible directory */
+	Assert(flags & HASH_SHARED_MEM);
 	Assert(flags & HASH_DIRSIZE);
 	Assert(info->dsize == info->max_dsize);
-	return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
+#endif
+
+	dsize = info->dsize;
+
+	if (flags & HASH_SEGMENT)
+		ssize = info->ssize;
+	else
+		ssize = DEF_SEGSIZE;
+
+	if (flags & HASH_PARTITION)
+	{
+		num_partitions = info->num_partitions;
+
+		/* Number of entries should be at least equal to the freelists */
+		if (nelem < NUM_FREELISTS)
+			nelem = NUM_FREELISTS;
+	}
+	else
+		num_partitions = 0;
+
+	compute_buckets_and_segs(nelem, num_partitions, ssize,
+							 &nbuckets, &nsegs);
+
+	return sizeof(HASHHDR) + dsize * sizeof(HASHSEGMENT)
+		+ sizeof(HASHBUCKET) * ssize * nsegs
+		+ nelem * elementSize;
 }
 
 
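As a worked example of the new return value (all numbers are illustrative, not from the patch): on a 64-bit build with nelem = 1000, entrysize = 64, the default ssize of DEF_SEGSIZE (256) and dsize = 256, compute_buckets_and_segs() yields nbuckets = next_pow2_int(1000) = 1024 and nsegs = next_pow2_int((1024 - 1) / 256 + 1) = 4. The returned size is then sizeof(HASHHDR) + 256 * 8 bytes of directory, 4 * 256 * 8 bytes of buckets, and 1000 * (MAXALIGN(sizeof(HASHELEMENT)) + 64) bytes of elements, roughly 2 kB + 8 kB + 80 kB on top of the header if the HASHELEMENT header rounds to 16 bytes, all of which now appears under the table's single pg_shmem_allocations entry.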
@@ -1286,7 +1386,7 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
 		 * Failing because the needed element is in a different freelist is
 		 * not acceptable.
 		 */
-		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
+		if ((newElement = element_alloc(hashp, hctl->nelem_alloc)) == NULL)
 		{
 			int			borrow_from_idx;
 
@@ -1323,6 +1423,7 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
 			/* no elements available to borrow either, so out of memory */
 			return NULL;
 		}
+		element_add(hashp, newElement, hctl->nelem_alloc, freelist_idx);
 	}
 
 	/* remove entry from freelist, bump nentries */
@@ -1701,29 +1802,43 @@ seg_alloc(HTAB *hashp)
 }
 
 /*
- * allocate some new elements and link them into the indicated free list
+ * allocate some new elements
 */
-static bool
-element_alloc(HTAB *hashp, int nelem, int freelist_idx)
+static HASHELEMENT *
+element_alloc(HTAB *hashp, int nelem)
 {
 	HASHHDR    *hctl = hashp->hctl;
 	Size		elementSize;
-	HASHELEMENT *firstElement;
-	HASHELEMENT *tmpElement;
-	HASHELEMENT *prevElement;
-	int			i;
+	HASHELEMENT *firstElement = NULL;
 
 	if (hashp->isfixed)
-		return false;
+		return NULL;
 
 	/* Each element has a HASHELEMENT header plus user data. */
-	elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
-
+	elementSize = HASH_ELEMENT_SIZE(hctl);
 	CurrentDynaHashCxt = hashp->hcxt;
 	firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
 
 	if (!firstElement)
-		return false;
+		return NULL;
+
+	return firstElement;
+}
+
+/*
+ * link the elements allocated by element_alloc into the indicated free list
+ */
+static void
+element_add(HTAB *hashp, HASHELEMENT *firstElement, int nelem, int freelist_idx)
+{
+	HASHHDR    *hctl = hashp->hctl;
+	Size		elementSize;
+	HASHELEMENT *tmpElement;
+	HASHELEMENT *prevElement;
+	int			i;
+
+	/* Each element has a HASHELEMENT header plus user data. */
+	elementSize = HASH_ELEMENT_SIZE(hctl);
 
 	/* prepare to link all the new entries into the freelist */
 	prevElement = NULL;
@@ -1745,8 +1860,6 @@ element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 
 	if (IS_PARTITIONED(hctl))
 		SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
-
-	return true;
 }
 
 /*
@@ -1958,3 +2071,34 @@ AtEOSubXact_HashTables(bool isCommit, int nestDepth)
 		}
 	}
 }
+
+/*
+ * Calculate the number of buckets and segments to store the given
+ * number of elements in a hash table. Segments contain buckets which
+ * in turn contain elements.
+ */
+static void
+compute_buckets_and_segs(long nelem, long num_partitions, long ssize,
+						 int *nbuckets, int *nsegments)
+{
+	/*
+	 * Allocate space for the next greater power of two number of buckets,
+	 * assuming a desired maximum load factor of 1.
+	 */
+	*nbuckets = next_pow2_int(nelem);
+
+	/*
+	 * In a partitioned table, nbuckets must be at least equal to
+	 * num_partitions; were it less, keys with apparently different partition
+	 * numbers would map to the same bucket, breaking partition independence.
+	 * (Normally nbuckets will be much bigger; this is just a safety check.)
+	 */
+	while ((*nbuckets) < num_partitions)
+		(*nbuckets) <<= 1;
+
+	/*
+	 * Figure number of directory segments needed, round up to a power of 2
+	 */
+	*nsegments = ((*nbuckets) - 1) / ssize + 1;
+	*nsegments = next_pow2_int(*nsegments);
+}

src/include/utils/hsearch.h
Lines changed: 2 additions & 1 deletion

@@ -151,7 +151,8 @@ extern void hash_seq_term(HASH_SEQ_STATUS *status);
 extern void hash_freeze(HTAB *hashp);
 extern Size hash_estimate_size(long num_entries, Size entrysize);
 extern long hash_select_dirsize(long num_entries);
-extern Size hash_get_shared_size(HASHCTL *info, int flags);
+extern Size hash_get_shared_size(const HASHCTL *info, int flags,
+								 long nelem);
 extern void AtEOXact_HashTables(bool isCommit);
 extern void AtEOSubXact_HashTables(bool isCommit, int nestDepth);
 