Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit ab2324f

Browse files
committed
Improve comments about partitioned hash table freelists.
While I couldn't find any live bugs in commit 44ca402, the comments seemed pretty far from adequate; in particular it was not made plain that "borrowing" entries from other freelists is critical for correctness. Try to improve the commentary. A couple of very minor code style tweaks, as well. Discussion: https://postgr.es/m/10593.1500670709@sss.pgh.pa.us
1 parent 991c8b0 commit ab2324f

File tree

1 file changed

+70
-35
lines changed

1 file changed

+70
-35
lines changed

src/backend/utils/hash/dynahash.c

+70-35
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
* to hash_create. This prevents any attempt to split buckets on-the-fly.
1616
* Therefore, each hash bucket chain operates independently, and no fields
1717
* of the hash header change after init except nentries and freeList.
18-
* A partitioned table uses spinlocks to guard changes of those fields.
18+
* (A partitioned table uses multiple copies of those fields, guarded by
19+
* spinlocks, for additional concurrency.)
1920
* This lets any subset of the hash buckets be treated as a separately
2021
* lockable partition. We expect callers to use the low-order bits of a
2122
* lookup key's hash value as a partition number --- this will work because
@@ -121,15 +122,27 @@ typedef HASHELEMENT *HASHBUCKET;
121122
typedef HASHBUCKET *HASHSEGMENT;
122123

123124
/*
124-
* Using array of FreeListData instead of separate arrays of mutexes, nentries
125-
* and freeLists prevents, at least partially, sharing one cache line between
126-
* different mutexes (see below).
125+
* Per-freelist data.
126+
*
127+
* In a partitioned hash table, each freelist is associated with a specific
128+
* set of hashcodes, as determined by the FREELIST_IDX() macro below.
129+
* nentries tracks the number of live hashtable entries having those hashcodes
130+
* (NOT the number of entries in the freelist, as you might expect).
131+
*
132+
* The coverage of a freelist might be more or less than one partition, so it
133+
* needs its own lock rather than relying on caller locking. Relying on that
134+
* wouldn't work even if the coverage was the same, because of the occasional
135+
* need to "borrow" entries from another freelist; see get_hash_entry().
136+
*
137+
* Using an array of FreeListData instead of separate arrays of mutexes,
138+
* nentries and freeLists helps to reduce sharing of cache lines between
139+
* different mutexes.
127140
*/
128141
typedef struct
129142
{
130-
slock_t mutex; /* spinlock */
131-
long nentries; /* number of entries */
132-
HASHELEMENT *freeList; /* list of free elements */
143+
slock_t mutex; /* spinlock for this freelist */
144+
long nentries; /* number of entries in associated buckets */
145+
HASHELEMENT *freeList; /* chain of free elements */
133146
} FreeListData;
134147

135148
/*
@@ -143,12 +156,14 @@ typedef struct
143156
struct HASHHDR
144157
{
145158
/*
146-
* The freelist can become a point of contention on high-concurrency hash
147-
* tables, so we use an array of freelist, each with its own mutex and
148-
* nentries count, instead of just a single one.
159+
* The freelist can become a point of contention in high-concurrency hash
160+
* tables, so we use an array of freelists, each with its own mutex and
161+
* nentries count, instead of just a single one. Although the freelists
162+
* normally operate independently, we will scavenge entries from freelists
163+
* other than a hashcode's default freelist when necessary.
149164
*
150-
* If hash table is not partitioned only freeList[0] is used and spinlocks
151-
* are not used at all.
165+
* If the hash table is not partitioned, only freeList[0] is used and its
166+
* spinlock is not used at all; callers' locking is assumed sufficient.
152167
*/
153168
FreeListData freeList[NUM_FREELISTS];
154169

@@ -184,7 +199,7 @@ struct HASHHDR
184199
#define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
185200

186201
#define FREELIST_IDX(hctl, hashcode) \
187-
(IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
202+
(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
188203

189204
/*
190205
* Top control structure for a hashtable --- in a shared table, each backend
@@ -506,19 +521,22 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
506521
nelem_alloc_first;
507522

508523
/*
509-
* If hash table is partitioned all freeLists have equal number of
510-
* elements. Otherwise only freeList[0] is used.
524+
* If hash table is partitioned, give each freelist an equal share of
525+
* the initial allocation. Otherwise only freeList[0] is used.
511526
*/
512527
if (IS_PARTITIONED(hashp->hctl))
513528
freelist_partitions = NUM_FREELISTS;
514529
else
515530
freelist_partitions = 1;
516531

517532
nelem_alloc = nelem / freelist_partitions;
518-
if (nelem_alloc == 0)
533+
if (nelem_alloc <= 0)
519534
nelem_alloc = 1;
520535

521-
/* Make sure all memory will be used */
536+
/*
537+
* Make sure we'll allocate all the requested elements; freeList[0]
538+
* gets the excess if the request isn't divisible by NUM_FREELISTS.
539+
*/
522540
if (nelem_alloc * freelist_partitions < nelem)
523541
nelem_alloc_first =
524542
nelem - nelem_alloc * (freelist_partitions - 1);
@@ -620,7 +638,7 @@ init_htab(HTAB *hashp, long nelem)
620638
int i;
621639

622640
/*
623-
* initialize mutex if it's a partitioned table
641+
* initialize mutexes if it's a partitioned table
624642
*/
625643
if (IS_PARTITIONED(hctl))
626644
for (i = 0; i < NUM_FREELISTS; i++)
@@ -902,6 +920,7 @@ hash_search_with_hash_value(HTAB *hashp,
902920
bool *foundPtr)
903921
{
904922
HASHHDR *hctl = hashp->hctl;
923+
int freelist_idx = FREELIST_IDX(hctl, hashvalue);
905924
Size keysize;
906925
uint32 bucket;
907926
long segment_num;
@@ -910,7 +929,6 @@ hash_search_with_hash_value(HTAB *hashp,
910929
HASHBUCKET currBucket;
911930
HASHBUCKET *prevBucketPtr;
912931
HashCompareFunc match;
913-
int freelist_idx = FREELIST_IDX(hctl, hashvalue);
914932

915933
#if HASH_STATISTICS
916934
hash_accesses++;
@@ -993,13 +1011,14 @@ hash_search_with_hash_value(HTAB *hashp,
9931011
if (IS_PARTITIONED(hctl))
9941012
SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
9951013

1014+
/* delete the record from the appropriate nentries counter. */
9961015
Assert(hctl->freeList[freelist_idx].nentries > 0);
9971016
hctl->freeList[freelist_idx].nentries--;
9981017

9991018
/* remove record from hash bucket's chain. */
10001019
*prevBucketPtr = currBucket->link;
10011020

1002-
/* add the record to the freelist for this table. */
1021+
/* add the record to the appropriate freelist. */
10031022
currBucket->link = hctl->freeList[freelist_idx].freeList;
10041023
hctl->freeList[freelist_idx].freeList = currBucket;
10051024

@@ -1220,14 +1239,15 @@ hash_update_hash_key(HTAB *hashp,
12201239
}
12211240

12221241
/*
1223-
* create a new entry if possible
1242+
* Allocate a new hashtable entry if possible; return NULL if out of memory.
1243+
* (Or, if the underlying space allocator throws error for out-of-memory,
1244+
* we won't return at all.)
12241245
*/
12251246
static HASHBUCKET
12261247
get_hash_entry(HTAB *hashp, int freelist_idx)
12271248
{
12281249
HASHHDR *hctl = hashp->hctl;
12291250
HASHBUCKET newElement;
1230-
int borrow_from_idx;
12311251

12321252
for (;;)
12331253
{
@@ -1244,19 +1264,32 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
12441264
if (IS_PARTITIONED(hctl))
12451265
SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
12461266

1247-
/* no free elements. allocate another chunk of buckets */
1267+
/*
1268+
* No free elements in this freelist. In a partitioned table, there
1269+
* might be entries in other freelists, but to reduce contention we
1270+
* prefer to first try to get another chunk of buckets from the main
1271+
* shmem allocator. If that fails, though, we *MUST* root through all
1272+
* the other freelists before giving up. There are multiple callers
1273+
* that assume that they can allocate every element in the initially
1274+
* requested table size, or that deleting an element guarantees they
1275+
* can insert a new element, even if shared memory is entirely full.
1276+
* Failing because the needed element is in a different freelist is
1277+
* not acceptable.
1278+
*/
12481279
if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
12491280
{
1281+
int borrow_from_idx;
1282+
12501283
if (!IS_PARTITIONED(hctl))
12511284
return NULL; /* out of memory */
12521285

1253-
/* try to borrow element from another partition */
1286+
/* try to borrow element from another freelist */
12541287
borrow_from_idx = freelist_idx;
12551288
for (;;)
12561289
{
12571290
borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
12581291
if (borrow_from_idx == freelist_idx)
1259-
break;
1292+
break; /* examined all freelists, fail */
12601293

12611294
SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
12621295
newElement = hctl->freeList[borrow_from_idx].freeList;
@@ -1266,17 +1299,19 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
12661299
hctl->freeList[borrow_from_idx].freeList = newElement->link;
12671300
SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
12681301

1302+
/* careful: count the new element in its proper freelist */
12691303
SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
12701304
hctl->freeList[freelist_idx].nentries++;
12711305
SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
12721306

1273-
break;
1307+
return newElement;
12741308
}
12751309

12761310
SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
12771311
}
12781312

1279-
return newElement;
1313+
/* no elements available to borrow either, so out of memory */
1314+
return NULL;
12801315
}
12811316
}
12821317

@@ -1300,15 +1335,15 @@ hash_get_num_entries(HTAB *hashp)
13001335
long sum = hashp->hctl->freeList[0].nentries;
13011336

13021337
/*
1303-
* We currently don't bother with the mutex; it's only sensible to call
1304-
* this function if you've got lock on all partitions of the table.
1338+
* We currently don't bother with acquiring the mutexes; it's only
1339+
* sensible to call this function if you've got lock on all partitions of
1340+
* the table.
13051341
*/
1306-
1307-
if (!IS_PARTITIONED(hashp->hctl))
1308-
return sum;
1309-
1310-
for (i = 1; i < NUM_FREELISTS; i++)
1311-
sum += hashp->hctl->freeList[i].nentries;
1342+
if (IS_PARTITIONED(hashp->hctl))
1343+
{
1344+
for (i = 1; i < NUM_FREELISTS; i++)
1345+
sum += hashp->hctl->freeList[i].nentries;
1346+
}
13121347

13131348
return sum;
13141349
}

0 commit comments

Comments
 (0)