  * to hash_create. This prevents any attempt to split buckets on-the-fly.
  * Therefore, each hash bucket chain operates independently, and no fields
  * of the hash header change after init except nentries and freeList.
- * A partitioned table uses spinlocks to guard changes of those fields.
+ * (A partitioned table uses multiple copies of those fields, guarded by
+ * spinlocks, for additional concurrency.)
  * This lets any subset of the hash buckets be treated as a separately
  * lockable partition. We expect callers to use the low-order bits of a
  * lookup key's hash value as a partition number --- this will work because
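Note: the "low-order bits as partition number" convention above is implemented entirely on the caller's side. As a sketch only (the names below are hypothetical, not part of dynahash), a caller typically does something like:

    /*
     * Illustrative caller-side partition selection; NUM_MY_PARTITIONS is a
     * power of 2, so the modulo keeps just the low-order bits of the hash.
     */
    #define NUM_MY_PARTITIONS  16

    static inline int
    MyPartitionForHash(uint32 hashcode)
    {
        return (int) (hashcode % NUM_MY_PARTITIONS);
    }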
@@ -121,15 +122,27 @@ typedef HASHELEMENT *HASHBUCKET;
 typedef HASHBUCKET *HASHSEGMENT;
 
 /*
- * Using array of FreeListData instead of separate arrays of mutexes, nentries
- * and freeLists prevents, at least partially, sharing one cache line between
- * different mutexes (see below).
+ * Per-freelist data.
+ *
+ * In a partitioned hash table, each freelist is associated with a specific
+ * set of hashcodes, as determined by the FREELIST_IDX() macro below.
+ * nentries tracks the number of live hashtable entries having those hashcodes
+ * (NOT the number of entries in the freelist, as you might expect).
+ *
+ * The coverage of a freelist might be more or less than one partition, so it
+ * needs its own lock rather than relying on caller locking.  Relying on that
+ * wouldn't work even if the coverage was the same, because of the occasional
+ * need to "borrow" entries from another freelist; see get_hash_entry().
+ *
+ * Using an array of FreeListData instead of separate arrays of mutexes,
+ * nentries and freeLists helps to reduce sharing of cache lines between
+ * different mutexes.
  */
 typedef struct
 {
-    slock_t     mutex;          /* spinlock */
-    long        nentries;       /* number of entries */
-    HASHELEMENT *freeList;      /* list of free elements */
+    slock_t     mutex;          /* spinlock for this freelist */
+    long        nentries;       /* number of entries in associated buckets */
+    HASHELEMENT *freeList;      /* chain of free elements */
 } FreeListData;
 
 /*
@@ -143,12 +156,14 @@ typedef struct
 struct HASHHDR
 {
     /*
-     * The freelist can become a point of contention on high-concurrency hash
-     * tables, so we use an array of freelist, each with its own mutex and
-     * nentries count, instead of just a single one.
+     * The freelist can become a point of contention in high-concurrency hash
+     * tables, so we use an array of freelists, each with its own mutex and
+     * nentries count, instead of just a single one.  Although the freelists
+     * normally operate independently, we will scavenge entries from freelists
+     * other than a hashcode's default freelist when necessary.
      *
-     * If hash table is not partitioned only freeList[0] is used and spinlocks
-     * are not used at all.
+     * If the hash table is not partitioned, only freeList[0] is used and its
+     * spinlock is not used at all; callers' locking is assumed sufficient.
      */
     FreeListData freeList[NUM_FREELISTS];
 
@@ -184,7 +199,7 @@ struct HASHHDR
 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 
 #define FREELIST_IDX(hctl, hashcode) \
-    (IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
+    (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
 
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
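Note: the added parentheses in FREELIST_IDX() are defensive macro hygiene; the call sites in this patch pass a plain variable (hashvalue), so behavior there is unchanged. The parentheses would matter only for a hypothetical caller passing an expression built from operators that bind more loosely than %, for example:

    idx = FREELIST_IDX(hctl, a ^ b);
    /* old expansion:  ... ? a ^ b % NUM_FREELISTS : 0
     *                 parses as a ^ (b % NUM_FREELISTS), not the intended value */
    /* new expansion:  ... ? (a ^ b) % NUM_FREELISTS : 0 */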
@@ -506,19 +521,22 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
                     nelem_alloc_first;
 
         /*
-         * If hash table is partitioned all freeLists have equal number of
-         * elements. Otherwise only freeList[0] is used.
+         * If hash table is partitioned, give each freelist an equal share of
+         * the initial allocation.  Otherwise only freeList[0] is used.
          */
         if (IS_PARTITIONED(hashp->hctl))
             freelist_partitions = NUM_FREELISTS;
         else
             freelist_partitions = 1;
 
         nelem_alloc = nelem / freelist_partitions;
-        if (nelem_alloc == 0)
+        if (nelem_alloc <= 0)
             nelem_alloc = 1;
 
-        /* Make sure all memory will be used */
+        /*
+         * Make sure we'll allocate all the requested elements; freeList[0]
+         * gets the excess if the request isn't divisible by NUM_FREELISTS.
+         */
         if (nelem_alloc * freelist_partitions < nelem)
             nelem_alloc_first =
                 nelem - nelem_alloc * (freelist_partitions - 1);
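Note: a worked example of the split, assuming NUM_FREELISTS is 32 (its value elsewhere in dynahash.c). For a partitioned table created with nelem = 1000: nelem_alloc = 1000 / 32 = 31; since 31 * 32 = 992 < 1000, nelem_alloc_first = 1000 - 31 * 31 = 39, so freeList[0] is seeded with 39 elements and each of the other 31 freelists with 31, accounting for all 1000 requested elements.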
@@ -620,7 +638,7 @@ init_htab(HTAB *hashp, long nelem)
     int         i;
 
     /*
-     * initialize mutex if it's a partitioned table
+     * initialize mutexes if it's a partitioned table
      */
     if (IS_PARTITIONED(hctl))
         for (i = 0; i < NUM_FREELISTS; i++)
@@ -902,6 +920,7 @@ hash_search_with_hash_value(HTAB *hashp,
                             bool *foundPtr)
 {
     HASHHDR    *hctl = hashp->hctl;
+    int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
     Size        keysize;
     uint32      bucket;
     long        segment_num;
@@ -910,7 +929,6 @@ hash_search_with_hash_value(HTAB *hashp,
     HASHBUCKET  currBucket;
     HASHBUCKET *prevBucketPtr;
     HashCompareFunc match;
-    int         freelist_idx = FREELIST_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
     hash_accesses++;
@@ -993,13 +1011,14 @@ hash_search_with_hash_value(HTAB *hashp,
             if (IS_PARTITIONED(hctl))
                 SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
 
+            /* delete the record from the appropriate nentries counter. */
             Assert(hctl->freeList[freelist_idx].nentries > 0);
             hctl->freeList[freelist_idx].nentries--;
 
             /* remove record from hash bucket's chain. */
             *prevBucketPtr = currBucket->link;
 
-            /* add the record to the freelist for this table. */
+            /* add the record to the appropriate freelist. */
             currBucket->link = hctl->freeList[freelist_idx].freeList;
             hctl->freeList[freelist_idx].freeList = currBucket;
 
@@ -1220,14 +1239,15 @@ hash_update_hash_key(HTAB *hashp,
 }
 
 /*
- * create a new entry if possible
+ * Allocate a new hashtable entry if possible; return NULL if out of memory.
+ * (Or, if the underlying space allocator throws error for out-of-memory,
+ * we won't return at all.)
  */
 static HASHBUCKET
 get_hash_entry(HTAB *hashp, int freelist_idx)
 {
     HASHHDR    *hctl = hashp->hctl;
     HASHBUCKET  newElement;
-    int         borrow_from_idx;
 
     for (;;)
     {
@@ -1244,19 +1264,32 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
         if (IS_PARTITIONED(hctl))
             SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
-        /* no free elements. allocate another chunk of buckets */
+        /*
+         * No free elements in this freelist.  In a partitioned table, there
+         * might be entries in other freelists, but to reduce contention we
+         * prefer to first try to get another chunk of buckets from the main
+         * shmem allocator.  If that fails, though, we *MUST* root through all
+         * the other freelists before giving up.  There are multiple callers
+         * that assume that they can allocate every element in the initially
+         * requested table size, or that deleting an element guarantees they
+         * can insert a new element, even if shared memory is entirely full.
+         * Failing because the needed element is in a different freelist is
+         * not acceptable.
+         */
         if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
         {
+            int         borrow_from_idx;
+
             if (!IS_PARTITIONED(hctl))
                 return NULL;    /* out of memory */
 
-            /* try to borrow element from another partition */
+            /* try to borrow element from another freelist */
             borrow_from_idx = freelist_idx;
             for (;;)
             {
                 borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
                 if (borrow_from_idx == freelist_idx)
-                    break;
+                    break;      /* examined all freelists, fail */
 
                 SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
                 newElement = hctl->freeList[borrow_from_idx].freeList;
@@ -1266,17 +1299,19 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
                     hctl->freeList[borrow_from_idx].freeList = newElement->link;
                     SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
 
+                    /* careful: count the new element in its proper freelist */
                     SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
                     hctl->freeList[freelist_idx].nentries++;
                     SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
-                    break;
+                    return newElement;
                 }
 
                 SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
             }
 
-            return newElement;
+            /* no elements available to borrow either, so out of memory */
+            return NULL;
         }
     }
 
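Note: the caller-visible guarantee described in the new comment (deleting an entry must let you insert a new one even when shared memory is full) looks roughly like this from the caller's side. A sketch only: shared_table, oldkey, newkey and entry are hypothetical, while hash_search(), HASH_REMOVE and HASH_ENTER are the real dynahash API.

    bool        found;
    void       *entry;

    /* Free one entry; it may go back onto any freelist. */
    (void) hash_search(shared_table, &oldkey, HASH_REMOVE, NULL);

    /*
     * This insert must not fail for lack of memory: if newkey's own freelist
     * is empty and no more shared memory can be allocated, get_hash_entry()
     * borrows the entry just released, even from a different freelist.
     */
    entry = hash_search(shared_table, &newkey, HASH_ENTER, &found);
    Assert(entry != NULL);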
@@ -1300,15 +1335,15 @@ hash_get_num_entries(HTAB *hashp)
     long        sum = hashp->hctl->freeList[0].nentries;
 
     /*
-     * We currently don't bother with the mutex; it's only sensible to call
-     * this function if you've got lock on all partitions of the table.
+     * We currently don't bother with acquiring the mutexes; it's only
+     * sensible to call this function if you've got lock on all partitions of
+     * the table.
      */
-
-    if (!IS_PARTITIONED(hashp->hctl))
-        return sum;
-
-    for (i = 1; i < NUM_FREELISTS; i++)
-        sum += hashp->hctl->freeList[i].nentries;
+    if (IS_PARTITIONED(hashp->hctl))
+    {
+        for (i = 1; i < NUM_FREELISTS; i++)
+            sum += hashp->hctl->freeList[i].nentries;
+    }
 
     return sum;
 }