208
208
*
209
209
* Spilled data is written to logical tapes. These provide better control
210
210
* over memory usage, disk space, and the number of files than if we were
211
- * to use a BufFile for each spill.
211
+ * to use a BufFile for each spill. We don't know the number of tapes needed
212
+ * at the start of the algorithm (because it can recurse), so a tape set is
213
+ * allocated at the beginning, and individual tapes are created as needed.
214
+ * As a particular tape is read, logtape.c recycles its disk space. When a
215
+ * tape is read to completion, it is destroyed entirely.
216
+ *
217
+ * Tapes' buffers can take up substantial memory when many tapes are open at
218
+ * once. We only need one tape open at a time in read mode (using a buffer
219
+ * that's a multiple of BLCKSZ); but we need one tape open in write mode (each
220
+ * requiring a buffer of size BLCKSZ) for each partition.
212
221
*
213
222
* Note that it's possible for transition states to start small but then
214
223
* grow very large; for instance in the case of ARRAY_AGG. In such cases,
311
320
*/
312
321
#define CHUNKHDRSZ 16
313
322
314
- /*
315
- * Track all tapes needed for a HashAgg that spills. We don't know the maximum
316
- * number of tapes needed at the start of the algorithm (because it can
317
- * recurse), so one tape set is allocated and extended as needed for new
318
- * tapes. When a particular tape is already read, rewind it for write mode and
319
- * put it in the free list.
320
- *
321
- * Tapes' buffers can take up substantial memory when many tapes are open at
322
- * once. We only need one tape open at a time in read mode (using a buffer
323
- * that's a multiple of BLCKSZ); but we need one tape open in write mode (each
324
- * requiring a buffer of size BLCKSZ) for each partition.
325
- */
326
- typedef struct HashTapeInfo
327
- {
328
- LogicalTapeSet * tapeset ;
329
- int ntapes ;
330
- int * freetapes ;
331
- int nfreetapes ;
332
- int freetapes_alloc ;
333
- } HashTapeInfo ;
334
-
335
323
/*
336
324
* Represents partitioned spill data for a single hashtable. Contains the
337
325
* necessary information to route tuples to the correct partition, and to
@@ -343,9 +331,8 @@ typedef struct HashTapeInfo
343
331
*/
344
332
typedef struct HashAggSpill
345
333
{
346
- LogicalTapeSet * tapeset ; /* borrowed reference to tape set */
347
334
int npartitions ; /* number of partitions */
348
- int * partitions ; /* spill partition tape numbers */
335
+ LogicalTape * * partitions ; /* spill partition tapes */
349
336
int64 * ntuples ; /* number of tuples in each partition */
350
337
uint32 mask ; /* mask to find partition from hash value */
351
338
int shift ; /* after masking, shift by this amount */
@@ -365,8 +352,7 @@ typedef struct HashAggBatch
365
352
{
366
353
int setno ; /* grouping set */
367
354
int used_bits ; /* number of bits of hash already used */
368
- LogicalTapeSet * tapeset ; /* borrowed reference to tape set */
369
- int input_tapenum ; /* input partition tape */
355
+ LogicalTape * input_tape ; /* input partition tape */
370
356
int64 input_tuples ; /* number of tuples in this batch */
371
357
double input_card ; /* estimated group cardinality */
372
358
} HashAggBatch ;
@@ -442,22 +428,17 @@ static void hash_agg_update_metrics(AggState *aggstate, bool from_tape,
442
428
int npartitions );
443
429
static void hashagg_finish_initial_spills (AggState * aggstate );
444
430
static void hashagg_reset_spill_state (AggState * aggstate );
445
- static HashAggBatch * hashagg_batch_new (LogicalTapeSet * tapeset ,
446
- int input_tapenum , int setno ,
431
+ static HashAggBatch * hashagg_batch_new (LogicalTape * input_tape , int setno ,
447
432
int64 input_tuples , double input_card ,
448
433
int used_bits );
449
434
static MinimalTuple hashagg_batch_read (HashAggBatch * batch , uint32 * hashp );
450
- static void hashagg_spill_init (HashAggSpill * spill , HashTapeInfo * tapeinfo ,
435
+ static void hashagg_spill_init (HashAggSpill * spill , LogicalTapeSet * lts ,
451
436
int used_bits , double input_groups ,
452
437
double hashentrysize );
453
438
static Size hashagg_spill_tuple (AggState * aggstate , HashAggSpill * spill ,
454
439
TupleTableSlot * slot , uint32 hash );
455
440
static void hashagg_spill_finish (AggState * aggstate , HashAggSpill * spill ,
456
441
int setno );
457
- static void hashagg_tapeinfo_init (AggState * aggstate );
458
- static void hashagg_tapeinfo_assign (HashTapeInfo * tapeinfo , int * dest ,
459
- int ndest );
460
- static void hashagg_tapeinfo_release (HashTapeInfo * tapeinfo , int tapenum );
461
442
static Datum GetAggInitVal (Datum textInitVal , Oid transtype );
462
443
static void build_pertrans_for_aggref (AggStatePerTrans pertrans ,
463
444
AggState * aggstate , EState * estate ,
@@ -1887,12 +1868,12 @@ hash_agg_enter_spill_mode(AggState *aggstate)
1887
1868
1888
1869
if (!aggstate -> hash_ever_spilled )
1889
1870
{
1890
- Assert (aggstate -> hash_tapeinfo == NULL );
1871
+ Assert (aggstate -> hash_tapeset == NULL );
1891
1872
Assert (aggstate -> hash_spills == NULL );
1892
1873
1893
1874
aggstate -> hash_ever_spilled = true;
1894
1875
1895
- hashagg_tapeinfo_init ( aggstate );
1876
+ aggstate -> hash_tapeset = LogicalTapeSetCreate (true, NULL , -1 );
1896
1877
1897
1878
aggstate -> hash_spills = palloc (sizeof (HashAggSpill ) * aggstate -> num_hashes );
1898
1879
@@ -1901,7 +1882,7 @@ hash_agg_enter_spill_mode(AggState *aggstate)
1901
1882
AggStatePerHash perhash = & aggstate -> perhash [setno ];
1902
1883
HashAggSpill * spill = & aggstate -> hash_spills [setno ];
1903
1884
1904
- hashagg_spill_init (spill , aggstate -> hash_tapeinfo , 0 ,
1885
+ hashagg_spill_init (spill , aggstate -> hash_tapeset , 0 ,
1905
1886
perhash -> aggnode -> numGroups ,
1906
1887
aggstate -> hashentrysize );
1907
1888
}
@@ -1943,9 +1924,9 @@ hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
1943
1924
aggstate -> hash_mem_peak = total_mem ;
1944
1925
1945
1926
/* update disk usage */
1946
- if (aggstate -> hash_tapeinfo != NULL )
1927
+ if (aggstate -> hash_tapeset != NULL )
1947
1928
{
1948
- uint64 disk_used = LogicalTapeSetBlocks (aggstate -> hash_tapeinfo -> tapeset ) * (BLCKSZ / 1024 );
1929
+ uint64 disk_used = LogicalTapeSetBlocks (aggstate -> hash_tapeset ) * (BLCKSZ / 1024 );
1949
1930
1950
1931
if (aggstate -> hash_disk_used < disk_used )
1951
1932
aggstate -> hash_disk_used = disk_used ;
@@ -2132,7 +2113,7 @@ lookup_hash_entries(AggState *aggstate)
2132
2113
TupleTableSlot * slot = aggstate -> tmpcontext -> ecxt_outertuple ;
2133
2114
2134
2115
if (spill -> partitions == NULL )
2135
- hashagg_spill_init (spill , aggstate -> hash_tapeinfo , 0 ,
2116
+ hashagg_spill_init (spill , aggstate -> hash_tapeset , 0 ,
2136
2117
perhash -> aggnode -> numGroups ,
2137
2118
aggstate -> hashentrysize );
2138
2119
@@ -2597,7 +2578,7 @@ agg_refill_hash_table(AggState *aggstate)
2597
2578
HashAggBatch * batch ;
2598
2579
AggStatePerHash perhash ;
2599
2580
HashAggSpill spill ;
2600
- HashTapeInfo * tapeinfo = aggstate -> hash_tapeinfo ;
2581
+ LogicalTapeSet * tapeset = aggstate -> hash_tapeset ;
2601
2582
bool spill_initialized = false;
2602
2583
2603
2584
if (aggstate -> hash_batches == NIL )
@@ -2693,7 +2674,7 @@ agg_refill_hash_table(AggState *aggstate)
2693
2674
* that we don't assign tapes that will never be used.
2694
2675
*/
2695
2676
spill_initialized = true;
2696
- hashagg_spill_init (& spill , tapeinfo , batch -> used_bits ,
2677
+ hashagg_spill_init (& spill , tapeset , batch -> used_bits ,
2697
2678
batch -> input_card , aggstate -> hashentrysize );
2698
2679
}
2699
2680
/* no memory for a new group, spill */
@@ -2709,7 +2690,7 @@ agg_refill_hash_table(AggState *aggstate)
2709
2690
ResetExprContext (aggstate -> tmpcontext );
2710
2691
}
2711
2692
2712
- hashagg_tapeinfo_release ( tapeinfo , batch -> input_tapenum );
2693
+ LogicalTapeClose ( batch -> input_tape );
2713
2694
2714
2695
/* change back to phase 0 */
2715
2696
aggstate -> current_phase = 0 ;
@@ -2884,75 +2865,14 @@ agg_retrieve_hash_table_in_memory(AggState *aggstate)
2884
2865
return NULL ;
2885
2866
}
2886
2867
2887
- /*
2888
- * Initialize HashTapeInfo
2889
- */
2890
- static void
2891
- hashagg_tapeinfo_init (AggState * aggstate )
2892
- {
2893
- HashTapeInfo * tapeinfo = palloc (sizeof (HashTapeInfo ));
2894
- int init_tapes = 16 ; /* expanded dynamically */
2895
-
2896
- tapeinfo -> tapeset = LogicalTapeSetCreate (init_tapes , true, NULL , NULL , -1 );
2897
- tapeinfo -> ntapes = init_tapes ;
2898
- tapeinfo -> nfreetapes = init_tapes ;
2899
- tapeinfo -> freetapes_alloc = init_tapes ;
2900
- tapeinfo -> freetapes = palloc (init_tapes * sizeof (int ));
2901
- for (int i = 0 ; i < init_tapes ; i ++ )
2902
- tapeinfo -> freetapes [i ] = i ;
2903
-
2904
- aggstate -> hash_tapeinfo = tapeinfo ;
2905
- }
2906
-
2907
- /*
2908
- * Assign unused tapes to spill partitions, extending the tape set if
2909
- * necessary.
2910
- */
2911
- static void
2912
- hashagg_tapeinfo_assign (HashTapeInfo * tapeinfo , int * partitions ,
2913
- int npartitions )
2914
- {
2915
- int partidx = 0 ;
2916
-
2917
- /* use free tapes if available */
2918
- while (partidx < npartitions && tapeinfo -> nfreetapes > 0 )
2919
- partitions [partidx ++ ] = tapeinfo -> freetapes [-- tapeinfo -> nfreetapes ];
2920
-
2921
- if (partidx < npartitions )
2922
- {
2923
- LogicalTapeSetExtend (tapeinfo -> tapeset , npartitions - partidx );
2924
-
2925
- while (partidx < npartitions )
2926
- partitions [partidx ++ ] = tapeinfo -> ntapes ++ ;
2927
- }
2928
- }
2929
-
2930
- /*
2931
- * After a tape has already been written to and then read, this function
2932
- * rewinds it for writing and adds it to the free list.
2933
- */
2934
- static void
2935
- hashagg_tapeinfo_release (HashTapeInfo * tapeinfo , int tapenum )
2936
- {
2937
- /* rewinding frees the buffer while not in use */
2938
- LogicalTapeRewindForWrite (tapeinfo -> tapeset , tapenum );
2939
- if (tapeinfo -> freetapes_alloc == tapeinfo -> nfreetapes )
2940
- {
2941
- tapeinfo -> freetapes_alloc <<= 1 ;
2942
- tapeinfo -> freetapes = repalloc (tapeinfo -> freetapes ,
2943
- tapeinfo -> freetapes_alloc * sizeof (int ));
2944
- }
2945
- tapeinfo -> freetapes [tapeinfo -> nfreetapes ++ ] = tapenum ;
2946
- }
2947
-
2948
2868
/*
2949
2869
* hashagg_spill_init
2950
2870
*
2951
2871
* Called after we determined that spilling is necessary. Chooses the number
2952
2872
* of partitions to create, and initializes them.
2953
2873
*/
2954
2874
static void
2955
- hashagg_spill_init (HashAggSpill * spill , HashTapeInfo * tapeinfo , int used_bits ,
2875
+ hashagg_spill_init (HashAggSpill * spill , LogicalTapeSet * tapeset , int used_bits ,
2956
2876
double input_groups , double hashentrysize )
2957
2877
{
2958
2878
int npartitions ;
@@ -2961,13 +2881,13 @@ hashagg_spill_init(HashAggSpill *spill, HashTapeInfo *tapeinfo, int used_bits,
2961
2881
npartitions = hash_choose_num_partitions (input_groups , hashentrysize ,
2962
2882
used_bits , & partition_bits );
2963
2883
2964
- spill -> partitions = palloc0 (sizeof (int ) * npartitions );
2884
+ spill -> partitions = palloc0 (sizeof (LogicalTape * ) * npartitions );
2965
2885
spill -> ntuples = palloc0 (sizeof (int64 ) * npartitions );
2966
2886
spill -> hll_card = palloc0 (sizeof (hyperLogLogState ) * npartitions );
2967
2887
2968
- hashagg_tapeinfo_assign (tapeinfo , spill -> partitions , npartitions );
2888
+ for (int i = 0 ; i < npartitions ; i ++ )
2889
+ spill -> partitions [i ] = LogicalTapeCreate (tapeset );
2969
2890
2970
- spill -> tapeset = tapeinfo -> tapeset ;
2971
2891
spill -> shift = 32 - used_bits - partition_bits ;
2972
2892
spill -> mask = (npartitions - 1 ) << spill -> shift ;
2973
2893
spill -> npartitions = npartitions ;
@@ -2986,11 +2906,10 @@ static Size
2986
2906
hashagg_spill_tuple (AggState * aggstate , HashAggSpill * spill ,
2987
2907
TupleTableSlot * inputslot , uint32 hash )
2988
2908
{
2989
- LogicalTapeSet * tapeset = spill -> tapeset ;
2990
2909
TupleTableSlot * spillslot ;
2991
2910
int partition ;
2992
2911
MinimalTuple tuple ;
2993
- int tapenum ;
2912
+ LogicalTape * tape ;
2994
2913
int total_written = 0 ;
2995
2914
bool shouldFree ;
2996
2915
@@ -3029,12 +2948,12 @@ hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill,
3029
2948
*/
3030
2949
addHyperLogLog (& spill -> hll_card [partition ], hash_bytes_uint32 (hash ));
3031
2950
3032
- tapenum = spill -> partitions [partition ];
2951
+ tape = spill -> partitions [partition ];
3033
2952
3034
- LogicalTapeWrite (tapeset , tapenum , (void * ) & hash , sizeof (uint32 ));
2953
+ LogicalTapeWrite (tape , (void * ) & hash , sizeof (uint32 ));
3035
2954
total_written += sizeof (uint32 );
3036
2955
3037
- LogicalTapeWrite (tapeset , tapenum , (void * ) tuple , tuple -> t_len );
2956
+ LogicalTapeWrite (tape , (void * ) tuple , tuple -> t_len );
3038
2957
total_written += tuple -> t_len ;
3039
2958
3040
2959
if (shouldFree )
@@ -3050,15 +2969,14 @@ hashagg_spill_tuple(AggState *aggstate, HashAggSpill *spill,
3050
2969
* be done.
3051
2970
*/
3052
2971
static HashAggBatch *
3053
- hashagg_batch_new (LogicalTapeSet * tapeset , int tapenum , int setno ,
2972
+ hashagg_batch_new (LogicalTape * input_tape , int setno ,
3054
2973
int64 input_tuples , double input_card , int used_bits )
3055
2974
{
3056
2975
HashAggBatch * batch = palloc0 (sizeof (HashAggBatch ));
3057
2976
3058
2977
batch -> setno = setno ;
3059
2978
batch -> used_bits = used_bits ;
3060
- batch -> tapeset = tapeset ;
3061
- batch -> input_tapenum = tapenum ;
2979
+ batch -> input_tape = input_tape ;
3062
2980
batch -> input_tuples = input_tuples ;
3063
2981
batch -> input_card = input_card ;
3064
2982
@@ -3072,42 +2990,41 @@ hashagg_batch_new(LogicalTapeSet *tapeset, int tapenum, int setno,
3072
2990
static MinimalTuple
3073
2991
hashagg_batch_read (HashAggBatch * batch , uint32 * hashp )
3074
2992
{
3075
- LogicalTapeSet * tapeset = batch -> tapeset ;
3076
- int tapenum = batch -> input_tapenum ;
2993
+ LogicalTape * tape = batch -> input_tape ;
3077
2994
MinimalTuple tuple ;
3078
2995
uint32 t_len ;
3079
2996
size_t nread ;
3080
2997
uint32 hash ;
3081
2998
3082
- nread = LogicalTapeRead (tapeset , tapenum , & hash , sizeof (uint32 ));
2999
+ nread = LogicalTapeRead (tape , & hash , sizeof (uint32 ));
3083
3000
if (nread == 0 )
3084
3001
return NULL ;
3085
3002
if (nread != sizeof (uint32 ))
3086
3003
ereport (ERROR ,
3087
3004
(errcode_for_file_access (),
3088
- errmsg ("unexpected EOF for tape %d : requested %zu bytes, read %zu bytes" ,
3089
- tapenum , sizeof (uint32 ), nread )));
3005
+ errmsg ("unexpected EOF for tape %p : requested %zu bytes, read %zu bytes" ,
3006
+ tape , sizeof (uint32 ), nread )));
3090
3007
if (hashp != NULL )
3091
3008
* hashp = hash ;
3092
3009
3093
- nread = LogicalTapeRead (tapeset , tapenum , & t_len , sizeof (t_len ));
3010
+ nread = LogicalTapeRead (tape , & t_len , sizeof (t_len ));
3094
3011
if (nread != sizeof (uint32 ))
3095
3012
ereport (ERROR ,
3096
3013
(errcode_for_file_access (),
3097
- errmsg ("unexpected EOF for tape %d : requested %zu bytes, read %zu bytes" ,
3098
- tapenum , sizeof (uint32 ), nread )));
3014
+ errmsg ("unexpected EOF for tape %p : requested %zu bytes, read %zu bytes" ,
3015
+ tape , sizeof (uint32 ), nread )));
3099
3016
3100
3017
tuple = (MinimalTuple ) palloc (t_len );
3101
3018
tuple -> t_len = t_len ;
3102
3019
3103
- nread = LogicalTapeRead (tapeset , tapenum ,
3020
+ nread = LogicalTapeRead (tape ,
3104
3021
(void * ) ((char * ) tuple + sizeof (uint32 )),
3105
3022
t_len - sizeof (uint32 ));
3106
3023
if (nread != t_len - sizeof (uint32 ))
3107
3024
ereport (ERROR ,
3108
3025
(errcode_for_file_access (),
3109
- errmsg ("unexpected EOF for tape %d : requested %zu bytes, read %zu bytes" ,
3110
- tapenum , t_len - sizeof (uint32 ), nread )));
3026
+ errmsg ("unexpected EOF for tape %p : requested %zu bytes, read %zu bytes" ,
3027
+ tape , t_len - sizeof (uint32 ), nread )));
3111
3028
3112
3029
return tuple ;
3113
3030
}
@@ -3164,8 +3081,7 @@ hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno)
3164
3081
3165
3082
for (i = 0 ; i < spill -> npartitions ; i ++ )
3166
3083
{
3167
- LogicalTapeSet * tapeset = aggstate -> hash_tapeinfo -> tapeset ;
3168
- int tapenum = spill -> partitions [i ];
3084
+ LogicalTape * tape = spill -> partitions [i ];
3169
3085
HashAggBatch * new_batch ;
3170
3086
double cardinality ;
3171
3087
@@ -3177,10 +3093,9 @@ hashagg_spill_finish(AggState *aggstate, HashAggSpill *spill, int setno)
3177
3093
freeHyperLogLog (& spill -> hll_card [i ]);
3178
3094
3179
3095
/* rewinding frees the buffer while not in use */
3180
- LogicalTapeRewindForRead (tapeset , tapenum ,
3181
- HASHAGG_READ_BUFFER_SIZE );
3096
+ LogicalTapeRewindForRead (tape , HASHAGG_READ_BUFFER_SIZE );
3182
3097
3183
- new_batch = hashagg_batch_new (tapeset , tapenum , setno ,
3098
+ new_batch = hashagg_batch_new (tape , setno ,
3184
3099
spill -> ntuples [i ], cardinality ,
3185
3100
used_bits );
3186
3101
aggstate -> hash_batches = lcons (new_batch , aggstate -> hash_batches );
@@ -3227,14 +3142,10 @@ hashagg_reset_spill_state(AggState *aggstate)
3227
3142
aggstate -> hash_batches = NIL ;
3228
3143
3229
3144
/* close tape set */
3230
- if (aggstate -> hash_tapeinfo != NULL )
3145
+ if (aggstate -> hash_tapeset != NULL )
3231
3146
{
3232
- HashTapeInfo * tapeinfo = aggstate -> hash_tapeinfo ;
3233
-
3234
- LogicalTapeSetClose (tapeinfo -> tapeset );
3235
- pfree (tapeinfo -> freetapes );
3236
- pfree (tapeinfo );
3237
- aggstate -> hash_tapeinfo = NULL ;
3147
+ LogicalTapeSetClose (aggstate -> hash_tapeset );
3148
+ aggstate -> hash_tapeset = NULL ;
3238
3149
}
3239
3150
}
3240
3151
0 commit comments