From dd4f4e3a3a2121d9d159be6697badef7f7a8e99c Mon Sep 17 00:00:00 2001 From: Filip Janus Date: Wed, 16 Apr 2025 12:03:03 +0200 Subject: [PATCH 1/2] This commit adds support for temporary files compression, it can be used only for hashjoins now. It also adds GUC parameter temp_file_compression that enables this functionality. For now, it supports just lz4 and pglz algorithms. In the future, it could also be implemented zstd support. It implements just one working buffer for compression and decompression to avoid memory wasting. The buffer is allocated in the top memory context. --- src/Makefile.global.in | 1 + src/backend/access/gist/gistbuildbuffers.c | 2 +- src/backend/backup/backup_manifest.c | 2 +- src/backend/executor/nodeHashjoin.c | 2 +- src/backend/storage/file/buffile.c | 217 +++++++++++++++++- src/backend/utils/misc/guc_tables.c | 24 ++ src/backend/utils/misc/postgresql.conf.sample | 1 + src/backend/utils/sort/logtape.c | 2 +- src/backend/utils/sort/tuplestore.c | 2 +- src/include/storage/buffile.h | 13 +- 10 files changed, 251 insertions(+), 15 deletions(-) diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 04952b533ded..9a2eb6974e74 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -201,6 +201,7 @@ with_liburing = @with_liburing@ with_libxml = @with_libxml@ with_libxslt = @with_libxslt@ with_llvm = @with_llvm@ +with_lz4 = @with_lz4@ with_system_tzdata = @with_system_tzdata@ with_uuid = @with_uuid@ with_zlib = @with_zlib@ diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c index 0707254d18ea..9cc371f47fe2 100644 --- a/src/backend/access/gist/gistbuildbuffers.c +++ b/src/backend/access/gist/gistbuildbuffers.c @@ -54,7 +54,7 @@ gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel) * Create a temporary file to hold buffer pages that are swapped out of * memory. */ - gfbb->pfile = BufFileCreateTemp(false); + gfbb->pfile = BufFileCreateTemp(false, false); gfbb->nFileBlocks = 0; /* Initialize free page management. */ diff --git a/src/backend/backup/backup_manifest.c b/src/backend/backup/backup_manifest.c index 22e2be37c95c..c9f7daa14970 100644 --- a/src/backend/backup/backup_manifest.c +++ b/src/backend/backup/backup_manifest.c @@ -65,7 +65,7 @@ InitializeBackupManifest(backup_manifest_info *manifest, manifest->buffile = NULL; else { - manifest->buffile = BufFileCreateTemp(false); + manifest->buffile = BufFileCreateTemp(false, false); manifest->manifest_ctx = pg_cryptohash_create(PG_SHA256); if (pg_cryptohash_init(manifest->manifest_ctx) < 0) elog(ERROR, "failed to initialize checksum of backup manifest: %s", diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 5661ad768300..384265ca74ae 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -1434,7 +1434,7 @@ ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue, { MemoryContext oldctx = MemoryContextSwitchTo(hashtable->spillCxt); - file = BufFileCreateTemp(false); + file = BufFileCreateCompressTemp(false); *fileptr = file; MemoryContextSwitchTo(oldctx); diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 366d70d38a19..103f65503226 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -53,6 +53,18 @@ #include "storage/bufmgr.h" #include "storage/fd.h" #include "utils/resowner.h" +#include "utils/memutils.h" + +#include "common/pg_lzcompress.h" +#ifdef USE_LZ4 +#include +#endif + +#define NO_LZ4_SUPPORT() \ + ereport(ERROR, \ + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ + errmsg("compression method lz4 not supported"), \ + errdetail("This functionality requires the server to be built with lz4 support."))) /* * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE. @@ -62,6 +74,8 @@ #define MAX_PHYSICAL_FILESIZE 0x40000000 #define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / BLCKSZ) +int temp_file_compression = TEMP_NONE_COMPRESSION; + /* * This data structure represents a buffered file that consists of one or * more physical files (each accessed through a virtual file descriptor @@ -95,7 +109,8 @@ struct BufFile off_t curOffset; /* offset part of current pos */ int pos; /* next read/write position in buffer */ int nbytes; /* total # of valid bytes in buffer */ - + bool compress; /* State of usege file compression */ + char *cBuffer; /* * XXX Should ideally use PGIOAlignedBlock, but might need a way to avoid * wasting per-file alignment padding when some users create many files. @@ -127,6 +142,8 @@ makeBufFileCommon(int nfiles) file->curOffset = 0; file->pos = 0; file->nbytes = 0; + file->compress = false; + file->cBuffer = NULL; return file; } @@ -188,9 +205,17 @@ extendBufFile(BufFile *file) * Note: if interXact is true, the caller had better be calling us in a * memory context, and with a resource owner, that will survive across * transaction boundaries. + * + * If compress is true the temporary files will be compressed before + * writing on disk. + * + * Note: The compression does not support random access. Only the hash joins + * use it for now. The seek operation other than seek to the beginning of the + * buffile will corrupt temporary data offsets. + * */ BufFile * -BufFileCreateTemp(bool interXact) +BufFileCreateTemp(bool interXact, bool compress) { BufFile *file; File pfile; @@ -212,9 +237,47 @@ BufFileCreateTemp(bool interXact) file = makeBufFile(pfile); file->isInterXact = interXact; + if (temp_file_compression != TEMP_NONE_COMPRESSION) + { + file->compress = compress; + } + return file; + } +/* + * Wrapper for BuffileCreateTemp + * We want to limit the number of memory allocations for the compression buffer, + * only one buffer for all compression operations is enough + */ +BufFile * +BufFileCreateCompressTemp(bool interXact){ + static char * buff = NULL; + BufFile *tmpBufFile = BufFileCreateTemp(interXact, true); + if (buff == NULL && temp_file_compression != TEMP_NONE_COMPRESSION) + { + int size = 0; + + switch (temp_file_compression) + { + case TEMP_LZ4_COMPRESSION: +#ifdef USE_LZ4 + size = LZ4_compressBound(BLCKSZ)+sizeof(int); +#endif + break; + case TEMP_PGLZ_COMPRESSION: + size = pglz_maximum_compressed_size(BLCKSZ, BLCKSZ)+sizeof(int); + break; + } + /* + * Persistent buffer for all temporary file compressions + */ + buff = MemoryContextAlloc(TopMemoryContext, size); + } + tmpBufFile->cBuffer = buff; + return tmpBufFile; +} /* * Build the name for a given segment of a given BufFile. */ @@ -275,6 +338,7 @@ BufFileCreateFileSet(FileSet *fileset, const char *name) file->files[0] = MakeNewFileSetSegment(file, 0); file->readOnly = false; + return file; } @@ -457,11 +521,75 @@ BufFileLoadBuffer(BufFile *file) /* * Read whatever we can get, up to a full bufferload. */ - file->nbytes = FileRead(thisfile, + if (!file->compress) + { + + /* + * Read whatever we can get, up to a full bufferload. + */ + file->nbytes = FileRead(thisfile, file->buffer.data, - sizeof(file->buffer.data), + sizeof(file->buffer), + file->curOffset, + WAIT_EVENT_BUFFILE_READ); + /* + * Read and decompress data from the temporary file + * The first reading loads size of the compressed block + * Second reading loads compressed data + */ + } else { + int nread; + int nbytes; + + nread = FileRead(thisfile, + &nbytes, + sizeof(nbytes), + file->curOffset, + WAIT_EVENT_BUFFILE_READ); + /* if not EOF let's continue */ + if (nread > 0) + { + /* A long life buffer limits number of memory allocations */ + char * buff = file->cBuffer; + + Assert(file->cBuffer != NULL); + /* + * Read compressed data, curOffset differs with pos + * It reads less data than it returns to caller + * So the curOffset must be advanced here based on compressed size + */ + file->curOffset+=sizeof(nbytes); + + nread = FileRead(thisfile, + buff, + nbytes, file->curOffset, WAIT_EVENT_BUFFILE_READ); + + switch (temp_file_compression) + { + case TEMP_LZ4_COMPRESSION: +#ifdef USE_LZ4 + file->nbytes = LZ4_decompress_safe(buff, + file->buffer.data,nbytes,sizeof(file->buffer)); +#endif + break; + + case TEMP_PGLZ_COMPRESSION: + file->nbytes = pglz_decompress(buff,nbytes, + file->buffer.data,sizeof(file->buffer),false); + break; + } + file->curOffset += nread; + + if (file->nbytes < 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg_internal("compressed lz4 data is corrupt"))); + } + + } + if (file->nbytes < 0) { file->nbytes = 0; @@ -494,9 +622,61 @@ static void BufFileDumpBuffer(BufFile *file) { int wpos = 0; - int bytestowrite; + int bytestowrite = 0; File thisfile; + + /* Save nbytes value because the size changes due to compression */ + int nbytesOriginal = file->nbytes; + + char * DataToWrite; + DataToWrite = file->buffer.data; + + /* + * Prepare compressed data to write + * size of compressed block needs to be added at the beggining of the + * compressed data + */ + + + if (file->compress) { + char * cData; + int cSize = 0; + + Assert(file->cBuffer != NULL); + cData = file->cBuffer; + + switch (temp_file_compression) + { + case TEMP_LZ4_COMPRESSION: + { +#ifdef USE_LZ4 + int cBufferSize = LZ4_compressBound(file->nbytes); + /* + * Using stream compression would lead to the slight improvement in + * compression ratio + */ + cSize = LZ4_compress_default(file->buffer.data, + cData + sizeof(int),file->nbytes, cBufferSize); +#endif + break; + } + case TEMP_PGLZ_COMPRESSION: + cSize = pglz_compress(file->buffer.data,file->nbytes, + cData + sizeof(int),PGLZ_strategy_always); + break; + } + + + /* Write size of compressed block in front of compressed data + * It's used to determine amount of data to read within + * decompression process + */ + memcpy(cData,&cSize,sizeof(int)); + file->nbytes=cSize + sizeof(int); + DataToWrite = cData; + } + /* * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it * crosses a component-file boundary; so we need a loop. @@ -535,7 +715,7 @@ BufFileDumpBuffer(BufFile *file) INSTR_TIME_SET_ZERO(io_start); bytestowrite = FileWrite(thisfile, - file->buffer.data + wpos, + DataToWrite + wpos, bytestowrite, file->curOffset, WAIT_EVENT_BUFFILE_WRITE); @@ -564,7 +744,19 @@ BufFileDumpBuffer(BufFile *file) * logical file position, ie, original value + pos, in case that is less * (as could happen due to a small backwards seek in a dirty buffer!) */ - file->curOffset -= (file->nbytes - file->pos); + + + if (!file->compress) + file->curOffset -= (file->nbytes - file->pos); + else + if (nbytesOriginal - file->pos != 0) + /* curOffset must be corrected also if compression is + * enabled, nbytes was changed by compression but we + * have to use the original value of nbytes + */ + file->curOffset-=bytestowrite; + + if (file->curOffset < 0) /* handle possible segment crossing */ { file->curFile--; @@ -577,6 +769,7 @@ BufFileDumpBuffer(BufFile *file) */ file->pos = 0; file->nbytes = 0; + } /* @@ -602,8 +795,14 @@ BufFileReadCommon(BufFile *file, void *ptr, size_t size, bool exact, bool eofOK) { if (file->pos >= file->nbytes) { - /* Try to load more data into buffer. */ - file->curOffset += file->pos; + /* Try to load more data into buffer. + * + * curOffset is moved within BufFileLoadBuffer + * because stored data size differs from loaded/ + * decompressed size + * */ + if (!file->compress) + file->curOffset += file->pos; file->pos = 0; file->nbytes = 0; BufFileLoadBuffer(file); diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 2f8cbd867599..f7fcc7e3d1d7 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -78,6 +78,7 @@ #include "replication/syncrep.h" #include "storage/aio.h" #include "storage/bufmgr.h" +#include "storage/buffile.h" #include "storage/bufpage.h" #include "storage/copydir.h" #include "storage/io_worker.h" @@ -463,6 +464,18 @@ static const struct config_enum_entry default_toast_compression_options[] = { #endif {NULL, 0, false} }; +/* + * pglz and zstd support should be added as future enhancement + * + */ +static const struct config_enum_entry temp_file_compression_options[] = { + {"no", TEMP_NONE_COMPRESSION, false}, + {"pglz", TEMP_PGLZ_COMPRESSION, false}, +#ifdef USE_LZ4 + {"lz4", TEMP_LZ4_COMPRESSION, false}, +#endif + {NULL, 0, false} +}; static const struct config_enum_entry wal_compression_options[] = { {"pglz", WAL_COMPRESSION_PGLZ, false}, @@ -5058,6 +5071,17 @@ struct config_enum ConfigureNamesEnum[] = NULL, NULL, NULL }, + { + {"temp_file_compression", PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Sets the default compression method for compressible values."), + NULL + }, + &temp_file_compression, + TEMP_NONE_COMPRESSION, + temp_file_compression_options, + NULL, NULL, NULL + }, + { {"default_transaction_isolation", PGC_USERSET, CLIENT_CONN_STATEMENT, gettext_noop("Sets the transaction isolation level of each new transaction."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 63f991c4f930..51f2f106bfb4 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -180,6 +180,7 @@ #max_notify_queue_pages = 1048576 # limits the number of SLRU pages allocated # for NOTIFY / LISTEN queue +#temp_file_compression = 'no' # enables temporary files compression #file_copy_method = copy # the default is the first option # copy diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c index e529ceb8260b..d862e22ef186 100644 --- a/src/backend/utils/sort/logtape.c +++ b/src/backend/utils/sort/logtape.c @@ -592,7 +592,7 @@ LogicalTapeSetCreate(bool preallocate, SharedFileSet *fileset, int worker) lts->pfile = BufFileCreateFileSet(&fileset->fs, filename); } else - lts->pfile = BufFileCreateTemp(false); + lts->pfile = BufFileCreateTemp(false, false); return lts; } diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index c9aecab8d66c..ef85924cd21d 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -860,7 +860,7 @@ tuplestore_puttuple_common(Tuplestorestate *state, void *tuple) */ oldcxt = MemoryContextSwitchTo(state->context->parent); - state->myfile = BufFileCreateTemp(state->interXact); + state->myfile = BufFileCreateTemp(state->interXact, false); MemoryContextSwitchTo(oldcxt); diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h index a2f4821f240b..931a211038bd 100644 --- a/src/include/storage/buffile.h +++ b/src/include/storage/buffile.h @@ -32,11 +32,22 @@ typedef struct BufFile BufFile; +typedef enum +{ + TEMP_NONE_COMPRESSION, + TEMP_PGLZ_COMPRESSION, + TEMP_LZ4_COMPRESSION +} TempCompression; + +extern PGDLLIMPORT int temp_file_compression; + + /* * prototypes for functions in buffile.c */ -extern BufFile *BufFileCreateTemp(bool interXact); +extern BufFile *BufFileCreateCompressTemp(bool interXact); +extern BufFile *BufFileCreateTemp(bool interXact, bool compress); extern void BufFileClose(BufFile *file); pg_nodiscard extern size_t BufFileRead(BufFile *file, void *ptr, size_t size); extern void BufFileReadExact(BufFile *file, void *ptr, size_t size); From e2334c112d5e3bb01b128cbc8ec887f4ca70f859 Mon Sep 17 00:00:00 2001 From: Filip Janus Date: Wed, 16 Apr 2025 12:08:28 +0200 Subject: [PATCH 2/2] Add test for temporary files compression, this commit adds tests for lz4 and pglz. --- src/test/regress/GNUmakefile | 4 + src/test/regress/expected/join_hash_lz4.out | 1166 ++++++++++++++++++ src/test/regress/expected/join_hash_pglz.out | 1166 ++++++++++++++++++ src/test/regress/parallel_schedule | 4 +- src/test/regress/sql/join_hash_lz4.sql | 626 ++++++++++ src/test/regress/sql/join_hash_pglz.sql | 626 ++++++++++ 6 files changed, 3591 insertions(+), 1 deletion(-) create mode 100644 src/test/regress/expected/join_hash_lz4.out create mode 100644 src/test/regress/expected/join_hash_pglz.out create mode 100644 src/test/regress/sql/join_hash_lz4.sql create mode 100644 src/test/regress/sql/join_hash_pglz.sql diff --git a/src/test/regress/GNUmakefile b/src/test/regress/GNUmakefile index ef2bddf42cab..00757a44ca69 100644 --- a/src/test/regress/GNUmakefile +++ b/src/test/regress/GNUmakefile @@ -94,6 +94,10 @@ installdirs-tests: installdirs REGRESS_OPTS = --dlpath=. --max-concurrent-tests=20 \ $(EXTRA_REGRESS_OPTS) +ifeq ($(with_lz4),yes) +override EXTRA_TESTS := join_hash_lz4 $(EXTRA_TESTS) +endif + check: all $(pg_regress_check) $(REGRESS_OPTS) --schedule=$(srcdir)/parallel_schedule $(MAXCONNOPT) $(EXTRA_TESTS) diff --git a/src/test/regress/expected/join_hash_lz4.out b/src/test/regress/expected/join_hash_lz4.out new file mode 100644 index 000000000000..966a5cd8f553 --- /dev/null +++ b/src/test/regress/expected/join_hash_lz4.out @@ -0,0 +1,1166 @@ +-- +-- exercises for the hash join code +-- +begin; +set local min_parallel_table_scan_size = 0; +set local parallel_setup_cost = 0; +set local enable_hashjoin = on; +set local temp_file_compression = 'lz4'; +-- Extract bucket and batch counts from an explain analyze plan. In +-- general we can't make assertions about how many batches (or +-- buckets) will be required because it can vary, but we can in some +-- special cases and we can check for growth. +create or replace function find_hash(node json) +returns json language plpgsql +as +$$ +declare + x json; + child json; +begin + if node->>'Node Type' = 'Hash' then + return node; + else + for child in select json_array_elements(node->'Plans') + loop + x := find_hash(child); + if x is not null then + return x; + end if; + end loop; + return null; + end if; +end; +$$; +create or replace function hash_join_batches(query text) +returns table (original int, final int) language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + original := hash_node->>'Original Hash Batches'; + final := hash_node->>'Hash Batches'; + return next; + end loop; +end; +$$; +-- Make a simple relation with well distributed keys and correctly +-- estimated size. +create table simple as + select generate_series(1, 20000) AS id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table simple set (parallel_workers = 2); +analyze simple; +-- Make a relation whose size we will under-estimate. We want stats +-- to say 1000 rows, but actually there are 20,000 rows. +create table bigger_than_it_looks as + select generate_series(1, 20000) as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table bigger_than_it_looks set (autovacuum_enabled = 'false'); +alter table bigger_than_it_looks set (parallel_workers = 2); +analyze bigger_than_it_looks; +update pg_class set reltuples = 1000 where relname = 'bigger_than_it_looks'; +-- Make a relation whose size we underestimate and that also has a +-- kind of skew that breaks our batching scheme. We want stats to say +-- 2 rows, but actually there are 20,000 rows with the same key. +create table extremely_skewed (id int, t text); +alter table extremely_skewed set (autovacuum_enabled = 'false'); +alter table extremely_skewed set (parallel_workers = 2); +analyze extremely_skewed; +insert into extremely_skewed + select 42 as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' + from generate_series(1, 20000); +update pg_class + set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 + where relname = 'extremely_skewed'; +-- Make a relation with a couple of enormous tuples. +create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t; +alter table wide set (parallel_workers = 2); +-- The "optimal" case: the hash table fits in memory; we plan for 1 +-- batch, we stick to that number, and peak memory usage stays within +-- our work_mem budget +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +---------------------------------------- + Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(6 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | f +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(9 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | f +(1 row) + +rollback to settings; +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +------------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on simple s +(9 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | f +(1 row) + +rollback to settings; +-- The "good" case: batches required, but we plan the right number; we +-- plan for some number of batches, and we stick to that number, and +-- peak memory usage says within our work_mem budget +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +---------------------------------------- + Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(6 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + t | f +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(9 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + t | f +(1 row) + +rollback to settings; +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '192kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +------------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on simple s +(9 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + t | f +(1 row) + +-- parallel full multi-batch hash join +select count(*) from simple r full outer join simple s using (id); + count +------- + 20000 +(1 row) + +rollback to settings; +-- The "bad" case: during execution we need to increase number of +-- batches; in this case we plan for 1 batch, and increase at least a +-- couple of times, and peak memory usage stays within our work_mem +-- budget +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); + QUERY PLAN +------------------------------------------------------ + Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on bigger_than_it_looks s +(6 rows) + +select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | t +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join bigger_than_it_looks s using (id); + QUERY PLAN +------------------------------------------------------------------ + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Hash + -> Seq Scan on bigger_than_it_looks s +(9 rows) + +select count(*) from simple r join bigger_than_it_looks s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join bigger_than_it_looks s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | t +(1 row) + +rollback to settings; +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 1; +set local work_mem = '192kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join bigger_than_it_looks s using (id); + QUERY PLAN +--------------------------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 1 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on bigger_than_it_looks s +(9 rows) + +select count(*) from simple r join bigger_than_it_looks s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join bigger_than_it_looks s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | t +(1 row) + +rollback to settings; +-- The "ugly" case: increasing the number of batches during execution +-- doesn't help, so stop trying to fit in work_mem and hope for the +-- best; in this case we plan for 1 batch, increases just once and +-- then stop increasing because that didn't help at all, so we blow +-- right through the work_mem budget and hope for the best... +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); + QUERY PLAN +-------------------------------------------------- + Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on extremely_skewed s +(6 rows) + +select count(*) from simple r join extremely_skewed s using (id); + count +------- + 20000 +(1 row) + +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); + original | final +----------+------- + 1 | 2 +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); + QUERY PLAN +-------------------------------------------------------- + Aggregate + -> Gather + Workers Planned: 2 + -> Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Hash + -> Seq Scan on extremely_skewed s +(8 rows) + +select count(*) from simple r join extremely_skewed s using (id); + count +------- + 20000 +(1 row) + +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); + original | final +----------+------- + 1 | 2 +(1 row) + +rollback to settings; +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 1; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); + QUERY PLAN +----------------------------------------------------------------- + Aggregate + -> Gather + Workers Planned: 1 + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on extremely_skewed s +(8 rows) + +select count(*) from simple r join extremely_skewed s using (id); + count +------- + 20000 +(1 row) + +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); + original | final +----------+------- + 1 | 4 +(1 row) + +rollback to settings; +-- A couple of other hash join tests unrelated to work_mem management. +-- Check that EXPLAIN ANALYZE has data even if the leader doesn't participate +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +set local parallel_leader_participation = off; +select * from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + original | final +----------+------- + 1 | 1 +(1 row) + +rollback to settings; +-- Exercise rescans. We'll turn off parallel_leader_participation so +-- that we can check that instrumentation comes back correctly. +create table join_foo as select generate_series(1, 3) as id, 'xxxxx'::text as t; +alter table join_foo set (parallel_workers = 0); +create table join_bar as select generate_series(1, 10000) as id, 'xxxxx'::text as t; +alter table join_bar set (parallel_workers = 2); +-- multi-batch with rescan, parallel-oblivious +savepoint settings; +set enable_parallel_hash = off; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '64kB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Left Join + Join Filter: ((join_foo.id < (b1.id + 1)) AND (join_foo.id > (b1.id - 1))) + -> Seq Scan on join_foo + -> Gather + Workers Planned: 2 + -> Hash Join + Hash Cond: (b1.id = b2.id) + -> Parallel Seq Scan on join_bar b1 + -> Hash + -> Seq Scan on join_bar b2 +(11 rows) + +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + count +------- + 3 +(1 row) + +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); + multibatch +------------ + t +(1 row) + +rollback to settings; +-- single-batch with rescan, parallel-oblivious +savepoint settings; +set enable_parallel_hash = off; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '4MB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Left Join + Join Filter: ((join_foo.id < (b1.id + 1)) AND (join_foo.id > (b1.id - 1))) + -> Seq Scan on join_foo + -> Gather + Workers Planned: 2 + -> Hash Join + Hash Cond: (b1.id = b2.id) + -> Parallel Seq Scan on join_bar b1 + -> Hash + -> Seq Scan on join_bar b2 +(11 rows) + +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + count +------- + 3 +(1 row) + +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); + multibatch +------------ + f +(1 row) + +rollback to settings; +-- multi-batch with rescan, parallel-aware +savepoint settings; +set enable_parallel_hash = on; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '64kB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Left Join + Join Filter: ((join_foo.id < (b1.id + 1)) AND (join_foo.id > (b1.id - 1))) + -> Seq Scan on join_foo + -> Gather + Workers Planned: 2 + -> Parallel Hash Join + Hash Cond: (b1.id = b2.id) + -> Parallel Seq Scan on join_bar b1 + -> Parallel Hash + -> Parallel Seq Scan on join_bar b2 +(11 rows) + +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + count +------- + 3 +(1 row) + +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); + multibatch +------------ + t +(1 row) + +rollback to settings; +-- single-batch with rescan, parallel-aware +savepoint settings; +set enable_parallel_hash = on; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '4MB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Left Join + Join Filter: ((join_foo.id < (b1.id + 1)) AND (join_foo.id > (b1.id - 1))) + -> Seq Scan on join_foo + -> Gather + Workers Planned: 2 + -> Parallel Hash Join + Hash Cond: (b1.id = b2.id) + -> Parallel Seq Scan on join_bar b1 + -> Parallel Hash + -> Parallel Seq Scan on join_bar b2 +(11 rows) + +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + count +------- + 3 +(1 row) + +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); + multibatch +------------ + f +(1 row) + +rollback to settings; +-- A full outer join where every record is matched. +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); + QUERY PLAN +---------------------------------------- + Aggregate + -> Hash Full Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(6 rows) + +select count(*) from simple r full outer join simple s using (id); + count +------- + 20000 +(1 row) + +rollback to settings; +-- parallelism not possible with parallel-oblivious full hash join +savepoint settings; +set enable_parallel_hash = off; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); + QUERY PLAN +---------------------------------------- + Aggregate + -> Hash Full Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(6 rows) + +select count(*) from simple r full outer join simple s using (id); + count +------- + 20000 +(1 row) + +rollback to settings; +-- parallelism is possible with parallel-aware full hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); + QUERY PLAN +------------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Full Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on simple s +(9 rows) + +select count(*) from simple r full outer join simple s using (id); + count +------- + 20000 +(1 row) + +rollback to settings; +-- A full outer join where every record is not matched. +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +explain (costs off) + select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); + QUERY PLAN +---------------------------------------- + Aggregate + -> Hash Full Join + Hash Cond: ((0 - s.id) = r.id) + -> Seq Scan on simple s + -> Hash + -> Seq Scan on simple r +(6 rows) + +select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); + count +------- + 40000 +(1 row) + +rollback to settings; +-- parallelism not possible with parallel-oblivious full hash join +savepoint settings; +set enable_parallel_hash = off; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); + QUERY PLAN +---------------------------------------- + Aggregate + -> Hash Full Join + Hash Cond: ((0 - s.id) = r.id) + -> Seq Scan on simple s + -> Hash + -> Seq Scan on simple r +(6 rows) + +select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); + count +------- + 40000 +(1 row) + +rollback to settings; +-- parallelism is possible with parallel-aware full hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); + QUERY PLAN +------------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Full Join + Hash Cond: ((0 - s.id) = r.id) + -> Parallel Seq Scan on simple s + -> Parallel Hash + -> Parallel Seq Scan on simple r +(9 rows) + +select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); + count +------- + 40000 +(1 row) + +rollback to settings; +-- exercise special code paths for huge tuples (note use of non-strict +-- expression and left join required to get the detoasted tuple into +-- the hash table) +-- parallel with parallel-aware hash join (hits ExecParallelHashLoadTuple and +-- sts_puttuple oversized tuple cases because it's multi-batch) +savepoint settings; +set max_parallel_workers_per_gather = 2; +set enable_parallel_hash = on; +set work_mem = '128kB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select length(max(s.t)) + from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + QUERY PLAN +---------------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Left Join + Hash Cond: (wide.id = wide_1.id) + -> Parallel Seq Scan on wide + -> Parallel Hash + -> Parallel Seq Scan on wide wide_1 +(9 rows) + +select length(max(s.t)) +from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + length +-------- + 320000 +(1 row) + +select final > 1 as multibatch + from hash_join_batches( +$$ + select length(max(s.t)) + from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +$$); + multibatch +------------ + t +(1 row) + +rollback to settings; +-- Hash join reuses the HOT status bit to indicate match status. This can only +-- be guaranteed to produce correct results if all the hash join tuple match +-- bits are reset before reuse. This is done upon loading them into the +-- hashtable. +SAVEPOINT settings; +SET enable_parallel_hash = on; +SET min_parallel_table_scan_size = 0; +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +CREATE TABLE hjtest_matchbits_t1(id int); +CREATE TABLE hjtest_matchbits_t2(id int); +INSERT INTO hjtest_matchbits_t1 VALUES (1); +INSERT INTO hjtest_matchbits_t2 VALUES (2); +-- Update should create a HOT tuple. If this status bit isn't cleared, we won't +-- correctly emit the NULL-extended unmatching tuple in full hash join. +UPDATE hjtest_matchbits_t2 set id = 2; +SELECT * FROM hjtest_matchbits_t1 t1 FULL JOIN hjtest_matchbits_t2 t2 ON t1.id = t2.id + ORDER BY t1.id; + id | id +----+---- + 1 | + | 2 +(2 rows) + +-- Test serial full hash join. +-- Resetting parallel_setup_cost should force a serial plan. +-- Just to be safe, however, set enable_parallel_hash to off, as parallel full +-- hash joins are only supported with shared hashtables. +RESET parallel_setup_cost; +SET enable_parallel_hash = off; +SELECT * FROM hjtest_matchbits_t1 t1 FULL JOIN hjtest_matchbits_t2 t2 ON t1.id = t2.id; + id | id +----+---- + 1 | + | 2 +(2 rows) + +ROLLBACK TO settings; +rollback; +-- Verify that hash key expressions reference the correct +-- nodes. Hashjoin's hashkeys need to reference its outer plan, Hash's +-- need to reference Hash's outer plan (which is below HashJoin's +-- inner plan). It's not trivial to verify that the references are +-- correct (we don't display the hashkeys themselves), but if the +-- hashkeys contain subplan references, those will be displayed. Force +-- subplans to appear just about everywhere. +-- +-- Bug report: +-- https://www.postgresql.org/message-id/CAPpHfdvGVegF_TKKRiBrSmatJL2dR9uwFCuR%2BteQ_8tEXU8mxg%40mail.gmail.com +-- +BEGIN; +SET LOCAL enable_sort = OFF; -- avoid mergejoins +SET LOCAL from_collapse_limit = 1; -- allows easy changing of join order +CREATE TABLE hjtest_1 (a text, b int, id int, c bool); +CREATE TABLE hjtest_2 (a bool, id int, b text, c int); +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 2, 1, false); -- matches +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 1, 2, false); -- fails id join condition +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 20, 1, false); -- fails < 50 +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 1, 1, false); -- fails (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'another', 2); -- matches +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 3, 'another', 7); -- fails id join condition +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'another', 90); -- fails < 55 +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'another', 3); -- fails (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'text', 1); -- fails hjtest_1.a <> hjtest_2.b; +EXPLAIN (COSTS OFF, VERBOSE) +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_1, hjtest_2 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + QUERY PLAN +------------------------------------------------------------------------------------------------ + Hash Join + Output: hjtest_1.a, hjtest_2.a, (hjtest_1.tableoid)::regclass, (hjtest_2.tableoid)::regclass + Hash Cond: ((hjtest_1.id = (SubPlan 1)) AND ((SubPlan 2) = (SubPlan 3))) + Join Filter: (hjtest_1.a <> hjtest_2.b) + -> Seq Scan on public.hjtest_1 + Output: hjtest_1.a, hjtest_1.tableoid, hjtest_1.id, hjtest_1.b + Filter: ((SubPlan 4) < 50) + SubPlan 4 + -> Result + Output: (hjtest_1.b * 5) + -> Hash + Output: hjtest_2.a, hjtest_2.tableoid, hjtest_2.id, hjtest_2.c, hjtest_2.b + -> Seq Scan on public.hjtest_2 + Output: hjtest_2.a, hjtest_2.tableoid, hjtest_2.id, hjtest_2.c, hjtest_2.b + Filter: ((SubPlan 5) < 55) + SubPlan 5 + -> Result + Output: (hjtest_2.c * 5) + SubPlan 1 + -> Result + Output: 1 + One-Time Filter: (hjtest_2.id = 1) + SubPlan 3 + -> Result + Output: (hjtest_2.c * 5) + SubPlan 2 + -> Result + Output: (hjtest_1.b * 5) +(28 rows) + +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_1, hjtest_2 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + a1 | a2 | t1 | t2 +------+----+----------+---------- + text | t | hjtest_1 | hjtest_2 +(1 row) + +EXPLAIN (COSTS OFF, VERBOSE) +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_2, hjtest_1 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + QUERY PLAN +------------------------------------------------------------------------------------------------ + Hash Join + Output: hjtest_1.a, hjtest_2.a, (hjtest_1.tableoid)::regclass, (hjtest_2.tableoid)::regclass + Hash Cond: (((SubPlan 1) = hjtest_1.id) AND ((SubPlan 3) = (SubPlan 2))) + Join Filter: (hjtest_1.a <> hjtest_2.b) + -> Seq Scan on public.hjtest_2 + Output: hjtest_2.a, hjtest_2.tableoid, hjtest_2.id, hjtest_2.c, hjtest_2.b + Filter: ((SubPlan 5) < 55) + SubPlan 5 + -> Result + Output: (hjtest_2.c * 5) + -> Hash + Output: hjtest_1.a, hjtest_1.tableoid, hjtest_1.id, hjtest_1.b + -> Seq Scan on public.hjtest_1 + Output: hjtest_1.a, hjtest_1.tableoid, hjtest_1.id, hjtest_1.b + Filter: ((SubPlan 4) < 50) + SubPlan 4 + -> Result + Output: (hjtest_1.b * 5) + SubPlan 2 + -> Result + Output: (hjtest_1.b * 5) + SubPlan 1 + -> Result + Output: 1 + One-Time Filter: (hjtest_2.id = 1) + SubPlan 3 + -> Result + Output: (hjtest_2.c * 5) +(28 rows) + +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_2, hjtest_1 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + a1 | a2 | t1 | t2 +------+----+----------+---------- + text | t | hjtest_1 | hjtest_2 +(1 row) + +ROLLBACK; +-- Verify that we behave sanely when the inner hash keys contain parameters +-- (that is, outer or lateral references). This situation has to defeat +-- re-use of the inner hash table across rescans. +begin; +set local enable_hashjoin = on; +explain (costs off) +select i8.q2, ss.* from +int8_tbl i8, +lateral (select t1.fivethous, i4.f1 from tenk1 t1 join int4_tbl i4 + on t1.fivethous = i4.f1+i8.q2 order by 1,2) ss; + QUERY PLAN +----------------------------------------------------------- + Nested Loop + -> Seq Scan on int8_tbl i8 + -> Sort + Sort Key: t1.fivethous, i4.f1 + -> Hash Join + Hash Cond: (t1.fivethous = (i4.f1 + i8.q2)) + -> Seq Scan on tenk1 t1 + -> Hash + -> Seq Scan on int4_tbl i4 +(9 rows) + +select i8.q2, ss.* from +int8_tbl i8, +lateral (select t1.fivethous, i4.f1 from tenk1 t1 join int4_tbl i4 + on t1.fivethous = i4.f1+i8.q2 order by 1,2) ss; + q2 | fivethous | f1 +-----+-----------+---- + 456 | 456 | 0 + 456 | 456 | 0 + 123 | 123 | 0 + 123 | 123 | 0 +(4 rows) + +rollback; diff --git a/src/test/regress/expected/join_hash_pglz.out b/src/test/regress/expected/join_hash_pglz.out new file mode 100644 index 000000000000..99c67f982af9 --- /dev/null +++ b/src/test/regress/expected/join_hash_pglz.out @@ -0,0 +1,1166 @@ +-- +-- exercises for the hash join code +-- +begin; +set local min_parallel_table_scan_size = 0; +set local parallel_setup_cost = 0; +set local enable_hashjoin = on; +set local temp_file_compression = 'pglz'; +-- Extract bucket and batch counts from an explain analyze plan. In +-- general we can't make assertions about how many batches (or +-- buckets) will be required because it can vary, but we can in some +-- special cases and we can check for growth. +create or replace function find_hash(node json) +returns json language plpgsql +as +$$ +declare + x json; + child json; +begin + if node->>'Node Type' = 'Hash' then + return node; + else + for child in select json_array_elements(node->'Plans') + loop + x := find_hash(child); + if x is not null then + return x; + end if; + end loop; + return null; + end if; +end; +$$; +create or replace function hash_join_batches(query text) +returns table (original int, final int) language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + original := hash_node->>'Original Hash Batches'; + final := hash_node->>'Hash Batches'; + return next; + end loop; +end; +$$; +-- Make a simple relation with well distributed keys and correctly +-- estimated size. +create table simple as + select generate_series(1, 20000) AS id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table simple set (parallel_workers = 2); +analyze simple; +-- Make a relation whose size we will under-estimate. We want stats +-- to say 1000 rows, but actually there are 20,000 rows. +create table bigger_than_it_looks as + select generate_series(1, 20000) as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table bigger_than_it_looks set (autovacuum_enabled = 'false'); +alter table bigger_than_it_looks set (parallel_workers = 2); +analyze bigger_than_it_looks; +update pg_class set reltuples = 1000 where relname = 'bigger_than_it_looks'; +-- Make a relation whose size we underestimate and that also has a +-- kind of skew that breaks our batching scheme. We want stats to say +-- 2 rows, but actually there are 20,000 rows with the same key. +create table extremely_skewed (id int, t text); +alter table extremely_skewed set (autovacuum_enabled = 'false'); +alter table extremely_skewed set (parallel_workers = 2); +analyze extremely_skewed; +insert into extremely_skewed + select 42 as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' + from generate_series(1, 20000); +update pg_class + set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 + where relname = 'extremely_skewed'; +-- Make a relation with a couple of enormous tuples. +create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t; +alter table wide set (parallel_workers = 2); +-- The "optimal" case: the hash table fits in memory; we plan for 1 +-- batch, we stick to that number, and peak memory usage stays within +-- our work_mem budget +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +---------------------------------------- + Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(6 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | f +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(9 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | f +(1 row) + +rollback to settings; +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +------------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on simple s +(9 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | f +(1 row) + +rollback to settings; +-- The "good" case: batches required, but we plan the right number; we +-- plan for some number of batches, and we stick to that number, and +-- peak memory usage says within our work_mem budget +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +---------------------------------------- + Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(6 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + t | f +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(9 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + t | f +(1 row) + +rollback to settings; +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '192kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +------------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on simple s +(9 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + t | f +(1 row) + +-- parallel full multi-batch hash join +select count(*) from simple r full outer join simple s using (id); + count +------- + 20000 +(1 row) + +rollback to settings; +-- The "bad" case: during execution we need to increase number of +-- batches; in this case we plan for 1 batch, and increase at least a +-- couple of times, and peak memory usage stays within our work_mem +-- budget +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); + QUERY PLAN +------------------------------------------------------ + Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on bigger_than_it_looks s +(6 rows) + +select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | t +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join bigger_than_it_looks s using (id); + QUERY PLAN +------------------------------------------------------------------ + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Hash + -> Seq Scan on bigger_than_it_looks s +(9 rows) + +select count(*) from simple r join bigger_than_it_looks s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join bigger_than_it_looks s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | t +(1 row) + +rollback to settings; +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 1; +set local work_mem = '192kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join bigger_than_it_looks s using (id); + QUERY PLAN +--------------------------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 1 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on bigger_than_it_looks s +(9 rows) + +select count(*) from simple r join bigger_than_it_looks s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join bigger_than_it_looks s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | t +(1 row) + +rollback to settings; +-- The "ugly" case: increasing the number of batches during execution +-- doesn't help, so stop trying to fit in work_mem and hope for the +-- best; in this case we plan for 1 batch, increases just once and +-- then stop increasing because that didn't help at all, so we blow +-- right through the work_mem budget and hope for the best... +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); + QUERY PLAN +-------------------------------------------------- + Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on extremely_skewed s +(6 rows) + +select count(*) from simple r join extremely_skewed s using (id); + count +------- + 20000 +(1 row) + +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); + original | final +----------+------- + 1 | 2 +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); + QUERY PLAN +-------------------------------------------------------- + Aggregate + -> Gather + Workers Planned: 2 + -> Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Hash + -> Seq Scan on extremely_skewed s +(8 rows) + +select count(*) from simple r join extremely_skewed s using (id); + count +------- + 20000 +(1 row) + +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); + original | final +----------+------- + 1 | 2 +(1 row) + +rollback to settings; +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 1; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); + QUERY PLAN +----------------------------------------------------------------- + Aggregate + -> Gather + Workers Planned: 1 + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on extremely_skewed s +(8 rows) + +select count(*) from simple r join extremely_skewed s using (id); + count +------- + 20000 +(1 row) + +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); + original | final +----------+------- + 1 | 4 +(1 row) + +rollback to settings; +-- A couple of other hash join tests unrelated to work_mem management. +-- Check that EXPLAIN ANALYZE has data even if the leader doesn't participate +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +set local parallel_leader_participation = off; +select * from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + original | final +----------+------- + 1 | 1 +(1 row) + +rollback to settings; +-- Exercise rescans. We'll turn off parallel_leader_participation so +-- that we can check that instrumentation comes back correctly. +create table join_foo as select generate_series(1, 3) as id, 'xxxxx'::text as t; +alter table join_foo set (parallel_workers = 0); +create table join_bar as select generate_series(1, 10000) as id, 'xxxxx'::text as t; +alter table join_bar set (parallel_workers = 2); +-- multi-batch with rescan, parallel-oblivious +savepoint settings; +set enable_parallel_hash = off; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '64kB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Left Join + Join Filter: ((join_foo.id < (b1.id + 1)) AND (join_foo.id > (b1.id - 1))) + -> Seq Scan on join_foo + -> Gather + Workers Planned: 2 + -> Hash Join + Hash Cond: (b1.id = b2.id) + -> Parallel Seq Scan on join_bar b1 + -> Hash + -> Seq Scan on join_bar b2 +(11 rows) + +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + count +------- + 3 +(1 row) + +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); + multibatch +------------ + t +(1 row) + +rollback to settings; +-- single-batch with rescan, parallel-oblivious +savepoint settings; +set enable_parallel_hash = off; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '4MB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Left Join + Join Filter: ((join_foo.id < (b1.id + 1)) AND (join_foo.id > (b1.id - 1))) + -> Seq Scan on join_foo + -> Gather + Workers Planned: 2 + -> Hash Join + Hash Cond: (b1.id = b2.id) + -> Parallel Seq Scan on join_bar b1 + -> Hash + -> Seq Scan on join_bar b2 +(11 rows) + +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + count +------- + 3 +(1 row) + +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); + multibatch +------------ + f +(1 row) + +rollback to settings; +-- multi-batch with rescan, parallel-aware +savepoint settings; +set enable_parallel_hash = on; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '64kB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Left Join + Join Filter: ((join_foo.id < (b1.id + 1)) AND (join_foo.id > (b1.id - 1))) + -> Seq Scan on join_foo + -> Gather + Workers Planned: 2 + -> Parallel Hash Join + Hash Cond: (b1.id = b2.id) + -> Parallel Seq Scan on join_bar b1 + -> Parallel Hash + -> Parallel Seq Scan on join_bar b2 +(11 rows) + +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + count +------- + 3 +(1 row) + +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); + multibatch +------------ + t +(1 row) + +rollback to settings; +-- single-batch with rescan, parallel-aware +savepoint settings; +set enable_parallel_hash = on; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '4MB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Left Join + Join Filter: ((join_foo.id < (b1.id + 1)) AND (join_foo.id > (b1.id - 1))) + -> Seq Scan on join_foo + -> Gather + Workers Planned: 2 + -> Parallel Hash Join + Hash Cond: (b1.id = b2.id) + -> Parallel Seq Scan on join_bar b1 + -> Parallel Hash + -> Parallel Seq Scan on join_bar b2 +(11 rows) + +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; + count +------- + 3 +(1 row) + +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); + multibatch +------------ + f +(1 row) + +rollback to settings; +-- A full outer join where every record is matched. +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); + QUERY PLAN +---------------------------------------- + Aggregate + -> Hash Full Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(6 rows) + +select count(*) from simple r full outer join simple s using (id); + count +------- + 20000 +(1 row) + +rollback to settings; +-- parallelism not possible with parallel-oblivious full hash join +savepoint settings; +set enable_parallel_hash = off; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); + QUERY PLAN +---------------------------------------- + Aggregate + -> Hash Full Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(6 rows) + +select count(*) from simple r full outer join simple s using (id); + count +------- + 20000 +(1 row) + +rollback to settings; +-- parallelism is possible with parallel-aware full hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); + QUERY PLAN +------------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Full Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on simple s +(9 rows) + +select count(*) from simple r full outer join simple s using (id); + count +------- + 20000 +(1 row) + +rollback to settings; +-- A full outer join where every record is not matched. +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +explain (costs off) + select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); + QUERY PLAN +---------------------------------------- + Aggregate + -> Hash Full Join + Hash Cond: ((0 - s.id) = r.id) + -> Seq Scan on simple s + -> Hash + -> Seq Scan on simple r +(6 rows) + +select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); + count +------- + 40000 +(1 row) + +rollback to settings; +-- parallelism not possible with parallel-oblivious full hash join +savepoint settings; +set enable_parallel_hash = off; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); + QUERY PLAN +---------------------------------------- + Aggregate + -> Hash Full Join + Hash Cond: ((0 - s.id) = r.id) + -> Seq Scan on simple s + -> Hash + -> Seq Scan on simple r +(6 rows) + +select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); + count +------- + 40000 +(1 row) + +rollback to settings; +-- parallelism is possible with parallel-aware full hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); + QUERY PLAN +------------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Full Join + Hash Cond: ((0 - s.id) = r.id) + -> Parallel Seq Scan on simple s + -> Parallel Hash + -> Parallel Seq Scan on simple r +(9 rows) + +select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); + count +------- + 40000 +(1 row) + +rollback to settings; +-- exercise special code paths for huge tuples (note use of non-strict +-- expression and left join required to get the detoasted tuple into +-- the hash table) +-- parallel with parallel-aware hash join (hits ExecParallelHashLoadTuple and +-- sts_puttuple oversized tuple cases because it's multi-batch) +savepoint settings; +set max_parallel_workers_per_gather = 2; +set enable_parallel_hash = on; +set work_mem = '128kB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select length(max(s.t)) + from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + QUERY PLAN +---------------------------------------------------------------- + Finalize Aggregate + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Left Join + Hash Cond: (wide.id = wide_1.id) + -> Parallel Seq Scan on wide + -> Parallel Hash + -> Parallel Seq Scan on wide wide_1 +(9 rows) + +select length(max(s.t)) +from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); + length +-------- + 320000 +(1 row) + +select final > 1 as multibatch + from hash_join_batches( +$$ + select length(max(s.t)) + from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +$$); + multibatch +------------ + t +(1 row) + +rollback to settings; +-- Hash join reuses the HOT status bit to indicate match status. This can only +-- be guaranteed to produce correct results if all the hash join tuple match +-- bits are reset before reuse. This is done upon loading them into the +-- hashtable. +SAVEPOINT settings; +SET enable_parallel_hash = on; +SET min_parallel_table_scan_size = 0; +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +CREATE TABLE hjtest_matchbits_t1(id int); +CREATE TABLE hjtest_matchbits_t2(id int); +INSERT INTO hjtest_matchbits_t1 VALUES (1); +INSERT INTO hjtest_matchbits_t2 VALUES (2); +-- Update should create a HOT tuple. If this status bit isn't cleared, we won't +-- correctly emit the NULL-extended unmatching tuple in full hash join. +UPDATE hjtest_matchbits_t2 set id = 2; +SELECT * FROM hjtest_matchbits_t1 t1 FULL JOIN hjtest_matchbits_t2 t2 ON t1.id = t2.id + ORDER BY t1.id; + id | id +----+---- + 1 | + | 2 +(2 rows) + +-- Test serial full hash join. +-- Resetting parallel_setup_cost should force a serial plan. +-- Just to be safe, however, set enable_parallel_hash to off, as parallel full +-- hash joins are only supported with shared hashtables. +RESET parallel_setup_cost; +SET enable_parallel_hash = off; +SELECT * FROM hjtest_matchbits_t1 t1 FULL JOIN hjtest_matchbits_t2 t2 ON t1.id = t2.id; + id | id +----+---- + 1 | + | 2 +(2 rows) + +ROLLBACK TO settings; +rollback; +-- Verify that hash key expressions reference the correct +-- nodes. Hashjoin's hashkeys need to reference its outer plan, Hash's +-- need to reference Hash's outer plan (which is below HashJoin's +-- inner plan). It's not trivial to verify that the references are +-- correct (we don't display the hashkeys themselves), but if the +-- hashkeys contain subplan references, those will be displayed. Force +-- subplans to appear just about everywhere. +-- +-- Bug report: +-- https://www.postgresql.org/message-id/CAPpHfdvGVegF_TKKRiBrSmatJL2dR9uwFCuR%2BteQ_8tEXU8mxg%40mail.gmail.com +-- +BEGIN; +SET LOCAL enable_sort = OFF; -- avoid mergejoins +SET LOCAL from_collapse_limit = 1; -- allows easy changing of join order +CREATE TABLE hjtest_1 (a text, b int, id int, c bool); +CREATE TABLE hjtest_2 (a bool, id int, b text, c int); +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 2, 1, false); -- matches +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 1, 2, false); -- fails id join condition +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 20, 1, false); -- fails < 50 +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 1, 1, false); -- fails (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'another', 2); -- matches +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 3, 'another', 7); -- fails id join condition +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'another', 90); -- fails < 55 +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'another', 3); -- fails (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'text', 1); -- fails hjtest_1.a <> hjtest_2.b; +EXPLAIN (COSTS OFF, VERBOSE) +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_1, hjtest_2 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + QUERY PLAN +------------------------------------------------------------------------------------------------ + Hash Join + Output: hjtest_1.a, hjtest_2.a, (hjtest_1.tableoid)::regclass, (hjtest_2.tableoid)::regclass + Hash Cond: ((hjtest_1.id = (SubPlan 1)) AND ((SubPlan 2) = (SubPlan 3))) + Join Filter: (hjtest_1.a <> hjtest_2.b) + -> Seq Scan on public.hjtest_1 + Output: hjtest_1.a, hjtest_1.tableoid, hjtest_1.id, hjtest_1.b + Filter: ((SubPlan 4) < 50) + SubPlan 4 + -> Result + Output: (hjtest_1.b * 5) + -> Hash + Output: hjtest_2.a, hjtest_2.tableoid, hjtest_2.id, hjtest_2.c, hjtest_2.b + -> Seq Scan on public.hjtest_2 + Output: hjtest_2.a, hjtest_2.tableoid, hjtest_2.id, hjtest_2.c, hjtest_2.b + Filter: ((SubPlan 5) < 55) + SubPlan 5 + -> Result + Output: (hjtest_2.c * 5) + SubPlan 1 + -> Result + Output: 1 + One-Time Filter: (hjtest_2.id = 1) + SubPlan 3 + -> Result + Output: (hjtest_2.c * 5) + SubPlan 2 + -> Result + Output: (hjtest_1.b * 5) +(28 rows) + +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_1, hjtest_2 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + a1 | a2 | t1 | t2 +------+----+----------+---------- + text | t | hjtest_1 | hjtest_2 +(1 row) + +EXPLAIN (COSTS OFF, VERBOSE) +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_2, hjtest_1 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + QUERY PLAN +------------------------------------------------------------------------------------------------ + Hash Join + Output: hjtest_1.a, hjtest_2.a, (hjtest_1.tableoid)::regclass, (hjtest_2.tableoid)::regclass + Hash Cond: (((SubPlan 1) = hjtest_1.id) AND ((SubPlan 3) = (SubPlan 2))) + Join Filter: (hjtest_1.a <> hjtest_2.b) + -> Seq Scan on public.hjtest_2 + Output: hjtest_2.a, hjtest_2.tableoid, hjtest_2.id, hjtest_2.c, hjtest_2.b + Filter: ((SubPlan 5) < 55) + SubPlan 5 + -> Result + Output: (hjtest_2.c * 5) + -> Hash + Output: hjtest_1.a, hjtest_1.tableoid, hjtest_1.id, hjtest_1.b + -> Seq Scan on public.hjtest_1 + Output: hjtest_1.a, hjtest_1.tableoid, hjtest_1.id, hjtest_1.b + Filter: ((SubPlan 4) < 50) + SubPlan 4 + -> Result + Output: (hjtest_1.b * 5) + SubPlan 2 + -> Result + Output: (hjtest_1.b * 5) + SubPlan 1 + -> Result + Output: 1 + One-Time Filter: (hjtest_2.id = 1) + SubPlan 3 + -> Result + Output: (hjtest_2.c * 5) +(28 rows) + +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_2, hjtest_1 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + a1 | a2 | t1 | t2 +------+----+----------+---------- + text | t | hjtest_1 | hjtest_2 +(1 row) + +ROLLBACK; +-- Verify that we behave sanely when the inner hash keys contain parameters +-- (that is, outer or lateral references). This situation has to defeat +-- re-use of the inner hash table across rescans. +begin; +set local enable_hashjoin = on; +explain (costs off) +select i8.q2, ss.* from +int8_tbl i8, +lateral (select t1.fivethous, i4.f1 from tenk1 t1 join int4_tbl i4 + on t1.fivethous = i4.f1+i8.q2 order by 1,2) ss; + QUERY PLAN +----------------------------------------------------------- + Nested Loop + -> Seq Scan on int8_tbl i8 + -> Sort + Sort Key: t1.fivethous, i4.f1 + -> Hash Join + Hash Cond: (t1.fivethous = (i4.f1 + i8.q2)) + -> Seq Scan on tenk1 t1 + -> Hash + -> Seq Scan on int4_tbl i4 +(9 rows) + +select i8.q2, ss.* from +int8_tbl i8, +lateral (select t1.fivethous, i4.f1 from tenk1 t1 join int4_tbl i4 + on t1.fivethous = i4.f1+i8.q2 order by 1,2) ss; + q2 | fivethous | f1 +-----+-----------+---- + 456 | 456 | 0 + 456 | 456 | 0 + 123 | 123 | 0 + 123 | 123 | 0 +(4 rows) + +rollback; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index a424be2a6bf0..a7d280bf10b5 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -15,7 +15,6 @@ test: test_setup # The first group of parallel tests # ---------- test: boolean char name varchar text int2 int4 int8 oid float4 float8 bit numeric txid uuid enum money rangetypes pg_lsn regproc - # ---------- # The second group of parallel tests # multirangetypes depends on rangetypes @@ -140,3 +139,6 @@ test: fast_default # run tablespace test at the end because it drops the tablespace created during # setup that other tests may use. test: tablespace + +# this test is equivalent to join_hash test just the compression is enabled +test: join_hash_pglz diff --git a/src/test/regress/sql/join_hash_lz4.sql b/src/test/regress/sql/join_hash_lz4.sql new file mode 100644 index 000000000000..1d19c1980e1d --- /dev/null +++ b/src/test/regress/sql/join_hash_lz4.sql @@ -0,0 +1,626 @@ +-- +-- exercises for the hash join code +-- + +begin; + +set local min_parallel_table_scan_size = 0; +set local parallel_setup_cost = 0; +set local enable_hashjoin = on; +set local temp_file_compression = 'lz4'; + +-- Extract bucket and batch counts from an explain analyze plan. In +-- general we can't make assertions about how many batches (or +-- buckets) will be required because it can vary, but we can in some +-- special cases and we can check for growth. +create or replace function find_hash(node json) +returns json language plpgsql +as +$$ +declare + x json; + child json; +begin + if node->>'Node Type' = 'Hash' then + return node; + else + for child in select json_array_elements(node->'Plans') + loop + x := find_hash(child); + if x is not null then + return x; + end if; + end loop; + return null; + end if; +end; +$$; +create or replace function hash_join_batches(query text) +returns table (original int, final int) language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + original := hash_node->>'Original Hash Batches'; + final := hash_node->>'Hash Batches'; + return next; + end loop; +end; +$$; + +-- Make a simple relation with well distributed keys and correctly +-- estimated size. +create table simple as + select generate_series(1, 20000) AS id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table simple set (parallel_workers = 2); +analyze simple; + +-- Make a relation whose size we will under-estimate. We want stats +-- to say 1000 rows, but actually there are 20,000 rows. +create table bigger_than_it_looks as + select generate_series(1, 20000) as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table bigger_than_it_looks set (autovacuum_enabled = 'false'); +alter table bigger_than_it_looks set (parallel_workers = 2); +analyze bigger_than_it_looks; +update pg_class set reltuples = 1000 where relname = 'bigger_than_it_looks'; + +-- Make a relation whose size we underestimate and that also has a +-- kind of skew that breaks our batching scheme. We want stats to say +-- 2 rows, but actually there are 20,000 rows with the same key. +create table extremely_skewed (id int, t text); +alter table extremely_skewed set (autovacuum_enabled = 'false'); +alter table extremely_skewed set (parallel_workers = 2); +analyze extremely_skewed; +insert into extremely_skewed + select 42 as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' + from generate_series(1, 20000); +update pg_class + set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 + where relname = 'extremely_skewed'; + +-- Make a relation with a couple of enormous tuples. +create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t; +alter table wide set (parallel_workers = 2); + +-- The "optimal" case: the hash table fits in memory; we plan for 1 +-- batch, we stick to that number, and peak memory usage stays within +-- our work_mem budget + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- The "good" case: batches required, but we plan the right number; we +-- plan for some number of batches, and we stick to that number, and +-- peak memory usage says within our work_mem budget + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '192kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +-- parallel full multi-batch hash join +select count(*) from simple r full outer join simple s using (id); +rollback to settings; + +-- The "bad" case: during execution we need to increase number of +-- batches; in this case we plan for 1 batch, and increase at least a +-- couple of times, and peak memory usage stays within our work_mem +-- budget + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join bigger_than_it_looks s using (id); +select count(*) from simple r join bigger_than_it_looks s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join bigger_than_it_looks s using (id); +$$); +rollback to settings; + +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 1; +set local work_mem = '192kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join bigger_than_it_looks s using (id); +select count(*) from simple r join bigger_than_it_looks s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join bigger_than_it_looks s using (id); +$$); +rollback to settings; + +-- The "ugly" case: increasing the number of batches during execution +-- doesn't help, so stop trying to fit in work_mem and hope for the +-- best; in this case we plan for 1 batch, increases just once and +-- then stop increasing because that didn't help at all, so we blow +-- right through the work_mem budget and hope for the best... + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); +select count(*) from simple r join extremely_skewed s using (id); +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); +select count(*) from simple r join extremely_skewed s using (id); +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); +rollback to settings; + +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 1; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); +select count(*) from simple r join extremely_skewed s using (id); +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); +rollback to settings; + +-- A couple of other hash join tests unrelated to work_mem management. + +-- Check that EXPLAIN ANALYZE has data even if the leader doesn't participate +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +set local parallel_leader_participation = off; +select * from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- Exercise rescans. We'll turn off parallel_leader_participation so +-- that we can check that instrumentation comes back correctly. + +create table join_foo as select generate_series(1, 3) as id, 'xxxxx'::text as t; +alter table join_foo set (parallel_workers = 0); +create table join_bar as select generate_series(1, 10000) as id, 'xxxxx'::text as t; +alter table join_bar set (parallel_workers = 2); + +-- multi-batch with rescan, parallel-oblivious +savepoint settings; +set enable_parallel_hash = off; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '64kB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); +rollback to settings; + +-- single-batch with rescan, parallel-oblivious +savepoint settings; +set enable_parallel_hash = off; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '4MB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); +rollback to settings; + +-- multi-batch with rescan, parallel-aware +savepoint settings; +set enable_parallel_hash = on; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '64kB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); +rollback to settings; + +-- single-batch with rescan, parallel-aware +savepoint settings; +set enable_parallel_hash = on; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '4MB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); +rollback to settings; + +-- A full outer join where every record is matched. + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); +select count(*) from simple r full outer join simple s using (id); +rollback to settings; + +-- parallelism not possible with parallel-oblivious full hash join +savepoint settings; +set enable_parallel_hash = off; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); +select count(*) from simple r full outer join simple s using (id); +rollback to settings; + +-- parallelism is possible with parallel-aware full hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); +select count(*) from simple r full outer join simple s using (id); +rollback to settings; + +-- A full outer join where every record is not matched. + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +explain (costs off) + select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); +select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); +rollback to settings; + +-- parallelism not possible with parallel-oblivious full hash join +savepoint settings; +set enable_parallel_hash = off; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); +select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); +rollback to settings; + +-- parallelism is possible with parallel-aware full hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); +select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); +rollback to settings; + + +-- exercise special code paths for huge tuples (note use of non-strict +-- expression and left join required to get the detoasted tuple into +-- the hash table) + +-- parallel with parallel-aware hash join (hits ExecParallelHashLoadTuple and +-- sts_puttuple oversized tuple cases because it's multi-batch) +savepoint settings; +set max_parallel_workers_per_gather = 2; +set enable_parallel_hash = on; +set work_mem = '128kB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select length(max(s.t)) + from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +select length(max(s.t)) +from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +select final > 1 as multibatch + from hash_join_batches( +$$ + select length(max(s.t)) + from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +$$); +rollback to settings; + + +-- Hash join reuses the HOT status bit to indicate match status. This can only +-- be guaranteed to produce correct results if all the hash join tuple match +-- bits are reset before reuse. This is done upon loading them into the +-- hashtable. +SAVEPOINT settings; +SET enable_parallel_hash = on; +SET min_parallel_table_scan_size = 0; +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +CREATE TABLE hjtest_matchbits_t1(id int); +CREATE TABLE hjtest_matchbits_t2(id int); +INSERT INTO hjtest_matchbits_t1 VALUES (1); +INSERT INTO hjtest_matchbits_t2 VALUES (2); +-- Update should create a HOT tuple. If this status bit isn't cleared, we won't +-- correctly emit the NULL-extended unmatching tuple in full hash join. +UPDATE hjtest_matchbits_t2 set id = 2; +SELECT * FROM hjtest_matchbits_t1 t1 FULL JOIN hjtest_matchbits_t2 t2 ON t1.id = t2.id + ORDER BY t1.id; +-- Test serial full hash join. +-- Resetting parallel_setup_cost should force a serial plan. +-- Just to be safe, however, set enable_parallel_hash to off, as parallel full +-- hash joins are only supported with shared hashtables. +RESET parallel_setup_cost; +SET enable_parallel_hash = off; +SELECT * FROM hjtest_matchbits_t1 t1 FULL JOIN hjtest_matchbits_t2 t2 ON t1.id = t2.id; +ROLLBACK TO settings; + +rollback; + +-- Verify that hash key expressions reference the correct +-- nodes. Hashjoin's hashkeys need to reference its outer plan, Hash's +-- need to reference Hash's outer plan (which is below HashJoin's +-- inner plan). It's not trivial to verify that the references are +-- correct (we don't display the hashkeys themselves), but if the +-- hashkeys contain subplan references, those will be displayed. Force +-- subplans to appear just about everywhere. +-- +-- Bug report: +-- https://www.postgresql.org/message-id/CAPpHfdvGVegF_TKKRiBrSmatJL2dR9uwFCuR%2BteQ_8tEXU8mxg%40mail.gmail.com +-- +BEGIN; +SET LOCAL enable_sort = OFF; -- avoid mergejoins +SET LOCAL from_collapse_limit = 1; -- allows easy changing of join order + +CREATE TABLE hjtest_1 (a text, b int, id int, c bool); +CREATE TABLE hjtest_2 (a bool, id int, b text, c int); + +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 2, 1, false); -- matches +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 1, 2, false); -- fails id join condition +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 20, 1, false); -- fails < 50 +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 1, 1, false); -- fails (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'another', 2); -- matches +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 3, 'another', 7); -- fails id join condition +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'another', 90); -- fails < 55 +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'another', 3); -- fails (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'text', 1); -- fails hjtest_1.a <> hjtest_2.b; + +EXPLAIN (COSTS OFF, VERBOSE) +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_1, hjtest_2 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_1, hjtest_2 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + +EXPLAIN (COSTS OFF, VERBOSE) +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_2, hjtest_1 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_2, hjtest_1 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + +ROLLBACK; + +-- Verify that we behave sanely when the inner hash keys contain parameters +-- (that is, outer or lateral references). This situation has to defeat +-- re-use of the inner hash table across rescans. +begin; +set local enable_hashjoin = on; + +explain (costs off) +select i8.q2, ss.* from +int8_tbl i8, +lateral (select t1.fivethous, i4.f1 from tenk1 t1 join int4_tbl i4 + on t1.fivethous = i4.f1+i8.q2 order by 1,2) ss; + +select i8.q2, ss.* from +int8_tbl i8, +lateral (select t1.fivethous, i4.f1 from tenk1 t1 join int4_tbl i4 + on t1.fivethous = i4.f1+i8.q2 order by 1,2) ss; + +rollback; diff --git a/src/test/regress/sql/join_hash_pglz.sql b/src/test/regress/sql/join_hash_pglz.sql new file mode 100644 index 000000000000..2686afab272d --- /dev/null +++ b/src/test/regress/sql/join_hash_pglz.sql @@ -0,0 +1,626 @@ +-- +-- exercises for the hash join code +-- + +begin; + +set local min_parallel_table_scan_size = 0; +set local parallel_setup_cost = 0; +set local enable_hashjoin = on; +set local temp_file_compression = 'pglz'; + +-- Extract bucket and batch counts from an explain analyze plan. In +-- general we can't make assertions about how many batches (or +-- buckets) will be required because it can vary, but we can in some +-- special cases and we can check for growth. +create or replace function find_hash(node json) +returns json language plpgsql +as +$$ +declare + x json; + child json; +begin + if node->>'Node Type' = 'Hash' then + return node; + else + for child in select json_array_elements(node->'Plans') + loop + x := find_hash(child); + if x is not null then + return x; + end if; + end loop; + return null; + end if; +end; +$$; +create or replace function hash_join_batches(query text) +returns table (original int, final int) language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + original := hash_node->>'Original Hash Batches'; + final := hash_node->>'Hash Batches'; + return next; + end loop; +end; +$$; + +-- Make a simple relation with well distributed keys and correctly +-- estimated size. +create table simple as + select generate_series(1, 20000) AS id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table simple set (parallel_workers = 2); +analyze simple; + +-- Make a relation whose size we will under-estimate. We want stats +-- to say 1000 rows, but actually there are 20,000 rows. +create table bigger_than_it_looks as + select generate_series(1, 20000) as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table bigger_than_it_looks set (autovacuum_enabled = 'false'); +alter table bigger_than_it_looks set (parallel_workers = 2); +analyze bigger_than_it_looks; +update pg_class set reltuples = 1000 where relname = 'bigger_than_it_looks'; + +-- Make a relation whose size we underestimate and that also has a +-- kind of skew that breaks our batching scheme. We want stats to say +-- 2 rows, but actually there are 20,000 rows with the same key. +create table extremely_skewed (id int, t text); +alter table extremely_skewed set (autovacuum_enabled = 'false'); +alter table extremely_skewed set (parallel_workers = 2); +analyze extremely_skewed; +insert into extremely_skewed + select 42 as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' + from generate_series(1, 20000); +update pg_class + set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 + where relname = 'extremely_skewed'; + +-- Make a relation with a couple of enormous tuples. +create table wide as select generate_series(1, 2) as id, rpad('', 320000, 'x') as t; +alter table wide set (parallel_workers = 2); + +-- The "optimal" case: the hash table fits in memory; we plan for 1 +-- batch, we stick to that number, and peak memory usage stays within +-- our work_mem budget + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- The "good" case: batches required, but we plan the right number; we +-- plan for some number of batches, and we stick to that number, and +-- peak memory usage says within our work_mem budget + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '192kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +-- parallel full multi-batch hash join +select count(*) from simple r full outer join simple s using (id); +rollback to settings; + +-- The "bad" case: during execution we need to increase number of +-- batches; in this case we plan for 1 batch, and increase at least a +-- couple of times, and peak memory usage stays within our work_mem +-- budget + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join bigger_than_it_looks s using (id); +select count(*) from simple r join bigger_than_it_looks s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join bigger_than_it_looks s using (id); +$$); +rollback to settings; + +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 1; +set local work_mem = '192kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join bigger_than_it_looks s using (id); +select count(*) from simple r join bigger_than_it_looks s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join bigger_than_it_looks s using (id); +$$); +rollback to settings; + +-- The "ugly" case: increasing the number of batches during execution +-- doesn't help, so stop trying to fit in work_mem and hope for the +-- best; in this case we plan for 1 batch, increases just once and +-- then stop increasing because that didn't help at all, so we blow +-- right through the work_mem budget and hope for the best... + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); +select count(*) from simple r join extremely_skewed s using (id); +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = off; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); +select count(*) from simple r join extremely_skewed s using (id); +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); +rollback to settings; + +-- parallel with parallel-aware hash join +savepoint settings; +set local max_parallel_workers_per_gather = 1; +set local work_mem = '128kB'; +set local hash_mem_multiplier = 1.0; +set local enable_parallel_hash = on; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); +select count(*) from simple r join extremely_skewed s using (id); +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); +rollback to settings; + +-- A couple of other hash join tests unrelated to work_mem management. + +-- Check that EXPLAIN ANALYZE has data even if the leader doesn't participate +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local hash_mem_multiplier = 1.0; +set local parallel_leader_participation = off; +select * from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- Exercise rescans. We'll turn off parallel_leader_participation so +-- that we can check that instrumentation comes back correctly. + +create table join_foo as select generate_series(1, 3) as id, 'xxxxx'::text as t; +alter table join_foo set (parallel_workers = 0); +create table join_bar as select generate_series(1, 10000) as id, 'xxxxx'::text as t; +alter table join_bar set (parallel_workers = 2); + +-- multi-batch with rescan, parallel-oblivious +savepoint settings; +set enable_parallel_hash = off; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '64kB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); +rollback to settings; + +-- single-batch with rescan, parallel-oblivious +savepoint settings; +set enable_parallel_hash = off; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '4MB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); +rollback to settings; + +-- multi-batch with rescan, parallel-aware +savepoint settings; +set enable_parallel_hash = on; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '64kB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); +rollback to settings; + +-- single-batch with rescan, parallel-aware +savepoint settings; +set enable_parallel_hash = on; +set parallel_leader_participation = off; +set min_parallel_table_scan_size = 0; +set parallel_setup_cost = 0; +set parallel_tuple_cost = 0; +set max_parallel_workers_per_gather = 2; +set enable_material = off; +set enable_mergejoin = off; +set work_mem = '4MB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +select final > 1 as multibatch + from hash_join_batches( +$$ + select count(*) from join_foo + left join (select b1.id, b1.t from join_bar b1 join join_bar b2 using (id)) ss + on join_foo.id < ss.id + 1 and join_foo.id > ss.id - 1; +$$); +rollback to settings; + +-- A full outer join where every record is matched. + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); +select count(*) from simple r full outer join simple s using (id); +rollback to settings; + +-- parallelism not possible with parallel-oblivious full hash join +savepoint settings; +set enable_parallel_hash = off; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); +select count(*) from simple r full outer join simple s using (id); +rollback to settings; + +-- parallelism is possible with parallel-aware full hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); +select count(*) from simple r full outer join simple s using (id); +rollback to settings; + +-- A full outer join where every record is not matched. + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +explain (costs off) + select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); +select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); +rollback to settings; + +-- parallelism not possible with parallel-oblivious full hash join +savepoint settings; +set enable_parallel_hash = off; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); +select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); +rollback to settings; + +-- parallelism is possible with parallel-aware full hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); +select count(*) from simple r full outer join simple s on (r.id = 0 - s.id); +rollback to settings; + + +-- exercise special code paths for huge tuples (note use of non-strict +-- expression and left join required to get the detoasted tuple into +-- the hash table) + +-- parallel with parallel-aware hash join (hits ExecParallelHashLoadTuple and +-- sts_puttuple oversized tuple cases because it's multi-batch) +savepoint settings; +set max_parallel_workers_per_gather = 2; +set enable_parallel_hash = on; +set work_mem = '128kB'; +set hash_mem_multiplier = 1.0; +explain (costs off) + select length(max(s.t)) + from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +select length(max(s.t)) +from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +select final > 1 as multibatch + from hash_join_batches( +$$ + select length(max(s.t)) + from wide left join (select id, coalesce(t, '') || '' as t from wide) s using (id); +$$); +rollback to settings; + + +-- Hash join reuses the HOT status bit to indicate match status. This can only +-- be guaranteed to produce correct results if all the hash join tuple match +-- bits are reset before reuse. This is done upon loading them into the +-- hashtable. +SAVEPOINT settings; +SET enable_parallel_hash = on; +SET min_parallel_table_scan_size = 0; +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +CREATE TABLE hjtest_matchbits_t1(id int); +CREATE TABLE hjtest_matchbits_t2(id int); +INSERT INTO hjtest_matchbits_t1 VALUES (1); +INSERT INTO hjtest_matchbits_t2 VALUES (2); +-- Update should create a HOT tuple. If this status bit isn't cleared, we won't +-- correctly emit the NULL-extended unmatching tuple in full hash join. +UPDATE hjtest_matchbits_t2 set id = 2; +SELECT * FROM hjtest_matchbits_t1 t1 FULL JOIN hjtest_matchbits_t2 t2 ON t1.id = t2.id + ORDER BY t1.id; +-- Test serial full hash join. +-- Resetting parallel_setup_cost should force a serial plan. +-- Just to be safe, however, set enable_parallel_hash to off, as parallel full +-- hash joins are only supported with shared hashtables. +RESET parallel_setup_cost; +SET enable_parallel_hash = off; +SELECT * FROM hjtest_matchbits_t1 t1 FULL JOIN hjtest_matchbits_t2 t2 ON t1.id = t2.id; +ROLLBACK TO settings; + +rollback; + +-- Verify that hash key expressions reference the correct +-- nodes. Hashjoin's hashkeys need to reference its outer plan, Hash's +-- need to reference Hash's outer plan (which is below HashJoin's +-- inner plan). It's not trivial to verify that the references are +-- correct (we don't display the hashkeys themselves), but if the +-- hashkeys contain subplan references, those will be displayed. Force +-- subplans to appear just about everywhere. +-- +-- Bug report: +-- https://www.postgresql.org/message-id/CAPpHfdvGVegF_TKKRiBrSmatJL2dR9uwFCuR%2BteQ_8tEXU8mxg%40mail.gmail.com +-- +BEGIN; +SET LOCAL enable_sort = OFF; -- avoid mergejoins +SET LOCAL from_collapse_limit = 1; -- allows easy changing of join order + +CREATE TABLE hjtest_1 (a text, b int, id int, c bool); +CREATE TABLE hjtest_2 (a bool, id int, b text, c int); + +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 2, 1, false); -- matches +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 1, 2, false); -- fails id join condition +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 20, 1, false); -- fails < 50 +INSERT INTO hjtest_1(a, b, id, c) VALUES ('text', 1, 1, false); -- fails (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'another', 2); -- matches +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 3, 'another', 7); -- fails id join condition +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'another', 90); -- fails < 55 +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'another', 3); -- fails (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) +INSERT INTO hjtest_2(a, id, b, c) VALUES (true, 1, 'text', 1); -- fails hjtest_1.a <> hjtest_2.b; + +EXPLAIN (COSTS OFF, VERBOSE) +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_1, hjtest_2 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_1, hjtest_2 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + +EXPLAIN (COSTS OFF, VERBOSE) +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_2, hjtest_1 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + +SELECT hjtest_1.a a1, hjtest_2.a a2,hjtest_1.tableoid::regclass t1, hjtest_2.tableoid::regclass t2 +FROM hjtest_2, hjtest_1 +WHERE + hjtest_1.id = (SELECT 1 WHERE hjtest_2.id = 1) + AND (SELECT hjtest_1.b * 5) = (SELECT hjtest_2.c*5) + AND (SELECT hjtest_1.b * 5) < 50 + AND (SELECT hjtest_2.c * 5) < 55 + AND hjtest_1.a <> hjtest_2.b; + +ROLLBACK; + +-- Verify that we behave sanely when the inner hash keys contain parameters +-- (that is, outer or lateral references). This situation has to defeat +-- re-use of the inner hash table across rescans. +begin; +set local enable_hashjoin = on; + +explain (costs off) +select i8.q2, ss.* from +int8_tbl i8, +lateral (select t1.fivethous, i4.f1 from tenk1 t1 join int4_tbl i4 + on t1.fivethous = i4.f1+i8.q2 order by 1,2) ss; + +select i8.q2, ss.* from +int8_tbl i8, +lateral (select t1.fivethous, i4.f1 from tenk1 t1 join int4_tbl i4 + on t1.fivethous = i4.f1+i8.q2 order by 1,2) ss; + +rollback;