diff options
Diffstat (limited to 'src/backend')
34 files changed, 1952 insertions, 30 deletions
diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index 21721b48f04..bd93a6a8d1e 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -8,6 +8,7 @@ subdir = src/backend/access top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist transam +SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \ + tablesample transam include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 1a8d2f2d0b5..f0c2394e600 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -80,8 +80,9 @@ bool synchronize_seqscans = true; static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_sync, - bool is_bitmapscan, bool temp_snap); + bool allow_strat, bool allow_sync, bool allow_pagemode, + bool is_bitmapscan, bool is_samplescan, + bool temp_snap); static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options); static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, @@ -294,9 +295,10 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) /* * Currently, we don't have a stats counter for bitmap heap scans (but the - * underlying bitmap index scans will be counted). + * underlying bitmap index scans will be counted) or sample scans (we only + * update stats for tuple fetches there) */ - if (!scan->rs_bitmapscan) + if (!scan->rs_bitmapscan && !scan->rs_samplescan) pgstat_count_heap_scan(scan->rs_rd); } @@ -315,7 +317,7 @@ heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks) * In page-at-a-time mode it performs additional work, namely determining * which tuples on the page are visible. */ -static void +void heapgetpage(HeapScanDesc scan, BlockNumber page) { Buffer buffer; @@ -1310,6 +1312,9 @@ heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode, * HeapScanDesc for a bitmap heap scan. Although that scan technology is * really quite unlike a standard seqscan, there is just enough commonality * to make it worth using the same data structure. + * + * heap_beginscan_samplingscan is alternate entry point for setting up a + * HeapScanDesc for a TABLESAMPLE scan. * ---------------- */ HeapScanDesc @@ -1317,7 +1322,7 @@ heap_beginscan(Relation relation, Snapshot snapshot, int nkeys, ScanKey key) { return heap_beginscan_internal(relation, snapshot, nkeys, key, - true, true, false, false); + true, true, true, false, false, false); } HeapScanDesc @@ -1327,7 +1332,7 @@ heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key) Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid)); return heap_beginscan_internal(relation, snapshot, nkeys, key, - true, true, false, true); + true, true, true, false, false, true); } HeapScanDesc @@ -1336,7 +1341,8 @@ heap_beginscan_strat(Relation relation, Snapshot snapshot, bool allow_strat, bool allow_sync) { return heap_beginscan_internal(relation, snapshot, nkeys, key, - allow_strat, allow_sync, false, false); + allow_strat, allow_sync, true, + false, false, false); } HeapScanDesc @@ -1344,14 +1350,24 @@ heap_beginscan_bm(Relation relation, Snapshot snapshot, int nkeys, ScanKey key) { return heap_beginscan_internal(relation, snapshot, nkeys, key, - false, false, true, false); + false, false, true, true, false, false); +} + +HeapScanDesc +heap_beginscan_sampling(Relation relation, Snapshot snapshot, + int nkeys, ScanKey key, + bool allow_strat, bool allow_pagemode) +{ + return heap_beginscan_internal(relation, snapshot, nkeys, key, + allow_strat, false, allow_pagemode, + false, true, false); } static HeapScanDesc heap_beginscan_internal(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_sync, - bool is_bitmapscan, bool temp_snap) + bool allow_strat, bool allow_sync, bool allow_pagemode, + bool is_bitmapscan, bool is_samplescan, bool temp_snap) { HeapScanDesc scan; @@ -1373,6 +1389,7 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot, scan->rs_snapshot = snapshot; scan->rs_nkeys = nkeys; scan->rs_bitmapscan = is_bitmapscan; + scan->rs_samplescan = is_samplescan; scan->rs_strategy = NULL; /* set in initscan */ scan->rs_allow_strat = allow_strat; scan->rs_allow_sync = allow_sync; @@ -1381,7 +1398,7 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot, /* * we can use page-at-a-time mode if it's an MVCC-safe snapshot */ - scan->rs_pageatatime = IsMVCCSnapshot(snapshot); + scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(snapshot); /* * For a seqscan in a serializable transaction, acquire a predicate lock diff --git a/src/backend/access/tablesample/Makefile b/src/backend/access/tablesample/Makefile new file mode 100644 index 00000000000..46eeb59f9c4 --- /dev/null +++ b/src/backend/access/tablesample/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for utils/tablesample +# +# IDENTIFICATION +# src/backend/utils/tablesample/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/tablesample +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = tablesample.o system.o bernoulli.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c new file mode 100644 index 00000000000..c91f3f593e5 --- /dev/null +++ b/src/backend/access/tablesample/bernoulli.c @@ -0,0 +1,235 @@ +/*------------------------------------------------------------------------- + * + * bernoulli.c + * interface routines for BERNOULLI tablesample method + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/tablesample/bernoulli.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" + +#include "access/tablesample.h" +#include "access/relscan.h" +#include "nodes/execnodes.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" +#include "storage/bufmgr.h" +#include "utils/sampling.h" + + +/* tsdesc */ +typedef struct +{ + uint32 seed; /* random seed */ + BlockNumber startblock; /* starting block, we use ths for syncscan support */ + BlockNumber nblocks; /* number of blocks */ + BlockNumber blockno; /* current block */ + float4 probability; /* probabilty that tuple will be returned (0.0-1.0) */ + OffsetNumber lt; /* last tuple returned from current block */ + SamplerRandomState randstate; /* random generator tsdesc */ +} BernoulliSamplerData; + +/* + * Initialize the state. + */ +Datum +tsm_bernoulli_init(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + uint32 seed = PG_GETARG_UINT32(1); + float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2); + HeapScanDesc scan = tsdesc->heapScan; + BernoulliSamplerData *sampler; + + if (percent < 0 || percent > 100) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("invalid sample size"), + errhint("Sample size must be numeric value between 0 and 100 (inclusive)."))); + + sampler = palloc0(sizeof(BernoulliSamplerData)); + + /* Remember initial values for reinit */ + sampler->seed = seed; + sampler->startblock = scan->rs_startblock; + sampler->nblocks = scan->rs_nblocks; + sampler->blockno = InvalidBlockNumber; + sampler->probability = percent / 100; + sampler->lt = InvalidOffsetNumber; + sampler_random_init_state(sampler->seed, sampler->randstate); + + tsdesc->tsmdata = (void *) sampler; + + PG_RETURN_VOID(); +} + +/* + * Get next block number to read or InvalidBlockNumber if we are at the + * end of the relation. + */ +Datum +tsm_bernoulli_nextblock(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + BernoulliSamplerData *sampler = + (BernoulliSamplerData *) tsdesc->tsmdata; + + /* + * Bernoulli sampling scans all blocks on the table and supports + * syncscan so loop from startblock to startblock instead of + * from 0 to nblocks. + */ + if (sampler->blockno == InvalidBlockNumber) + sampler->blockno = sampler->startblock; + else + { + sampler->blockno++; + + if (sampler->blockno >= sampler->nblocks) + sampler->blockno = 0; + + if (sampler->blockno == sampler->startblock) + PG_RETURN_UINT32(InvalidBlockNumber); + } + + PG_RETURN_UINT32(sampler->blockno); +} + +/* + * Get next tuple from current block. + * + * This method implements the main logic in bernoulli sampling. + * The algorithm simply generates new random number (in 0.0-1.0 range) and if + * it falls within user specified probability (in the same range) return the + * tuple offset. + * + * It is ok here to return tuple offset without knowing if tuple is visible + * and not check it via examinetuple. The reason for that is that we do the + * coinflip (random number generation) for every tuple in the table. Since all + * tuples have same probability of being returned the visible and invisible + * tuples will be returned in same ratio as they have in the actual table. + * This means that there is no skew towards either visible or invisible tuples + * and the number returned visible tuples to from the executor node is the + * fraction of visible tuples which was specified in input. + * + * This is faster than doing the coinflip in the examinetuple because we don't + * have to do visibility checks on uninteresting tuples. + * + * If we reach end of the block return InvalidOffsetNumber which tells + * SampleScan to go to next block. + */ +Datum +tsm_bernoulli_nexttuple(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + OffsetNumber maxoffset = PG_GETARG_UINT16(2); + BernoulliSamplerData *sampler = + (BernoulliSamplerData *) tsdesc->tsmdata; + OffsetNumber tupoffset = sampler->lt; + float4 probability = sampler->probability; + + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + /* + * Loop over tuple offsets until the random generator returns value that + * is within the probability of returning the tuple or until we reach + * end of the block. + * + * (This is our implementation of bernoulli trial) + */ + while (sampler_random_fract(sampler->randstate) > probability) + { + tupoffset++; + + if (tupoffset > maxoffset) + break; + } + + if (tupoffset > maxoffset) + /* Tell SampleScan that we want next block. */ + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + PG_RETURN_UINT16(tupoffset); +} + +/* + * Cleanup method. + */ +Datum +tsm_bernoulli_end(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + + pfree(tsdesc->tsmdata); + + PG_RETURN_VOID(); +} + +/* + * Reset tsdesc (called by ReScan). + */ +Datum +tsm_bernoulli_reset(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + BernoulliSamplerData *sampler = + (BernoulliSamplerData *) tsdesc->tsmdata; + + sampler->blockno = InvalidBlockNumber; + sampler->lt = InvalidOffsetNumber; + sampler_random_init_state(sampler->seed, sampler->randstate); + + PG_RETURN_VOID(); +} + +/* + * Costing function. + */ +Datum +tsm_bernoulli_cost(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Path *path = (Path *) PG_GETARG_POINTER(1); + RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); + List *args = (List *) PG_GETARG_POINTER(3); + BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); + double *tuples = (double *) PG_GETARG_POINTER(5); + Node *pctnode; + float4 samplesize; + + *pages = baserel->pages; + + pctnode = linitial(args); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, RelabelType)) + pctnode = (Node *) ((RelabelType *) pctnode)->arg; + + if (IsA(pctnode, Const)) + { + samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue); + samplesize /= 100.0; + } + else + { + /* Default samplesize if the estimation didn't return Const. */ + samplesize = 0.1f; + } + + *tuples = path->rows * samplesize; + path->rows = *tuples; + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c new file mode 100644 index 00000000000..1412e511faf --- /dev/null +++ b/src/backend/access/tablesample/system.c @@ -0,0 +1,186 @@ +/*------------------------------------------------------------------------- + * + * system.c + * interface routines for system tablesample method + * + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/tablesample/system.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" + +#include "access/tablesample.h" +#include "access/relscan.h" +#include "nodes/execnodes.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" +#include "storage/bufmgr.h" +#include "utils/sampling.h" + + +/* + * State + */ +typedef struct +{ + BlockSamplerData bs; + uint32 seed; /* random seed */ + BlockNumber nblocks; /* number of block in relation */ + int samplesize; /* number of blocks to return */ + OffsetNumber lt; /* last tuple returned from current block */ +} SystemSamplerData; + + +/* + * Initializes the state. + */ +Datum +tsm_system_init(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + uint32 seed = PG_GETARG_UINT32(1); + float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2); + HeapScanDesc scan = tsdesc->heapScan; + SystemSamplerData *sampler; + + if (percent < 0 || percent > 100) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("invalid sample size"), + errhint("Sample size must be numeric value between 0 and 100 (inclusive)."))); + + sampler = palloc0(sizeof(SystemSamplerData)); + + /* Remember initial values for reinit */ + sampler->seed = seed; + sampler->nblocks = scan->rs_nblocks; + sampler->samplesize = 1 + (int) (sampler->nblocks * (percent / 100.0)); + sampler->lt = InvalidOffsetNumber; + + BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize, + sampler->seed); + + tsdesc->tsmdata = (void *) sampler; + + PG_RETURN_VOID(); +} + +/* + * Get next block number or InvalidBlockNumber when we're done. + * + * Uses the same logic as ANALYZE for picking the random blocks. + */ +Datum +tsm_system_nextblock(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + BlockNumber blockno; + + if (!BlockSampler_HasMore(&sampler->bs)) + PG_RETURN_UINT32(InvalidBlockNumber); + + blockno = BlockSampler_Next(&sampler->bs); + + PG_RETURN_UINT32(blockno); +} + +/* + * Get next tuple offset in current block or InvalidOffsetNumber if we are done + * with this block. + */ +Datum +tsm_system_nexttuple(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + OffsetNumber maxoffset = PG_GETARG_UINT16(2); + SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + OffsetNumber tupoffset = sampler->lt; + + if (tupoffset == InvalidOffsetNumber) + tupoffset = FirstOffsetNumber; + else + tupoffset++; + + if (tupoffset > maxoffset) + tupoffset = InvalidOffsetNumber; + + sampler->lt = tupoffset; + + PG_RETURN_UINT16(tupoffset); +} + +/* + * Cleanup method. + */ +Datum +tsm_system_end(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + + pfree(tsdesc->tsmdata); + + PG_RETURN_VOID(); +} + +/* + * Reset state (called by ReScan). + */ +Datum +tsm_system_reset(PG_FUNCTION_ARGS) +{ + TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); + SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; + + sampler->lt = InvalidOffsetNumber; + BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize, + sampler->seed); + + PG_RETURN_VOID(); +} + +/* + * Costing function. + */ +Datum +tsm_system_cost(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + Path *path = (Path *) PG_GETARG_POINTER(1); + RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2); + List *args = (List *) PG_GETARG_POINTER(3); + BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4); + double *tuples = (double *) PG_GETARG_POINTER(5); + Node *pctnode; + float4 samplesize; + + pctnode = linitial(args); + pctnode = estimate_expression_value(root, pctnode); + + if (IsA(pctnode, RelabelType)) + pctnode = (Node *) ((RelabelType *) pctnode)->arg; + + if (IsA(pctnode, Const)) + { + samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue); + samplesize /= 100.0; + } + else + { + /* Default samplesize if the estimation didn't return Const. */ + samplesize = 0.1f; + } + + *pages = baserel->pages * samplesize; + *tuples = path->rows * samplesize; + path->rows = *tuples; + + PG_RETURN_VOID(); +} diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c new file mode 100644 index 00000000000..ef55d062e75 --- /dev/null +++ b/src/backend/access/tablesample/tablesample.c @@ -0,0 +1,368 @@ +/*------------------------------------------------------------------------- + * + * tablesample.c + * TABLESAMPLE internal API + * + * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/tablesample/tablesample.c + * + * TABLESAMPLE is the SQL standard clause for sampling the relations. + * + * The API is interface between the Executor and the TABLESAMPLE Methods. + * + * TABLESAMPLE Methods are implementations of actual sampling algorithms which + * can be used for returning a sample of the source relation. + * Methods don't read the table directly but are asked for block number and + * tuple offset which they want to examine (or return) and the tablesample + * interface implemented here does the reading for them. + * + * We currently only support sampling of the physical relations, but in the + * future we might extend the API to support subqueries as well. + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/tablesample.h" + +#include "catalog/pg_tablesample_method.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/rel.h" +#include "utils/tqual.h" + + +static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan); + + +/* + * Initialize the TABLESAMPLE Descriptor and the TABLESAMPLE Method. + */ +TableSampleDesc * +tablesample_init(SampleScanState *scanstate, TableSampleClause *tablesample) +{ + FunctionCallInfoData fcinfo; + int i; + List *args = tablesample->args; + ListCell *arg; + ExprContext *econtext = scanstate->ss.ps.ps_ExprContext; + TableSampleDesc *tsdesc = (TableSampleDesc *) palloc0(sizeof(TableSampleDesc)); + + /* Load functions */ + fmgr_info(tablesample->tsminit, &(tsdesc->tsminit)); + fmgr_info(tablesample->tsmnextblock, &(tsdesc->tsmnextblock)); + fmgr_info(tablesample->tsmnexttuple, &(tsdesc->tsmnexttuple)); + if (OidIsValid(tablesample->tsmexaminetuple)) + fmgr_info(tablesample->tsmexaminetuple, &(tsdesc->tsmexaminetuple)); + else + tsdesc->tsmexaminetuple.fn_oid = InvalidOid; + fmgr_info(tablesample->tsmreset, &(tsdesc->tsmreset)); + fmgr_info(tablesample->tsmend, &(tsdesc->tsmend)); + + InitFunctionCallInfoData(fcinfo, &tsdesc->tsminit, + list_length(args) + 2, + InvalidOid, NULL, NULL); + + tsdesc->tupDesc = scanstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor; + tsdesc->heapScan = scanstate->ss.ss_currentScanDesc; + + /* First argument for init function is always TableSampleDesc */ + fcinfo.arg[0] = PointerGetDatum(tsdesc); + fcinfo.argnull[0] = false; + + /* + * Second arg for init function is always REPEATABLE + * When tablesample->repeatable is NULL then REPEATABLE clause was not + * specified. + * When specified, the expression cannot evaluate to NULL. + */ + if (tablesample->repeatable) + { + ExprState *argstate = ExecInitExpr((Expr *) tablesample->repeatable, + (PlanState *) scanstate); + fcinfo.arg[1] = ExecEvalExpr(argstate, econtext, + &fcinfo.argnull[1], NULL); + if (fcinfo.argnull[1]) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("REPEATABLE clause must be NOT NULL numeric value"))); + } + else + { + fcinfo.arg[1] = UInt32GetDatum(random()); + fcinfo.argnull[1] = false; + } + + /* Rest of the arguments come from user. */ + i = 2; + foreach(arg, args) + { + Expr *argexpr = (Expr *) lfirst(arg); + ExprState *argstate = ExecInitExpr(argexpr, (PlanState *) scanstate); + + if (argstate == NULL) + { + fcinfo.argnull[i] = true; + fcinfo.arg[i] = (Datum) 0;; + } + + fcinfo.arg[i] = ExecEvalExpr(argstate, econtext, + &fcinfo.argnull[i], NULL); + i++; + } + Assert(i == fcinfo.nargs); + + (void) FunctionCallInvoke(&fcinfo); + + return tsdesc; +} + +/* + * Get next tuple from TABLESAMPLE Method. + */ +HeapTuple +tablesample_getnext(TableSampleDesc *desc) +{ + HeapScanDesc scan = desc->heapScan; + HeapTuple tuple = &(scan->rs_ctup); + bool pagemode = scan->rs_pageatatime; + BlockNumber blockno; + Page page; + bool page_all_visible; + ItemId itemid; + OffsetNumber tupoffset, + maxoffset; + + if (!scan->rs_inited) + { + /* + * return null immediately if relation is empty + */ + if (scan->rs_nblocks == 0) + { + Assert(!BufferIsValid(scan->rs_cbuf)); + tuple->t_data = NULL; + return NULL; + } + blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, + PointerGetDatum(desc))); + if (!BlockNumberIsValid(blockno)) + { + tuple->t_data = NULL; + return NULL; + } + + heapgetpage(scan, blockno); + scan->rs_inited = true; + } + else + { + /* continue from previously returned page/tuple */ + blockno = scan->rs_cblock; /* current page */ + } + + /* + * When pagemode is disabled, the scan will do visibility checks for each + * tuple it finds so the buffer needs to be locked. + */ + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(scan->rs_cbuf); + page_all_visible = PageIsAllVisible(page); + maxoffset = PageGetMaxOffsetNumber(page); + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + tupoffset = DatumGetUInt16(FunctionCall3(&desc->tsmnexttuple, + PointerGetDatum(desc), + UInt32GetDatum(blockno), + UInt16GetDatum(maxoffset))); + + if (OffsetNumberIsValid(tupoffset)) + { + bool visible; + bool found; + + /* Skip invalid tuple pointers. */ + itemid = PageGetItemId(page, tupoffset); + if (!ItemIdIsNormal(itemid)) + continue; + + tuple->t_data = (HeapTupleHeader) PageGetItem((Page) page, itemid); + tuple->t_len = ItemIdGetLength(itemid); + ItemPointerSet(&(tuple->t_self), blockno, tupoffset); + + if (page_all_visible) + visible = true; + else + visible = SampleTupleVisible(tuple, tupoffset, scan); + + /* + * Let the sampling method examine the actual tuple and decide if we + * should return it. + * + * Note that we let it examine even invisible tuples for + * statistical purposes, but not return them since user should + * never see invisible tuples. + */ + if (OidIsValid(desc->tsmexaminetuple.fn_oid)) + { + found = DatumGetBool(FunctionCall4(&desc->tsmexaminetuple, + PointerGetDatum(desc), + UInt32GetDatum(blockno), + PointerGetDatum(tuple), + BoolGetDatum(visible))); + /* Should not happen if sampling method is well written. */ + if (found && !visible) + elog(ERROR, "Sampling method wanted to return invisible tuple"); + } + else + found = visible; + + /* Found visible tuple, return it. */ + if (found) + { + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + break; + } + else + { + /* Try next tuple from same page. */ + continue; + } + } + + + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock, + PointerGetDatum(desc))); + + /* + * Report our new scan position for synchronization purposes. We + * don't do that when moving backwards, however. That would just + * mess up any other forward-moving scanners. + * + * Note: we do this before checking for end of scan so that the + * final state of the position hint is back at the start of the + * rel. That's not strictly necessary, but otherwise when you run + * the same query multiple times the starting position would shift + * a little bit backwards on every invocation, which is confusing. + * We don't guarantee any specific ordering in general, though. + */ + if (scan->rs_syncscan) + ss_report_location(scan->rs_rd, BlockNumberIsValid(blockno) ? + blockno : scan->rs_startblock); + + /* + * Reached end of scan. + */ + if (!BlockNumberIsValid(blockno)) + { + if (BufferIsValid(scan->rs_cbuf)) + ReleaseBuffer(scan->rs_cbuf); + scan->rs_cbuf = InvalidBuffer; + scan->rs_cblock = InvalidBlockNumber; + tuple->t_data = NULL; + scan->rs_inited = false; + return NULL; + } + + heapgetpage(scan, blockno); + + if (!pagemode) + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + page = (Page) BufferGetPage(scan->rs_cbuf); + page_all_visible = PageIsAllVisible(page); + maxoffset = PageGetMaxOffsetNumber(page); + } + + pgstat_count_heap_getnext(scan->rs_rd); + + return &(scan->rs_ctup); +} + +/* + * Reset the sampling to starting state + */ +void +tablesample_reset(TableSampleDesc *desc) +{ + (void) FunctionCall1(&desc->tsmreset, PointerGetDatum(desc)); +} + +/* + * Signal the sampling method that the scan has finished. + */ +void +tablesample_end(TableSampleDesc *desc) +{ + (void) FunctionCall1(&desc->tsmend, PointerGetDatum(desc)); +} + +/* + * Check visibility of the tuple. + */ +static bool +SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan) +{ + /* + * If this scan is reading whole pages at a time, there is already + * visibility info present in rs_vistuples so we can just search it + * for the tupoffset. + */ + if (scan->rs_pageatatime) + { + int start = 0, + end = scan->rs_ntuples - 1; + + /* + * Do the binary search over rs_vistuples, it's already sorted by + * OffsetNumber so we don't need to do any sorting ourselves here. + * + * We could use bsearch() here but it's slower for integers because + * of the function call overhead and because it needs boiler plate code + * it would not save us anything code-wise anyway. + */ + while (start <= end) + { + int mid = start + (end - start) / 2; + OffsetNumber curoffset = scan->rs_vistuples[mid]; + + if (curoffset == tupoffset) + return true; + else if (curoffset > tupoffset) + end = mid - 1; + else + start = mid + 1; + } + + return false; + } + else + { + /* No pagemode, we have to check the tuple itself. */ + Snapshot snapshot = scan->rs_snapshot; + Buffer buffer = scan->rs_cbuf; + + bool visible = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer); + + CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, buffer, + snapshot); + + return visible; + } +} diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 37d05d1acc6..3d1139b5ba0 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -40,9 +40,8 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\ pg_ts_parser.h pg_ts_template.h pg_extension.h \ pg_foreign_data_wrapper.h pg_foreign_server.h pg_user_mapping.h \ pg_foreign_table.h pg_policy.h pg_replication_origin.h \ - pg_default_acl.h pg_seclabel.h pg_shseclabel.h pg_collation.h pg_range.h \ - pg_transform.h \ - toasting.h indexing.h \ + pg_tablesample_method.h pg_default_acl.h pg_seclabel.h pg_shseclabel.h \ + pg_collation.h pg_range.h pg_transform.h toasting.h indexing.h \ ) # location of Catalog.pm diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 952cf204a0a..65e329eab07 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1150,7 +1150,7 @@ acquire_sample_rows(Relation onerel, int elevel, * Found a suitable tuple, so save it, replacing one * old tuple at random */ - int k = (int) (targrows * sampler_random_fract()); + int k = (int) (targrows * sampler_random_fract(rstate.randstate)); Assert(k >= 0 && k < targrows); heap_freetuple(rows[k]); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index eeb8f19017e..478771c6ba1 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -731,6 +731,7 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) case T_ValuesScan: case T_CteScan: case T_WorkTableScan: + case T_SampleScan: *rels_used = bms_add_member(*rels_used, ((Scan *) plan)->scanrelid); break; @@ -967,6 +968,21 @@ ExplainNode(PlanState *planstate, List *ancestors, else pname = sname; break; + case T_SampleScan: + { + /* + * Fetch the tablesample method name from RTE. + * + * It would be nice to also show parameters, but since we + * support arbitrary expressions as parameter it might get + * quite messy. + */ + RangeTblEntry *rte; + rte = rt_fetch(((SampleScan *) plan)->scanrelid, es->rtable); + custom_name = get_tablesample_method_name(rte->tablesample->tsmid); + pname = psprintf("Sample Scan (%s)", custom_name); + } + break; case T_Material: pname = sname = "Materialize"; break; @@ -1089,6 +1105,9 @@ ExplainNode(PlanState *planstate, List *ancestors, if (((Scan *) plan)->scanrelid > 0) ExplainScanTarget((Scan *) plan, es); break; + case T_SampleScan: + ExplainScanTarget((Scan *) plan, es); + break; case T_IndexScan: { IndexScan *indexscan = (IndexScan *) plan; @@ -1339,6 +1358,7 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_CteScan: case T_WorkTableScan: case T_SubqueryScan: + case T_SampleScan: show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); if (plan->qual) show_instrumentation_count("Rows Removed by Filter", 1, @@ -2238,6 +2258,7 @@ ExplainTargetRel(Plan *plan, Index rti, ExplainState *es) case T_TidScan: case T_ForeignScan: case T_CustomScan: + case T_SampleScan: case T_ModifyTable: /* Assert it's on a real relation */ Assert(rte->rtekind == RTE_RELATION); diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile index bc5d373d68a..08cba6fa2b5 100644 --- a/src/backend/executor/Makefile +++ b/src/backend/executor/Makefile @@ -21,7 +21,7 @@ OBJS = execAmi.o execCurrent.o execGrouping.o execIndexing.o execJunk.o \ nodeLimit.o nodeLockRows.o \ nodeMaterial.o nodeMergeAppend.o nodeMergejoin.o nodeModifyTable.o \ nodeNestloop.o nodeFunctionscan.o nodeRecursiveunion.o nodeResult.o \ - nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \ + nodeSamplescan.o nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \ nodeValuesscan.o nodeCtescan.o nodeWorktablescan.o \ nodeGroup.o nodeSubplan.o nodeSubqueryscan.o nodeTidscan.o \ nodeForeignscan.o nodeWindowAgg.o tstoreReceiver.o spi.o diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 6ebad2f03f0..4948a265cb2 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -39,6 +39,7 @@ #include "executor/nodeNestloop.h" #include "executor/nodeRecursiveunion.h" #include "executor/nodeResult.h" +#include "executor/nodeSamplescan.h" #include "executor/nodeSeqscan.h" #include "executor/nodeSetOp.h" #include "executor/nodeSort.h" @@ -155,6 +156,10 @@ ExecReScan(PlanState *node) ExecReScanSeqScan((SeqScanState *) node); break; + case T_SampleScanState: + ExecReScanSampleScan((SampleScanState *) node); + break; + case T_IndexScanState: ExecReScanIndexScan((IndexScanState *) node); break; @@ -480,6 +485,9 @@ ExecSupportsBackwardScan(Plan *node) } return false; + case T_SampleScan: + return false; + case T_Material: case T_Sort: /* these don't evaluate tlist */ diff --git a/src/backend/executor/execCurrent.c b/src/backend/executor/execCurrent.c index d87be963a95..bcd287f8742 100644 --- a/src/backend/executor/execCurrent.c +++ b/src/backend/executor/execCurrent.c @@ -261,6 +261,7 @@ search_plan_tree(PlanState *node, Oid table_oid) * Relation scan nodes can all be treated alike */ case T_SeqScanState: + case T_SampleScanState: case T_IndexScanState: case T_IndexOnlyScanState: case T_BitmapHeapScanState: diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 9892499fb7a..03c2febc3e1 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -102,6 +102,7 @@ #include "executor/nodeNestloop.h" #include "executor/nodeRecursiveunion.h" #include "executor/nodeResult.h" +#include "executor/nodeSamplescan.h" #include "executor/nodeSeqscan.h" #include "executor/nodeSetOp.h" #include "executor/nodeSort.h" @@ -190,6 +191,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags) estate, eflags); break; + case T_SampleScan: + result = (PlanState *) ExecInitSampleScan((SampleScan *) node, + estate, eflags); + break; + case T_IndexScan: result = (PlanState *) ExecInitIndexScan((IndexScan *) node, estate, eflags); @@ -406,6 +412,10 @@ ExecProcNode(PlanState *node) result = ExecSeqScan((SeqScanState *) node); break; + case T_SampleScanState: + result = ExecSampleScan((SampleScanState *) node); + break; + case T_IndexScanState: result = ExecIndexScan((IndexScanState *) node); break; @@ -644,6 +654,10 @@ ExecEndNode(PlanState *node) ExecEndSeqScan((SeqScanState *) node); break; + case T_SampleScanState: + ExecEndSampleScan((SampleScanState *) node); + break; + case T_IndexScanState: ExecEndIndexScan((IndexScanState *) node); break; diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c new file mode 100644 index 00000000000..fc89d1dca03 --- /dev/null +++ b/src/backend/executor/nodeSamplescan.c @@ -0,0 +1,256 @@ +/*------------------------------------------------------------------------- + * + * nodeSamplescan.c + * Support routines for sample scans of relations (table sampling). + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeSamplescan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/tablesample.h" +#include "executor/executor.h" +#include "executor/nodeSamplescan.h" +#include "miscadmin.h" +#include "parser/parsetree.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "utils/rel.h" +#include "utils/syscache.h" +#include "utils/tqual.h" + +static void InitScanRelation(SampleScanState *node, EState *estate, + int eflags, TableSampleClause *tablesample); +static TupleTableSlot *SampleNext(SampleScanState *node); + + +/* ---------------------------------------------------------------- + * Scan Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * SampleNext + * + * This is a workhorse for ExecSampleScan + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +SampleNext(SampleScanState *node) +{ + TupleTableSlot *slot; + TableSampleDesc *tsdesc; + HeapTuple tuple; + + /* + * get information from the scan state + */ + slot = node->ss.ss_ScanTupleSlot; + tsdesc = node->tsdesc; + + tuple = tablesample_getnext(tsdesc); + + if (tuple) + ExecStoreTuple(tuple, /* tuple to store */ + slot, /* slot to store in */ + tsdesc->heapScan->rs_cbuf, /* buffer associated with this tuple */ + false); /* don't pfree this pointer */ + else + ExecClearTuple(slot); + + return slot; +} + +/* + * SampleRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +SampleRecheck(SampleScanState *node, TupleTableSlot *slot) +{ + /* No need to recheck for SampleScan */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecSampleScan(node) + * + * Scans the relation using the sampling method and returns + * the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * ---------------------------------------------------------------- + */ +TupleTableSlot * +ExecSampleScan(SampleScanState *node) +{ + return ExecScan((ScanState *) node, + (ExecScanAccessMtd) SampleNext, + (ExecScanRecheckMtd) SampleRecheck); +} + +/* ---------------------------------------------------------------- + * InitScanRelation + * + * Set up to access the scan relation. + * ---------------------------------------------------------------- + */ +static void +InitScanRelation(SampleScanState *node, EState *estate, int eflags, + TableSampleClause *tablesample) +{ + Relation currentRelation; + + /* + * get the relation object id from the relid'th entry in the range table, + * open that relation and acquire appropriate lock on it. + */ + currentRelation = ExecOpenScanRelation(estate, + ((SampleScan *) node->ss.ps.plan)->scanrelid, + eflags); + + node->ss.ss_currentRelation = currentRelation; + + /* + * Even though we aren't going to do a conventional seqscan, it is useful + * to create a HeapScanDesc --- many of the fields in it are usable. + */ + node->ss.ss_currentScanDesc = + heap_beginscan_sampling(currentRelation, estate->es_snapshot, 0, NULL, + tablesample->tsmseqscan, + tablesample->tsmpagemode); + + /* and report the scan tuple slot's rowtype */ + ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation)); +} + + +/* ---------------------------------------------------------------- + * ExecInitSampleScan + * ---------------------------------------------------------------- + */ +SampleScanState * +ExecInitSampleScan(SampleScan *node, EState *estate, int eflags) +{ + SampleScanState *scanstate; + RangeTblEntry *rte = rt_fetch(node->scanrelid, + estate->es_range_table); + + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + Assert(rte->tablesample != NULL); + + /* + * create state structure + */ + scanstate = makeNode(SampleScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * initialize child expressions + */ + scanstate->ss.ps.targetlist = (List *) + ExecInitExpr((Expr *) node->plan.targetlist, + (PlanState *) scanstate); + scanstate->ss.ps.qual = (List *) + ExecInitExpr((Expr *) node->plan.qual, + (PlanState *) scanstate); + + /* + * tuple table initialization + */ + ExecInitResultTupleSlot(estate, &scanstate->ss.ps); + ExecInitScanTupleSlot(estate, &scanstate->ss); + + /* + * initialize scan relation + */ + InitScanRelation(scanstate, estate, eflags, rte->tablesample); + + scanstate->ss.ps.ps_TupFromTlist = false; + + /* + * Initialize result tuple type and projection info. + */ + ExecAssignResultTypeFromTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); + + scanstate->tsdesc = tablesample_init(scanstate, rte->tablesample); + + return scanstate; +} + +/* ---------------------------------------------------------------- + * ExecEndSampleScan + * + * frees any storage allocated through C routines. + * ---------------------------------------------------------------- + */ +void +ExecEndSampleScan(SampleScanState *node) +{ + /* + * Tell sampling function that we finished the scan. + */ + tablesample_end(node->tsdesc); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close heap scan + */ + heap_endscan(node->ss.ss_currentScanDesc); + + /* + * close the heap relation. + */ + ExecCloseScanRelation(node->ss.ss_currentRelation); +} + +/* ---------------------------------------------------------------- + * Join Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecReScanSampleScan + * + * Rescans the relation. + * + * ---------------------------------------------------------------- + */ +void +ExecReScanSampleScan(SampleScanState *node) +{ + heap_rescan(node->ss.ss_currentScanDesc, NULL); + + /* + * Tell sampling function to reset its state for rescan. + */ + tablesample_reset(node->tsdesc); + + ExecScanReScan(&node->ss); +} diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 25839eed945..bdc7e61935c 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -642,6 +642,22 @@ _copyCustomScan(const CustomScan *from) } /* + * _copySampleScan + */ +static SampleScan * +_copySampleScan(const SampleScan *from) +{ + SampleScan *newnode = makeNode(SampleScan); + + /* + * copy node superclass fields + */ + CopyScanFields((const Scan *) from, (Scan *) newnode); + + return newnode; +} + +/* * CopyJoinFields * * This function copies the fields of the Join node. It is used by @@ -2063,6 +2079,7 @@ _copyRangeTblEntry(const RangeTblEntry *from) COPY_SCALAR_FIELD(rtekind); COPY_SCALAR_FIELD(relid); COPY_SCALAR_FIELD(relkind); + COPY_NODE_FIELD(tablesample); COPY_NODE_FIELD(subquery); COPY_SCALAR_FIELD(security_barrier); COPY_SCALAR_FIELD(jointype); @@ -2224,6 +2241,40 @@ _copyCommonTableExpr(const CommonTableExpr *from) return newnode; } +static RangeTableSample * +_copyRangeTableSample(const RangeTableSample *from) +{ + RangeTableSample *newnode = makeNode(RangeTableSample); + + COPY_NODE_FIELD(relation); + COPY_STRING_FIELD(method); + COPY_NODE_FIELD(repeatable); + COPY_NODE_FIELD(args); + + return newnode; +} + +static TableSampleClause * +_copyTableSampleClause(const TableSampleClause *from) +{ + TableSampleClause *newnode = makeNode(TableSampleClause); + + COPY_SCALAR_FIELD(tsmid); + COPY_SCALAR_FIELD(tsmseqscan); + COPY_SCALAR_FIELD(tsmpagemode); + COPY_SCALAR_FIELD(tsminit); + COPY_SCALAR_FIELD(tsmnextblock); + COPY_SCALAR_FIELD(tsmnexttuple); + COPY_SCALAR_FIELD(tsmexaminetuple); + COPY_SCALAR_FIELD(tsmend); + COPY_SCALAR_FIELD(tsmreset); + COPY_SCALAR_FIELD(tsmcost); + COPY_NODE_FIELD(repeatable); + COPY_NODE_FIELD(args); + + return newnode; +} + static A_Expr * _copyAExpr(const A_Expr *from) { @@ -4179,6 +4230,9 @@ copyObject(const void *from) case T_CustomScan: retval = _copyCustomScan(from); break; + case T_SampleScan: + retval = _copySampleScan(from); + break; case T_Join: retval = _copyJoin(from); break; @@ -4842,6 +4896,12 @@ copyObject(const void *from) case T_CommonTableExpr: retval = _copyCommonTableExpr(from); break; + case T_RangeTableSample: + retval = _copyRangeTableSample(from); + break; + case T_TableSampleClause: + retval = _copyTableSampleClause(from); + break; case T_FuncWithArgs: retval = _copyFuncWithArgs(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index c4b3615caf3..d483221fb7a 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2359,6 +2359,7 @@ _equalRangeTblEntry(const RangeTblEntry *a, const RangeTblEntry *b) COMPARE_SCALAR_FIELD(rtekind); COMPARE_SCALAR_FIELD(relid); COMPARE_SCALAR_FIELD(relkind); + COMPARE_NODE_FIELD(tablesample); COMPARE_NODE_FIELD(subquery); COMPARE_SCALAR_FIELD(security_barrier); COMPARE_SCALAR_FIELD(jointype); @@ -2503,6 +2504,36 @@ _equalCommonTableExpr(const CommonTableExpr *a, const CommonTableExpr *b) } static bool +_equalRangeTableSample(const RangeTableSample *a, const RangeTableSample *b) +{ + COMPARE_NODE_FIELD(relation); + COMPARE_STRING_FIELD(method); + COMPARE_NODE_FIELD(repeatable); + COMPARE_NODE_FIELD(args); + + return true; +} + +static bool +_equalTableSampleClause(const TableSampleClause *a, const TableSampleClause *b) +{ + COMPARE_SCALAR_FIELD(tsmid); + COMPARE_SCALAR_FIELD(tsmseqscan); + COMPARE_SCALAR_FIELD(tsmpagemode); + COMPARE_SCALAR_FIELD(tsminit); + COMPARE_SCALAR_FIELD(tsmnextblock); + COMPARE_SCALAR_FIELD(tsmnexttuple); + COMPARE_SCALAR_FIELD(tsmexaminetuple); + COMPARE_SCALAR_FIELD(tsmend); + COMPARE_SCALAR_FIELD(tsmreset); + COMPARE_SCALAR_FIELD(tsmcost); + COMPARE_NODE_FIELD(repeatable); + COMPARE_NODE_FIELD(args); + + return true; +} + +static bool _equalXmlSerialize(const XmlSerialize *a, const XmlSerialize *b) { COMPARE_SCALAR_FIELD(xmloption); @@ -3236,6 +3267,12 @@ equal(const void *a, const void *b) case T_CommonTableExpr: retval = _equalCommonTableExpr(a, b); break; + case T_RangeTableSample: + retval = _equalRangeTableSample(a, b); + break; + case T_TableSampleClause: + retval = _equalTableSampleClause(a, b); + break; case T_FuncWithArgs: retval = _equalFuncWithArgs(a, b); break; diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index eac02159236..42d62d32d93 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -2058,6 +2058,14 @@ range_table_walker(List *rtable, switch (rte->rtekind) { case RTE_RELATION: + if (rte->tablesample) + { + if (walker(rte->tablesample->args, context)) + return true; + if (walker(rte->tablesample->repeatable, context)) + return true; + } + break; case RTE_CTE: /* nothing to do */ break; @@ -2813,6 +2821,14 @@ range_table_mutator(List *rtable, switch (rte->rtekind) { case RTE_RELATION: + if (rte->tablesample) + { + MUTATE(rte->tablesample->args, rte->tablesample->args, + List *); + MUTATE(rte->tablesample->repeatable, + rte->tablesample->repeatable, Node *); + } + break; case RTE_CTE: /* we don't bother to copy eref, aliases, etc; OK? */ break; @@ -3309,6 +3325,18 @@ raw_expression_tree_walker(Node *node, break; case T_CommonTableExpr: return walker(((CommonTableExpr *) node)->ctequery, context); + case T_RangeTableSample: + { + RangeTableSample *rts = (RangeTableSample *) node; + + if (walker(rts->relation, context)) + return true; + if (walker(rts->repeatable, context)) + return true; + if (walker(rts->args, context)) + return true; + } + break; default: elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index fe868b889d9..7918553da0a 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -592,6 +592,14 @@ _outCustomScan(StringInfo str, const CustomScan *node) } static void +_outSampleScan(StringInfo str, const SampleScan *node) +{ + WRITE_NODE_TYPE("SAMPLESCAN"); + + _outScanInfo(str, (const Scan *) node); +} + +static void _outJoin(StringInfo str, const Join *node) { WRITE_NODE_TYPE("JOIN"); @@ -2445,6 +2453,36 @@ _outCommonTableExpr(StringInfo str, const CommonTableExpr *node) } static void +_outRangeTableSample(StringInfo str, const RangeTableSample *node) +{ + WRITE_NODE_TYPE("RANGETABLESAMPLE"); + + WRITE_NODE_FIELD(relation); + WRITE_STRING_FIELD(method); + WRITE_NODE_FIELD(repeatable); + WRITE_NODE_FIELD(args); +} + +static void +_outTableSampleClause(StringInfo str, const TableSampleClause *node) +{ + WRITE_NODE_TYPE("TABLESAMPLECLAUSE"); + + WRITE_OID_FIELD(tsmid); + WRITE_BOOL_FIELD(tsmseqscan); + WRITE_BOOL_FIELD(tsmpagemode); + WRITE_OID_FIELD(tsminit); + WRITE_OID_FIELD(tsmnextblock); + WRITE_OID_FIELD(tsmnexttuple); + WRITE_OID_FIELD(tsmexaminetuple); + WRITE_OID_FIELD(tsmend); + WRITE_OID_FIELD(tsmreset); + WRITE_OID_FIELD(tsmcost); + WRITE_NODE_FIELD(repeatable); + WRITE_NODE_FIELD(args); +} + +static void _outSetOperationStmt(StringInfo str, const SetOperationStmt *node) { WRITE_NODE_TYPE("SETOPERATIONSTMT"); @@ -2474,6 +2512,7 @@ _outRangeTblEntry(StringInfo str, const RangeTblEntry *node) case RTE_RELATION: WRITE_OID_FIELD(relid); WRITE_CHAR_FIELD(relkind); + WRITE_NODE_FIELD(tablesample); break; case RTE_SUBQUERY: WRITE_NODE_FIELD(subquery); @@ -2973,6 +3012,9 @@ _outNode(StringInfo str, const void *obj) case T_CustomScan: _outCustomScan(str, obj); break; + case T_SampleScan: + _outSampleScan(str, obj); + break; case T_Join: _outJoin(str, obj); break; @@ -3319,6 +3361,12 @@ _outNode(StringInfo str, const void *obj) case T_CommonTableExpr: _outCommonTableExpr(str, obj); break; + case T_RangeTableSample: + _outRangeTableSample(str, obj); + break; + case T_TableSampleClause: + _outTableSampleClause(str, obj); + break; case T_SetOperationStmt: _outSetOperationStmt(str, obj); break; diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 8136306e1e5..c8fb894a75a 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -352,6 +352,46 @@ _readCommonTableExpr(void) } /* + * _readRangeTableSample + */ +static RangeTableSample * +_readRangeTableSample(void) +{ + READ_LOCALS(RangeTableSample); + + READ_NODE_FIELD(relation); + READ_STRING_FIELD(method); + READ_NODE_FIELD(repeatable); + READ_NODE_FIELD(args); + + READ_DONE(); +} + +/* + * _readTableSampleClause + */ +static TableSampleClause * +_readTableSampleClause(void) +{ + READ_LOCALS(TableSampleClause); + + READ_OID_FIELD(tsmid); + READ_BOOL_FIELD(tsmseqscan); + READ_BOOL_FIELD(tsmpagemode); + READ_OID_FIELD(tsminit); + READ_OID_FIELD(tsmnextblock); + READ_OID_FIELD(tsmnexttuple); + READ_OID_FIELD(tsmexaminetuple); + READ_OID_FIELD(tsmend); + READ_OID_FIELD(tsmreset); + READ_OID_FIELD(tsmcost); + READ_NODE_FIELD(repeatable); + READ_NODE_FIELD(args); + + READ_DONE(); +} + +/* * _readSetOperationStmt */ static SetOperationStmt * @@ -1255,6 +1295,7 @@ _readRangeTblEntry(void) case RTE_RELATION: READ_OID_FIELD(relid); READ_CHAR_FIELD(relkind); + READ_NODE_FIELD(tablesample); break; case RTE_SUBQUERY: READ_NODE_FIELD(subquery); @@ -1351,6 +1392,10 @@ parseNodeString(void) return_value = _readRowMarkClause(); else if (MATCH("COMMONTABLEEXPR", 15)) return_value = _readCommonTableExpr(); + else if (MATCH("RANGETABLESAMPLE", 16)) + return_value = _readRangeTableSample(); + else if (MATCH("TABLESAMPLECLAUSE", 17)) + return_value = _readTableSampleClause(); else if (MATCH("SETOPERATIONSTMT", 16)) return_value = _readSetOperationStmt(); else if (MATCH("ALIAS", 5)) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 9caca94f64b..4cd1bf65e74 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -71,6 +71,10 @@ static void set_plain_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); static void set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); +static void set_tablesample_rel_size(PlannerInfo *root, RelOptInfo *rel, + RangeTblEntry *rte); +static void set_tablesample_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, + RangeTblEntry *rte); static void set_foreign_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); static void set_foreign_pathlist(PlannerInfo *root, RelOptInfo *rel, @@ -265,6 +269,11 @@ set_rel_size(PlannerInfo *root, RelOptInfo *rel, /* Foreign table */ set_foreign_size(root, rel, rte); } + else if (rte->tablesample != NULL) + { + /* Sampled relation */ + set_tablesample_rel_size(root, rel, rte); + } else { /* Plain relation */ @@ -332,6 +341,11 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, /* Foreign table */ set_foreign_pathlist(root, rel, rte); } + else if (rte->tablesample != NULL) + { + /* Build sample scan on relation */ + set_tablesample_rel_pathlist(root, rel, rte); + } else { /* Plain relation */ @@ -418,6 +432,41 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) } /* + * set_tablesample_rel_size + * Set size estimates for a sampled relation. + */ +static void +set_tablesample_rel_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) +{ + /* Mark rel with estimated output rows, width, etc */ + set_baserel_size_estimates(root, rel); +} + +/* + * set_tablesample_rel_pathlist + * Build access paths for a sampled relation + * + * There is only one possible path - sampling scan + */ +static void +set_tablesample_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) +{ + Relids required_outer; + Path *path; + + /* + * We don't support pushing join clauses into the quals of a seqscan, but + * it could still have required parameterization due to LATERAL refs in + * its tlist. + */ + required_outer = rel->lateral_relids; + + /* We only do sample scan if it was requested */ + path = create_samplescan_path(root, rel, required_outer); + rel->pathlist = list_make1(path); +} + +/* * set_foreign_size * Set size estimates for a foreign table RTE */ diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 1a0d358c5f7..c2b2b7622a6 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -220,6 +220,73 @@ cost_seqscan(Path *path, PlannerInfo *root, } /* + * cost_samplescan + * Determines and returns the cost of scanning a relation using sampling. + * + * From planner/optimizer perspective, we don't care all that much about cost + * itself since there is always only one scan path to consider when sampling + * scan is present, but number of rows estimation is still important. + * + * 'baserel' is the relation to be scanned + * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL + */ +void +cost_samplescan(Path *path, PlannerInfo *root, RelOptInfo *baserel) +{ + Cost startup_cost = 0; + Cost run_cost = 0; + double spc_seq_page_cost, + spc_random_page_cost, + spc_page_cost; + QualCost qpqual_cost; + Cost cpu_per_tuple; + BlockNumber pages; + double tuples; + RangeTblEntry *rte = planner_rt_fetch(baserel->relid, root); + TableSampleClause *tablesample = rte->tablesample; + + /* Should only be applied to base relations */ + Assert(baserel->relid > 0); + Assert(baserel->rtekind == RTE_RELATION); + + /* Mark the path with the correct row estimate */ + if (path->param_info) + path->rows = path->param_info->ppi_rows; + else + path->rows = baserel->rows; + + /* Call the sampling method's costing function. */ + OidFunctionCall6(tablesample->tsmcost, PointerGetDatum(root), + PointerGetDatum(path), PointerGetDatum(baserel), + PointerGetDatum(tablesample->args), + PointerGetDatum(&pages), PointerGetDatum(&tuples)); + + /* fetch estimated page cost for tablespace containing table */ + get_tablespace_page_costs(baserel->reltablespace, + &spc_random_page_cost, + &spc_seq_page_cost); + + + spc_page_cost = tablesample->tsmseqscan ? spc_seq_page_cost : + spc_random_page_cost; + + /* + * disk costs + */ + run_cost += spc_page_cost * pages; + + /* CPU costs */ + get_restriction_qual_cost(root, baserel, path->param_info, &qpqual_cost); + + startup_cost += qpqual_cost.startup; + cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; + run_cost += cpu_per_tuple * tuples; + + path->startup_cost = startup_cost; + path->total_cost = startup_cost + run_cost; +} + +/* * cost_index * Determines and returns the cost of scanning a relation using an index. * diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 783e34b4fbc..c6095167e80 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -59,6 +59,8 @@ static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path static Plan *create_unique_plan(PlannerInfo *root, UniquePath *best_path); static SeqScan *create_seqscan_plan(PlannerInfo *root, Path *best_path, List *tlist, List *scan_clauses); +static SampleScan *create_samplescan_plan(PlannerInfo *root, Path *best_path, + List *tlist, List *scan_clauses); static Scan *create_indexscan_plan(PlannerInfo *root, IndexPath *best_path, List *tlist, List *scan_clauses, bool indexonly); static BitmapHeapScan *create_bitmap_scan_plan(PlannerInfo *root, @@ -101,6 +103,7 @@ static List *order_qual_clauses(PlannerInfo *root, List *clauses); static void copy_path_costsize(Plan *dest, Path *src); static void copy_plan_costsize(Plan *dest, Plan *src); static SeqScan *make_seqscan(List *qptlist, List *qpqual, Index scanrelid); +static SampleScan *make_samplescan(List *qptlist, List *qpqual, Index scanrelid); static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig, List *indexorderby, List *indexorderbyorig, Oid *indexorderbyops, @@ -229,6 +232,7 @@ create_plan_recurse(PlannerInfo *root, Path *best_path) switch (best_path->pathtype) { case T_SeqScan: + case T_SampleScan: case T_IndexScan: case T_IndexOnlyScan: case T_BitmapHeapScan: @@ -344,6 +348,13 @@ create_scan_plan(PlannerInfo *root, Path *best_path) scan_clauses); break; + case T_SampleScan: + plan = (Plan *) create_samplescan_plan(root, + best_path, + tlist, + scan_clauses); + break; + case T_IndexScan: plan = (Plan *) create_indexscan_plan(root, (IndexPath *) best_path, @@ -547,6 +558,7 @@ disuse_physical_tlist(PlannerInfo *root, Plan *plan, Path *path) switch (path->pathtype) { case T_SeqScan: + case T_SampleScan: case T_IndexScan: case T_IndexOnlyScan: case T_BitmapHeapScan: @@ -1134,6 +1146,45 @@ create_seqscan_plan(PlannerInfo *root, Path *best_path, } /* + * create_samplescan_plan + * Returns a samplecan plan for the base relation scanned by 'best_path' + * with restriction clauses 'scan_clauses' and targetlist 'tlist'. + */ +static SampleScan * +create_samplescan_plan(PlannerInfo *root, Path *best_path, + List *tlist, List *scan_clauses) +{ + SampleScan *scan_plan; + Index scan_relid = best_path->parent->relid; + + /* it should be a base rel with tablesample clause... */ + Assert(scan_relid > 0); + Assert(best_path->parent->rtekind == RTE_RELATION); + Assert(best_path->pathtype == T_SampleScan); + + /* Sort clauses into best execution order */ + scan_clauses = order_qual_clauses(root, scan_clauses); + + /* Reduce RestrictInfo list to bare expressions; ignore pseudoconstants */ + scan_clauses = extract_actual_clauses(scan_clauses, false); + + /* Replace any outer-relation variables with nestloop params */ + if (best_path->param_info) + { + scan_clauses = (List *) + replace_nestloop_params(root, (Node *) scan_clauses); + } + + scan_plan = make_samplescan(tlist, + scan_clauses, + scan_relid); + + copy_path_costsize(&scan_plan->plan, best_path); + + return scan_plan; +} + +/* * create_indexscan_plan * Returns an indexscan plan for the base relation scanned by 'best_path' * with restriction clauses 'scan_clauses' and targetlist 'tlist'. @@ -3378,6 +3429,24 @@ make_seqscan(List *qptlist, return node; } +static SampleScan * +make_samplescan(List *qptlist, + List *qpqual, + Index scanrelid) +{ + SampleScan *node = makeNode(SampleScan); + Plan *plan = &node->plan; + + /* cost should be inserted by caller */ + plan->targetlist = qptlist; + plan->qual = qpqual; + plan->lefttree = NULL; + plan->righttree = NULL; + node->scanrelid = scanrelid; + + return node; +} + static IndexScan * make_indexscan(List *qptlist, List *qpqual, diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 8de57c8e6bb..9ba10516bb2 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -60,6 +60,7 @@ planner_hook_type planner_hook = NULL; #define EXPRKIND_LIMIT 6 #define EXPRKIND_APPINFO 7 #define EXPRKIND_PHV 8 +#define EXPRKIND_TABLESAMPLE 9 /* Passthrough data for standard_qp_callback */ typedef struct @@ -486,7 +487,19 @@ subquery_planner(PlannerGlobal *glob, Query *parse, RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); int kind; - if (rte->rtekind == RTE_SUBQUERY) + if (rte->rtekind == RTE_RELATION) + { + if (rte->tablesample) + { + rte->tablesample->args = (List *) + preprocess_expression(root, (Node *) rte->tablesample->args, + EXPRKIND_TABLESAMPLE); + rte->tablesample->repeatable = (Node *) + preprocess_expression(root, rte->tablesample->repeatable, + EXPRKIND_TABLESAMPLE); + } + } + else if (rte->rtekind == RTE_SUBQUERY) { /* * We don't want to do all preprocessing yet on the subquery's diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 517409d28a0..73b19888369 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -451,6 +451,17 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) fix_scan_list(root, splan->plan.qual, rtoffset); } break; + case T_SampleScan: + { + SampleScan *splan = (SampleScan *) plan; + + splan->scanrelid += rtoffset; + splan->plan.targetlist = + fix_scan_list(root, splan->plan.targetlist, rtoffset); + splan->plan.qual = + fix_scan_list(root, splan->plan.qual, rtoffset); + } + break; case T_IndexScan: { IndexScan *splan = (IndexScan *) plan; diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index afccee53acf..2f7f5c0df0e 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2167,6 +2167,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, break; case T_SeqScan: + case T_SampleScan: context.paramids = bms_add_members(context.paramids, scan_params); break; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index faca30b322d..ea7a47bdf45 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -706,6 +706,26 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer) } /* + * create_samplescan_path + * Like seqscan but uses sampling function while scanning. + */ +Path * +create_samplescan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer) +{ + Path *pathnode = makeNode(Path); + + pathnode->pathtype = T_SampleScan; + pathnode->parent = rel; + pathnode->param_info = get_baserel_parampathinfo(root, rel, + required_outer); + pathnode->pathkeys = NIL; /* samplescan has unordered result */ + + cost_samplescan(pathnode, root, rel); + + return pathnode; +} + +/* * create_index_path * Creates a path node for an index scan. * @@ -1778,6 +1798,8 @@ reparameterize_path(PlannerInfo *root, Path *path, case T_SubqueryScan: return create_subqueryscan_path(root, rel, path->pathkeys, required_outer); + case T_SampleScan: + return (Path *) create_samplescan_path(root, rel, required_outer); default: break; } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 2dce87879e7..14397830686 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -454,6 +454,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type <range> relation_expr %type <range> relation_expr_opt_alias %type <target> target_el single_set_clause set_target insert_column_item +%type <node> relation_expr_tablesample tablesample_clause opt_repeatable_clause %type <str> generic_option_name %type <node> generic_option_arg @@ -625,8 +626,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); STATEMENT STATISTICS STDIN STDOUT STORAGE STRICT_P STRIP_P SUBSTRING SYMMETRIC SYSID SYSTEM_P - TABLE TABLES TABLESPACE TEMP TEMPLATE TEMPORARY TEXT_P THEN TIME TIMESTAMP - TO TRAILING TRANSACTION TRANSFORM TREAT TRIGGER TRIM TRUE_P + TABLE TABLES TABLESAMPLE TABLESPACE TEMP TEMPLATE TEMPORARY TEXT_P THEN + TIME TIMESTAMP TO TRAILING TRANSACTION TRANSFORM TREAT TRIGGER TRIM TRUE_P TRUNCATE TRUSTED TYPE_P TYPES_P UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNLOGGED @@ -10386,6 +10387,10 @@ table_ref: relation_expr opt_alias_clause $1->alias = $2; $$ = (Node *) $1; } + | relation_expr_tablesample + { + $$ = (Node *) $1; + } | func_table func_alias_clause { RangeFunction *n = (RangeFunction *) $1; @@ -10711,6 +10716,32 @@ relation_expr_opt_alias: relation_expr %prec UMINUS } ; + +relation_expr_tablesample: relation_expr opt_alias_clause tablesample_clause + { + RangeTableSample *n = (RangeTableSample *) $3; + n->relation = $1; + n->relation->alias = $2; + $$ = (Node *) n; + } + ; + +tablesample_clause: + TABLESAMPLE ColId '(' expr_list ')' opt_repeatable_clause + { + RangeTableSample *n = makeNode(RangeTableSample); + n->method = $2; + n->args = $4; + n->repeatable = $6; + $$ = (Node *) n; + } + ; + +opt_repeatable_clause: + REPEATABLE '(' a_expr ')' { $$ = (Node *) $3; } + | /*EMPTY*/ { $$ = NULL; } + ; + /* * func_table represents a function invocation in a FROM list. It can be * a plain function call, like "foo(...)", or a ROWS FROM expression with @@ -13804,6 +13835,7 @@ type_func_name_keyword: | OVERLAPS | RIGHT | SIMILAR + | TABLESAMPLE | VERBOSE ; diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c index 73c505ed85b..6b1bbe57d0e 100644 --- a/src/backend/parser/parse_clause.c +++ b/src/backend/parser/parse_clause.c @@ -17,6 +17,7 @@ #include "access/heapam.h" #include "catalog/catalog.h" +#include "access/htup_details.h" #include "catalog/heap.h" #include "catalog/pg_constraint.h" #include "catalog/pg_type.h" @@ -31,6 +32,7 @@ #include "parser/parse_coerce.h" #include "parser/parse_collate.h" #include "parser/parse_expr.h" +#include "parser/parse_func.h" #include "parser/parse_oper.h" #include "parser/parse_relation.h" #include "parser/parse_target.h" @@ -39,6 +41,7 @@ #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/rel.h" +#include "utils/syscache.h" /* Convenience macro for the most common makeNamespaceItem() case */ @@ -419,6 +422,39 @@ transformJoinOnClause(ParseState *pstate, JoinExpr *j, List *namespace) return result; } +static RangeTblEntry * +transformTableSampleEntry(ParseState *pstate, RangeTableSample *rv) +{ + RangeTblEntry *rte = NULL; + CommonTableExpr *cte = NULL; + TableSampleClause *tablesample = NULL; + + /* if relation has an unqualified name, it might be a CTE reference */ + if (!rv->relation->schemaname) + { + Index levelsup; + cte = scanNameSpaceForCTE(pstate, rv->relation->relname, &levelsup); + } + + /* We first need to build a range table entry */ + if (!cte) + rte = transformTableEntry(pstate, rv->relation); + + if (!rte || + (rte->relkind != RELKIND_RELATION && + rte->relkind != RELKIND_MATVIEW)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("TABLESAMPLE clause can only be used on tables and materialized views"), + parser_errposition(pstate, rv->relation->location))); + + tablesample = ParseTableSample(pstate, rv->method, rv->repeatable, + rv->args, rv->relation->location); + rte->tablesample = tablesample; + + return rte; +} + /* * transformTableEntry --- transform a RangeVar (simple relation reference) */ @@ -1127,6 +1163,26 @@ transformFromClauseItem(ParseState *pstate, Node *n, return (Node *) j; } + else if (IsA(n, RangeTableSample)) + { + /* Tablesample reference */ + RangeTableSample *rv = (RangeTableSample *) n; + RangeTblRef *rtr; + RangeTblEntry *rte = NULL; + int rtindex; + + rte = transformTableSampleEntry(pstate, rv); + + /* assume new rte is at end */ + rtindex = list_length(pstate->p_rtable); + Assert(rte == rt_fetch(rtindex, pstate->p_rtable)); + *top_rte = rte; + *top_rti = rtindex; + *namespace = list_make1(makeDefaultNSItem(rte)); + rtr = makeNode(RangeTblRef); + rtr->rtindex = rtindex; + return (Node *) rtr; + } else elog(ERROR, "unrecognized node type: %d", (int) nodeTag(n)); return NULL; /* can't get here, keep compiler quiet */ diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index f7affebf84e..fa50f92d8dd 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -18,6 +18,7 @@ #include "catalog/pg_aggregate.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" +#include "catalog/pg_tablesample_method.h" #include "funcapi.h" #include "lib/stringinfo.h" #include "nodes/makefuncs.h" @@ -26,6 +27,7 @@ #include "parser/parse_clause.h" #include "parser/parse_coerce.h" #include "parser/parse_func.h" +#include "parser/parse_expr.h" #include "parser/parse_relation.h" #include "parser/parse_target.h" #include "parser/parse_type.h" @@ -767,6 +769,147 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs, } +/* + * ParseTableSample + * + * Parse TABLESAMPLE clause and process the arguments + */ +TableSampleClause * +ParseTableSample(ParseState *pstate, char *samplemethod, Node *repeatable, + List *sampleargs, int location) +{ + HeapTuple tuple; + Form_pg_tablesample_method tsm; + Form_pg_proc procform; + TableSampleClause *tablesample; + List *fargs; + ListCell *larg; + int nargs, initnargs; + Oid init_arg_types[FUNC_MAX_ARGS]; + + /* Load the tablesample method */ + tuple = SearchSysCache1(TABLESAMPLEMETHODNAME, PointerGetDatum(samplemethod)); + if (!HeapTupleIsValid(tuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("tablesample method \"%s\" does not exist", + samplemethod), + parser_errposition(pstate, location))); + + tablesample = makeNode(TableSampleClause); + tablesample->tsmid = HeapTupleGetOid(tuple); + + tsm = (Form_pg_tablesample_method) GETSTRUCT(tuple); + + tablesample->tsmseqscan = tsm->tsmseqscan; + tablesample->tsmpagemode = tsm->tsmpagemode; + tablesample->tsminit = tsm->tsminit; + tablesample->tsmnextblock = tsm->tsmnextblock; + tablesample->tsmnexttuple = tsm->tsmnexttuple; + tablesample->tsmexaminetuple = tsm->tsmexaminetuple; + tablesample->tsmend = tsm->tsmend; + tablesample->tsmreset = tsm->tsmreset; + tablesample->tsmcost = tsm->tsmcost; + + ReleaseSysCache(tuple); + + /* Validate the parameters against init function definition. */ + tuple = SearchSysCache1(PROCOID, + ObjectIdGetDatum(tablesample->tsminit)); + + if (!HeapTupleIsValid(tuple)) /* should not happen */ + elog(ERROR, "cache lookup failed for function %u", + tablesample->tsminit); + + procform = (Form_pg_proc) GETSTRUCT(tuple); + initnargs = procform->pronargs; + Assert(initnargs >= 3); + + /* + * First parameter is used to pass the SampleScanState, second is + * seed (REPEATABLE), skip the processing for them here, just assert + * that the types are correct. + */ + Assert(procform->proargtypes.values[0] == INTERNALOID); + Assert(procform->proargtypes.values[1] == INT4OID); + initnargs -= 2; + memcpy(init_arg_types, procform->proargtypes.values + 2, + initnargs * sizeof(Oid)); + + /* Now we are done with the catalog */ + ReleaseSysCache(tuple); + + /* Process repeatable (seed) */ + if (repeatable != NULL) + { + Node *arg = repeatable; + + if (arg && IsA(arg, A_Const)) + { + A_Const *con = (A_Const *) arg; + + if (con->val.type == T_Null) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("REPEATABLE clause must be NOT NULL numeric value"), + parser_errposition(pstate, con->location))); + + } + + arg = transformExpr(pstate, arg, EXPR_KIND_FROM_FUNCTION); + arg = coerce_to_specific_type(pstate, arg, INT4OID, "REPEATABLE"); + tablesample->repeatable = arg; + } + else + tablesample->repeatable = NULL; + + /* Check user provided expected number of arguments. */ + if (list_length(sampleargs) != initnargs) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_ARGUMENTS), + errmsg_plural("tablesample method \"%s\" expects %d argument got %d", + "tablesample method \"%s\" expects %d arguments got %d", + initnargs, + samplemethod, + initnargs, list_length(sampleargs)), + parser_errposition(pstate, location))); + + /* Transform the arguments, typecasting them as needed. */ + fargs = NIL; + nargs = 0; + foreach(larg, sampleargs) + { + Node *inarg = (Node *) lfirst(larg); + Node *arg = transformExpr(pstate, inarg, EXPR_KIND_FROM_FUNCTION); + Oid argtype = exprType(arg); + + if (argtype != init_arg_types[nargs]) + { + if (!can_coerce_type(1, &argtype, &init_arg_types[nargs], + COERCION_IMPLICIT)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("wrong parameter %d for tablesample method \"%s\"", + nargs + 1, samplemethod), + errdetail("Expected type %s got %s.", + format_type_be(init_arg_types[nargs]), + format_type_be(argtype)), + parser_errposition(pstate, exprLocation(inarg)))); + + arg = coerce_type(pstate, arg, argtype, init_arg_types[nargs], -1, + COERCION_IMPLICIT, COERCE_IMPLICIT_CAST, -1); + } + + fargs = lappend(fargs, arg); + nargs++; + } + + /* Pass the arguments down */ + tablesample->args = fargs; + + return tablesample; +} + /* func_match_argtypes() * * Given a list of candidate functions (having the right name and number diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 39302a410b8..e27afd1a3e0 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -2209,6 +2209,9 @@ view_query_is_auto_updatable(Query *viewquery, bool check_cols) base_rte->relkind != RELKIND_VIEW)) return gettext_noop("Views that do not select from a single table or view are not automatically updatable."); + if (base_rte->tablesample) + return gettext_noop("Views containing TABLESAMPLE are not automatically updatable."); + /* * Check that the view has at least one updatable column. This is required * for INSERT/UPDATE but not for DELETE. diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 903e80aea35..298eebf5e67 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -32,6 +32,7 @@ #include "catalog/pg_opclass.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" +#include "catalog/pg_tablesample_method.h" #include "catalog/pg_trigger.h" #include "catalog/pg_type.h" #include "commands/defrem.h" @@ -345,6 +346,8 @@ static void make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, int prettyFlags); static void make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, int prettyFlags, int wrapColumn); +static void get_tablesample_def(TableSampleClause *tablesample, + deparse_context *context); static void get_query_def(Query *query, StringInfo buf, List *parentnamespace, TupleDesc resultDesc, int prettyFlags, int wrapColumn, int startIndent); @@ -4220,6 +4223,50 @@ make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, heap_close(ev_relation, AccessShareLock); } +/* ---------- + * get_tablesample_def - Convert TableSampleClause back to SQL + * ---------- + */ +static void +get_tablesample_def(TableSampleClause *tablesample, deparse_context *context) +{ + StringInfo buf = context->buf; + HeapTuple tuple; + Form_pg_tablesample_method tsm; + char *tsmname; + int nargs; + ListCell *l; + + /* Load the tablesample method */ + tuple = SearchSysCache1(TABLESAMPLEMETHODOID, ObjectIdGetDatum(tablesample->tsmid)); + if (!HeapTupleIsValid(tuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("cache lookup failed for tablesample method %u", + tablesample->tsmid))); + + tsm = (Form_pg_tablesample_method) GETSTRUCT(tuple); + tsmname = NameStr(tsm->tsmname); + appendStringInfo(buf, " TABLESAMPLE %s (", quote_identifier(tsmname)); + + ReleaseSysCache(tuple); + + nargs = 0; + foreach(l, tablesample->args) + { + if (nargs++ > 0) + appendStringInfoString(buf, ", "); + get_rule_expr((Node *) lfirst(l), context, true); + } + appendStringInfoChar(buf, ')'); + + if (tablesample->repeatable != NULL) + { + appendStringInfoString(buf, " REPEATABLE ("); + get_rule_expr(tablesample->repeatable, context, true); + appendStringInfoChar(buf, ')'); + } +} /* ---------- * get_query_def - Parse back one query parsetree @@ -8529,6 +8576,9 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) only_marker(rte), generate_relation_name(rte->relid, context->namespaces)); + + if (rte->tablesample) + get_tablesample_def(rte->tablesample, context); break; case RTE_SUBQUERY: /* Subquery RTE */ diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 1dc293297d9..f259751e157 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -32,6 +32,7 @@ #include "catalog/pg_range.h" #include "catalog/pg_statistic.h" #include "catalog/pg_transform.h" +#include "catalog/pg_tablesample_method.h" #include "catalog/pg_type.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -2996,3 +2997,29 @@ get_range_subtype(Oid rangeOid) else return InvalidOid; } + +/* ---------- PG_TABLESAMPLE_METHOD CACHE ---------- */ + +/* + * get_tablesample_method_name - given a tablesample method OID, + * look up the name or NULL if not found + */ +char * +get_tablesample_method_name(Oid tsmid) +{ + HeapTuple tuple; + + tuple = SearchSysCache1(TABLESAMPLEMETHODOID, ObjectIdGetDatum(tsmid)); + if (HeapTupleIsValid(tuple)) + { + Form_pg_tablesample_method tup = + (Form_pg_tablesample_method) GETSTRUCT(tuple); + char *result; + + result = pstrdup(NameStr(tup->tsmname)); + ReleaseSysCache(tuple); + return result; + } + else + return NULL; +} diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index f58e1cebf2a..7def1be32ae 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -56,6 +56,7 @@ #include "catalog/pg_shseclabel.h" #include "catalog/pg_replication_origin.h" #include "catalog/pg_statistic.h" +#include "catalog/pg_tablesample_method.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_transform.h" #include "catalog/pg_ts_config.h" @@ -666,6 +667,28 @@ static const struct cachedesc cacheinfo[] = { }, 128 }, + {TableSampleMethodRelationId, /* TABLESAMPLEMETHODNAME */ + TableSampleMethodNameIndexId, + 1, + { + Anum_pg_tablesample_method_tsmname, + 0, + 0, + 0, + }, + 2 + }, + {TableSampleMethodRelationId, /* TABLESAMPLEMETHODOID */ + TableSampleMethodOidIndexId, + 1, + { + ObjectIdAttributeNumber, + 0, + 0, + 0, + }, + 2 + }, {TableSpaceRelationId, /* TABLESPACEOID */ TablespaceOidIndexId, 1, diff --git a/src/backend/utils/misc/sampling.c b/src/backend/utils/misc/sampling.c index 1eeabaf1588..9becc63bf82 100644 --- a/src/backend/utils/misc/sampling.c +++ b/src/backend/utils/misc/sampling.c @@ -46,6 +46,8 @@ BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize, bs->n = samplesize; bs->t = 0; /* blocks scanned so far */ bs->m = 0; /* blocks selected so far */ + + sampler_random_init_state(randseed, bs->randstate); } bool @@ -92,7 +94,7 @@ BlockSampler_Next(BlockSampler bs) * less than k, which means that we cannot fail to select enough blocks. *---------- */ - V = sampler_random_fract(); + V = sampler_random_fract(bs->randstate); p = 1.0 - (double) k / (double) K; while (V < p) { @@ -126,8 +128,14 @@ BlockSampler_Next(BlockSampler bs) void reservoir_init_selection_state(ReservoirState rs, int n) { + /* + * Reservoir sampling is not used anywhere where it would need to return + * repeatable results so we can initialize it randomly. + */ + sampler_random_init_state(random(), rs->randstate); + /* Initial value of W (for use when Algorithm Z is first applied) */ - *rs = exp(-log(sampler_random_fract()) / n); + rs->W = exp(-log(sampler_random_fract(rs->randstate)) / n); } double @@ -142,7 +150,7 @@ reservoir_get_next_S(ReservoirState rs, double t, int n) double V, quot; - V = sampler_random_fract(); /* Generate V */ + V = sampler_random_fract(rs->randstate); /* Generate V */ S = 0; t += 1; /* Note: "num" in Vitter's code is always equal to t - n */ @@ -158,7 +166,7 @@ reservoir_get_next_S(ReservoirState rs, double t, int n) else { /* Now apply Algorithm Z */ - double W = *rs; + double W = rs->W; double term = t - (double) n + 1; for (;;) @@ -174,7 +182,7 @@ reservoir_get_next_S(ReservoirState rs, double t, int n) tmp; /* Generate U and X */ - U = sampler_random_fract(); + U = sampler_random_fract(rs->randstate); X = t * (W - 1.0); S = floor(X); /* S is tentatively set to floor(X) */ /* Test if U <= h(S)/cg(X) in the manner of (6.3) */ @@ -203,11 +211,11 @@ reservoir_get_next_S(ReservoirState rs, double t, int n) y *= numer / denom; denom -= 1; } - W = exp(-log(sampler_random_fract()) / n); /* Generate W in advance */ + W = exp(-log(sampler_random_fract(rs->randstate)) / n); /* Generate W in advance */ if (exp(log(y) / n) <= (t + X) / t) break; } - *rs = W; + rs->W = W; } return S; } @@ -217,10 +225,17 @@ reservoir_get_next_S(ReservoirState rs, double t, int n) * Random number generator used by sampling *---------- */ +void +sampler_random_init_state(long seed, SamplerRandomState randstate) +{ + randstate[0] = RAND48_SEED_0; + randstate[1] = (unsigned short) seed; + randstate[2] = (unsigned short) (seed >> 16); +} /* Select a random value R uniformly distributed in (0 - 1) */ double -sampler_random_fract() +sampler_random_fract(SamplerRandomState randstate) { - return ((double) random() + 1) / ((double) MAX_RANDOM_VALUE + 2); + return pg_erand48(randstate); } |