
Commit 92ee252

Tweak processing of multiple-index-scan plans to reduce overhead when
handling many-way scans: instead of re-evaluating all prior indexscan
quals to see if a tuple has been fetched more than once, use a hash
table indexed by tuple CTID. But fall back to the old way if the hash
table grows to exceed SortMem.
1 parent 38e2bf6 commit 92ee252
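
Before the diff itself, here is a minimal standalone sketch of the idea (illustration only, with hypothetical types and names; it is not the patch below and does not use PostgreSQL's dynahash API). Tuple IDs that have already been returned are remembered in a "seen" set; once the set would exceed its memory budget it is marked overflowed, and the caller falls back to re-checking the earlier scans' quals for every tuple, as the pre-7.4 code did.

#include <stdbool.h>
#include <string.h>

/* Hypothetical stand-in for ItemPointerData (a heap tuple's CTID). */
typedef struct
{
    unsigned int   block;
    unsigned short offset;
} TupleId;

/* Hypothetical "already returned" set; a plain array stands in for the hash table. */
typedef struct
{
    TupleId *tids;          /* TIDs of tuples returned so far */
    int      nused;         /* number of entries currently stored */
    int      maxentries;    /* cap derived from a SortMem-like memory budget */
    bool     overflowed;    /* true => caller must re-check prior quals instead */
} SeenSet;

/*
 * Return true if tid was already returned; otherwise remember it.
 * If remembering it would exceed the budget, mark the set overflowed so
 * the caller switches to the re-evaluate-the-quals fallback.
 */
static bool
seen_before(SeenSet *set, TupleId tid)
{
    int i;

    for (i = 0; i < set->nused; i++)
        if (memcmp(&set->tids[i], &tid, sizeof(TupleId)) == 0)
            return true;

    if (set->nused >= set->maxentries)
    {
        set->overflowed = true;
        return false;
    }
    set->tids[set->nused++] = tid;
    return false;
}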

File tree: 2 files changed, +153 -31 lines changed


src/backend/executor/nodeIndexscan.c (+148 -30)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql/src/backend/executor/nodeIndexscan.c,v 1.82 2003/08/04 02:39:59 momjian Exp $
+ *    $Header: /cvsroot/pgsql/src/backend/executor/nodeIndexscan.c,v 1.83 2003/08/22 20:26:43 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -28,19 +28,51 @@
 #include "access/heapam.h"
 #include "executor/execdebug.h"
 #include "executor/nodeIndexscan.h"
+#include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
 #include "optimizer/clauses.h"
 #include "parser/parsetree.h"
 
-/* ----------------
- *    Misc stuff to move to executor.h soon -cim 6/5/90
- * ----------------
- */
+
 #define NO_OP       0
 #define LEFT_OP     1
 #define RIGHT_OP    2
 
+/*
+ * In a multiple-index plan, we must take care to return any given tuple
+ * only once, even if it matches conditions of several index scans. Our
+ * preferred way to do this is to record already-returned tuples in a hash
+ * table (using the TID as unique identifier). However, in a very large
+ * scan this could conceivably run out of memory. We limit the hash table
+ * to no more than SortMem KB; if it grows past that, we fall back to the
+ * pre-7.4 technique: evaluate the prior-scan index quals again for each
+ * tuple (which is space-efficient, but slow).
+ *
+ * When scanning backwards, we use scannum to determine when to emit the
+ * tuple --- we have to re-emit a tuple in the same scan as it was first
+ * encountered.
+ *
+ * Note: this code would break if the planner were ever to create a multiple
+ * index plan with overall backwards direction, because the hashtable code
+ * will emit a tuple the first time it is encountered (which would be the
+ * highest scan in which it matches the index), but the evaluate-the-quals
+ * code will emit a tuple in the lowest-numbered scan in which it's valid.
+ * This could be fixed at need by making the evaluate-the-quals case more
+ * complex. Currently the planner will never create such a plan (since it
+ * considers multi-index plans unordered anyway), so there's no need for
+ * more complexity.
+ */
+typedef struct
+{
+    /* tid is the hash key and so must be first! */
+    ItemPointerData tid;        /* TID of a tuple we've returned */
+    int             scannum;    /* number of scan we returned it in */
+} DupHashTabEntry;
+
+
 static TupleTableSlot *IndexNext(IndexScanState *node);
+static void create_duphash(IndexScanState *node);
+
 
 /* ----------------------------------------------------------------
  *    IndexNext
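
(Editorial aside, not part of the patch: the scannum rule described in the new comment block above amounts to the check below, shown with hypothetical names. A previously seen tuple is emitted again only when it is revisited by the same scan that first returned it, i.e. when we are backing up.)

#include <stdbool.h>

/* Hypothetical helper mirroring the rule in the comment block above. */
static bool
should_emit_again(int first_scannum, int current_scannum)
{
    return first_scannum == current_scannum;
}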
@@ -163,7 +195,7 @@ IndexNext(IndexScanState *node)
         while ((tuple = index_getnext(scandesc, direction)) != NULL)
         {
             /*
-             * store the scanned tuple in the scan tuple slot of the scan
+             * Store the scanned tuple in the scan tuple slot of the scan
             * state. Note: we pass 'false' because tuples returned by
             * amgetnext are pointers onto disk pages and must not be
             * pfree()'d.
@@ -174,36 +206,80 @@ IndexNext(IndexScanState *node)
                            false);    /* don't pfree */
 
             /*
-             * We must check to see if the current tuple was already
-             * matched by an earlier index, so we don't double-report it.
-             * We do this by passing the tuple through ExecQual and
-             * checking for failure with all previous qualifications.
+             * If it's a multiple-index scan, make sure not to double-report
+             * a tuple matched by more than one index. (See notes above.)
             */
-            if (node->iss_IndexPtr > 0)
+            if (numIndices > 1)
             {
-                bool        prev_matches = false;
-                int         prev_index;
-                List       *qual;
-
-                econtext->ecxt_scantuple = slot;
-                ResetExprContext(econtext);
-                qual = node->indxqualorig;
-                for (prev_index = 0;
-                     prev_index < node->iss_IndexPtr;
-                     prev_index++)
+                /* First try the hash table */
+                if (node->iss_DupHash)
                 {
-                    if (ExecQual((List *) lfirst(qual), econtext, false))
+                    DupHashTabEntry *entry;
+                    bool        found;
+
+                    entry = (DupHashTabEntry *)
+                        hash_search(node->iss_DupHash,
+                                    &tuple->t_data->t_ctid,
+                                    HASH_ENTER,
+                                    &found);
+                    if (entry == NULL ||
+                        node->iss_DupHash->hctl->nentries > node->iss_MaxHash)
+                    {
+                        /* out of memory (either hard or soft limit) */
+                        /* release hash table and fall thru to old code */
+                        hash_destroy(node->iss_DupHash);
+                        node->iss_DupHash = NULL;
+                    }
+                    else if (found)
                     {
-                        prev_matches = true;
-                        break;
+                        /* pre-existing entry */
+
+                        /*
+                         * It's duplicate if first emitted in a different
+                         * scan. If same scan, we must be backing up, so
+                         * okay to emit again.
+                         */
+                        if (entry->scannum != node->iss_IndexPtr)
+                        {
+                            /* Dup, so drop it and loop back for another */
+                            ExecClearTuple(slot);
+                            continue;
+                        }
+                    }
+                    else
+                    {
+                        /* new entry, finish filling it in */
+                        entry->scannum = node->iss_IndexPtr;
                     }
-                    qual = lnext(qual);
                 }
-                if (prev_matches)
+                /* If hash table has overflowed, do it the hard way */
+                if (node->iss_DupHash == NULL &&
+                    node->iss_IndexPtr > 0)
                 {
-                    /* Duplicate, so drop it and loop back for another */
-                    ExecClearTuple(slot);
-                    continue;
+                    bool        prev_matches = false;
+                    int         prev_index;
+                    List       *qual;
+
+                    econtext->ecxt_scantuple = slot;
+                    ResetExprContext(econtext);
+                    qual = node->indxqualorig;
+                    for (prev_index = 0;
+                         prev_index < node->iss_IndexPtr;
+                         prev_index++)
+                    {
+                        if (ExecQual((List *) lfirst(qual), econtext, false))
+                        {
+                            prev_matches = true;
+                            break;
+                        }
+                        qual = lnext(qual);
+                    }
+                    if (prev_matches)
+                    {
+                        /* Dup, so drop it and loop back for another */
+                        ExecClearTuple(slot);
+                        continue;
+                    }
                 }
             }
 
@@ -383,6 +459,14 @@ ExecIndexReScan(IndexScanState *node, ExprContext *exprCtxt)
         return;
     }
 
+    /* reset hash table */
+    if (numIndices > 1)
+    {
+        if (node->iss_DupHash)
+            hash_destroy(node->iss_DupHash);
+        create_duphash(node);
+    }
+
     /* reset index scans */
     if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indxorderdir))
         node->iss_IndexPtr = numIndices;
@@ -432,6 +516,10 @@ ExecEndIndexScan(IndexScanState *node)
     ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
     ExecClearTuple(node->ss.ss_ScanTupleSlot);
 
+    /* drop hash table */
+    if (node->iss_DupHash)
+        hash_destroy(node->iss_DupHash);
+
     /*
     * close the index relations
     */
@@ -507,7 +595,7 @@ ExecIndexRestrPos(IndexScanState *node)
 
 /* ----------------------------------------------------------------
  *    ExecInitIndexScan
- *    
+ *
  *    Initializes the index scan's state information, creates
  *    scan keys, and opens the base and index relations.
  *
@@ -919,12 +1007,42 @@ ExecInitIndexScan(IndexScan *node, EState *estate)
     ExecAssignResultTypeFromTL(&indexstate->ss.ps);
     ExecAssignScanProjectionInfo(&indexstate->ss);
 
+    /*
+     * Initialize hash table if needed.
+     */
+    if (numIndices > 1)
+        create_duphash(indexstate);
+    else
+        indexstate->iss_DupHash = NULL;
+
     /*
     * all done.
     */
     return indexstate;
 }
 
+static void
+create_duphash(IndexScanState *node)
+{
+    HASHCTL     hash_ctl;
+
+    MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+    hash_ctl.keysize = SizeOfIptrData;
+    hash_ctl.entrysize = sizeof(DupHashTabEntry);
+    hash_ctl.hash = tag_hash;
+    hash_ctl.hcxt = CurrentMemoryContext;
+    node->iss_DupHash = hash_create("DupHashTable",
+                                    (long) ceil(node->ss.ps.plan->plan_rows),
+                                    &hash_ctl,
+                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+    if (node->iss_DupHash == NULL)
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of memory")));
+    node->iss_MaxHash = (SortMem * 1024L) /
+        (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(DupHashTabEntry)));
+}
+
 int
 ExecCountSlotsIndexScan(IndexScan *node)
 {
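
To get a feel for the soft limit computed at the end of create_duphash(): SortMem (kilobytes) is converted to bytes and divided by the per-entry cost, i.e. the MAXALIGNed sizes of a HASHELEMENT plus a DupHashTabEntry. The arithmetic, with purely illustrative sizes (the real values are platform-dependent), looks like this:

#include <stdio.h>

int
main(void)
{
    /* Illustrative numbers only; actual sizes depend on the platform. */
    long sort_mem_kb = 1024;    /* SortMem GUC value, in kilobytes */
    long elem_overhead = 16;    /* assumed MAXALIGN(sizeof(HASHELEMENT)) */
    long entry_size = 16;       /* assumed MAXALIGN(sizeof(DupHashTabEntry)) */
    long max_entries;

    max_entries = (sort_mem_kb * 1024L) / (elem_overhead + entry_size);
    printf("duplicate hash table capped at %ld entries\n", max_entries);    /* 32768 */
    return 0;
}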

src/include/nodes/execnodes.h (+5 -1)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $Id: execnodes.h,v 1.104 2003/08/19 01:13:41 tgl Exp $
+ * $Id: execnodes.h,v 1.105 2003/08/22 20:26:43 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -768,6 +768,8 @@ typedef ScanState SeqScanState;
  *     RuntimeKeysReady  true if runtime Skeys have been computed
  *     RelationDescs     ptr to array of relation descriptors
  *     ScanDescs         ptr to array of scan descriptors
+ *     DupHash           hashtable for recognizing dups in multiple scan
+ *     MaxHash           max # entries we will allow in hashtable
  * ----------------
  */
 typedef struct IndexScanState
@@ -785,6 +787,8 @@ typedef struct IndexScanState
     bool        iss_RuntimeKeysReady;
     RelationPtr iss_RelationDescs;
     IndexScanDescPtr iss_ScanDescs;
+    HTAB       *iss_DupHash;
+    long        iss_MaxHash;
 } IndexScanState;
 
 /* ----------------
