Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 244407a

Browse files
committed
Fix efficiency problems in tuplestore_trim().
The original coding in tuplestore_trim() was only meant to work efficiently in cases where each trim call deleted most of the tuples in the store. Which, in fact, was the pattern of the original usage with a Material node supporting mark/restore operations underneath a MergeJoin. However, WindowAgg now uses tuplestores and it has considerably less friendly trimming behavior. In particular it can attempt to trim one tuple at a time off a large tuplestore. tuplestore_trim() had O(N^2) runtime in this situation because of repeatedly shifting its tuple pointer array. Fix by avoiding shifting the array until a reasonably large number of tuples have been deleted. This can waste some pointer space, but we do still reclaim the tuples themselves, so the percentage wastage should be pretty small. Per Jie Li's report of slow percent_rank() evaluation. cume_dist() and ntile() would certainly be affected as well, along with any other window function that has a moving frame start and requires reading substantially ahead of the current row. Back-patch to 8.4, where window functions were introduced. There's no need to tweak it before that.
1 parent 663fc32 commit 244407a

File tree

1 file changed

+36
-16
lines changed

1 file changed

+36
-16
lines changed

src/backend/utils/sort/tuplestore.c

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -145,8 +145,15 @@ struct Tuplestorestate
145145
/*
146146
* This array holds pointers to tuples in memory if we are in state INMEM.
147147
* In states WRITEFILE and READFILE it's not used.
148+
*
149+
* When memtupdeleted > 0, the first memtupdeleted pointers are already
150+
* released due to a tuplestore_trim() operation, but we haven't expended
151+
* the effort to slide the remaining pointers down. These unused pointers
152+
* are set to NULL to catch any invalid accesses. Note that memtupcount
153+
* includes the deleted pointers.
148154
*/
149155
void **memtuples; /* array of pointers to palloc'd tuples */
156+
int memtupdeleted; /* the first N slots are currently unused */
150157
int memtupcount; /* number of tuples currently present */
151158
int memtupsize; /* allocated length of memtuples array */
152159

@@ -252,6 +259,7 @@ tuplestore_begin_common(int eflags, bool interXact, int maxKBytes)
252259
state->context = CurrentMemoryContext;
253260
state->resowner = CurrentResourceOwner;
254261

262+
state->memtupdeleted = 0;
255263
state->memtupcount = 0;
256264
state->memtupsize = 1024; /* initial guess */
257265
state->memtuples = (void **) palloc(state->memtupsize * sizeof(void *));
@@ -401,14 +409,15 @@ tuplestore_clear(Tuplestorestate *state)
401409
state->myfile = NULL;
402410
if (state->memtuples)
403411
{
404-
for (i = 0; i < state->memtupcount; i++)
412+
for (i = state->memtupdeleted; i < state->memtupcount; i++)
405413
{
406414
FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
407415
pfree(state->memtuples[i]);
408416
}
409417
}
410418
state->status = TSS_INMEM;
411419
state->truncated = false;
420+
state->memtupdeleted = 0;
412421
state->memtupcount = 0;
413422
readptr = state->readptrs;
414423
for (i = 0; i < state->readptrcount; readptr++, i++)
@@ -432,7 +441,7 @@ tuplestore_end(Tuplestorestate *state)
432441
BufFileClose(state->myfile);
433442
if (state->memtuples)
434443
{
435-
for (i = 0; i < state->memtupcount; i++)
444+
for (i = state->memtupdeleted; i < state->memtupcount; i++)
436445
pfree(state->memtuples[i]);
437446
pfree(state->memtuples);
438447
}
@@ -774,14 +783,14 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward,
774783
}
775784
else
776785
{
777-
if (readptr->current <= 0)
786+
if (readptr->current <= state->memtupdeleted)
778787
{
779788
Assert(!state->truncated);
780789
return NULL;
781790
}
782791
readptr->current--; /* last returned tuple */
783792
}
784-
if (readptr->current <= 0)
793+
if (readptr->current <= state->memtupdeleted)
785794
{
786795
Assert(!state->truncated);
787796
return NULL;
@@ -969,7 +978,7 @@ dumptuples(Tuplestorestate *state)
969978
{
970979
int i;
971980

972-
for (i = 0;; i++)
981+
for (i = state->memtupdeleted;; i++)
973982
{
974983
TSReadPointer *readptr = state->readptrs;
975984
int j;
@@ -984,6 +993,7 @@ dumptuples(Tuplestorestate *state)
984993
break;
985994
WRITETUP(state, state->memtuples[i]);
986995
}
996+
state->memtupdeleted = 0;
987997
state->memtupcount = 0;
988998
}
989999

@@ -1153,40 +1163,50 @@ tuplestore_trim(Tuplestorestate *state)
11531163
nremove = oldest - 1;
11541164
if (nremove <= 0)
11551165
return; /* nothing to do */
1166+
1167+
Assert(nremove >= state->memtupdeleted);
11561168
Assert(nremove <= state->memtupcount);
11571169

11581170
/* Release no-longer-needed tuples */
1159-
for (i = 0; i < nremove; i++)
1171+
for (i = state->memtupdeleted; i < nremove; i++)
11601172
{
11611173
FREEMEM(state, GetMemoryChunkSpace(state->memtuples[i]));
11621174
pfree(state->memtuples[i]);
1175+
state->memtuples[i] = NULL;
11631176
}
1177+
state->memtupdeleted = nremove;
1178+
1179+
/* mark tuplestore as truncated (used for Assert crosschecks only) */
1180+
state->truncated = true;
1181+
1182+
/*
1183+
* If nremove is less than 1/8th memtupcount, just stop here, leaving the
1184+
* "deleted" slots as NULL. This prevents us from expending O(N^2) time
1185+
* repeatedly memmove-ing a large pointer array. The worst case space
1186+
* wastage is pretty small, since it's just pointers and not whole tuples.
1187+
*/
1188+
if (nremove < state->memtupcount / 8)
1189+
return;
11641190

11651191
/*
1166-
* Slide the array down and readjust pointers. This may look pretty
1167-
* stupid, but we expect that there will usually not be very many
1168-
* tuple-pointers to move, so this isn't that expensive; and it keeps a
1169-
* lot of other logic simple.
1192+
* Slide the array down and readjust pointers.
11701193
*
1171-
* In fact, in the current usage for merge joins, it's demonstrable that
1172-
* there will always be exactly one non-removed tuple; so optimize that
1173-
* case.
1194+
* In mergejoin's current usage, it's demonstrable that there will always
1195+
* be exactly one non-removed tuple; so optimize that case.
11741196
*/
11751197
if (nremove + 1 == state->memtupcount)
11761198
state->memtuples[0] = state->memtuples[nremove];
11771199
else
11781200
memmove(state->memtuples, state->memtuples + nremove,
11791201
(state->memtupcount - nremove) * sizeof(void *));
11801202

1203+
state->memtupdeleted = 0;
11811204
state->memtupcount -= nremove;
11821205
for (i = 0; i < state->readptrcount; i++)
11831206
{
11841207
if (!state->readptrs[i].eof_reached)
11851208
state->readptrs[i].current -= nremove;
11861209
}
1187-
1188-
/* mark tuplestore as truncated (used for Assert crosschecks only) */
1189-
state->truncated = true;
11901210
}
11911211

11921212
/*

0 commit comments

Comments
 (0)