diff options
author | Robert Haas | 2015-01-19 20:20:31 +0000 |
---|---|---|
committer | Robert Haas | 2015-01-19 20:28:27 +0000 |
commit | 4ea51cdfe85ceef8afabceb03c446574daa0ac23 (patch) | |
tree | 703433dfca58c84f8b947b6f4f562ecdf98f1669 /src/backend | |
parent | 1605291b6c14be92915948d17f5509191632c97f (diff) |
Use abbreviated keys for faster sorting of text datums.
This commit extends the SortSupport infrastructure to allow operator
classes the option to provide abbreviated representations of Datums;
in the case of text, we abbreviate by taking the first few characters
of the strxfrm() blob. If the abbreviated comparison is insufficent
to resolve the comparison, we fall back on the normal comparator.
This can be much faster than the old way of doing sorting if the
first few bytes of the string are usually sufficient to resolve the
comparison.
There is the potential for a performance regression if all of the
strings to be sorted are identical for the first 8+ characters and
differ only in later positions; therefore, the SortSupport machinery
now provides an infrastructure to abort the use of abbreviation if
it appears that abbreviation is producing comparatively few distinct
keys. HyperLogLog, a streaming cardinality estimator, is included in
this commit and used to make that determination for text.
Peter Geoghegan, reviewed by me.
Diffstat (limited to 'src/backend')
-rw-r--r-- | src/backend/access/nbtree/nbtsort.c | 2 | ||||
-rw-r--r-- | src/backend/commands/analyze.c | 6 | ||||
-rw-r--r-- | src/backend/executor/nodeAgg.c | 4 | ||||
-rw-r--r-- | src/backend/executor/nodeMergeAppend.c | 9 | ||||
-rw-r--r-- | src/backend/executor/nodeMergejoin.c | 8 | ||||
-rw-r--r-- | src/backend/lib/Makefile | 2 | ||||
-rw-r--r-- | src/backend/lib/hyperloglog.c | 228 | ||||
-rw-r--r-- | src/backend/utils/adt/orderedsetaggs.c | 8 | ||||
-rw-r--r-- | src/backend/utils/adt/varlena.c | 330 | ||||
-rw-r--r-- | src/backend/utils/sort/tuplesort.c | 413 |
10 files changed, 943 insertions, 67 deletions
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index ce8b674e411..625f490af80 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -717,6 +717,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) sortKey->ssup_nulls_first = (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; sortKey->ssup_attno = scanKey->sk_attno; + /* Abbreviation is not supported here */ + sortKey->abbreviate = false; AssertState(sortKey->ssup_attno != 0); diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 5de2b39d103..d2856a379e7 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -2301,6 +2301,12 @@ compute_scalar_stats(VacAttrStatsP stats, /* We always use the default collation for statistics */ ssup.ssup_collation = DEFAULT_COLLATION_OID; ssup.ssup_nulls_first = false; + /* + * For now, don't perform abbreviated key conversion, because full values + * are required for MCV slot generation. Supporting that optimization + * would necessitate teaching compare_scalars() to call a tie-breaker. + */ + ssup.abbreviate = false; PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup); diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 08088ea3c0c..8079d977643 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -363,6 +363,10 @@ initialize_aggregates(AggState *aggstate, * We use a plain Datum sorter when there's a single input column; * otherwise sort the full tuple. (See comments for * process_ordered_aggregate_single.) + * + * In the future, we should consider forcing the + * tuplesort_begin_heap() case when the abbreviated key + * optimization can thereby be used, even when numInputs is 1. */ peraggstate->sortstate = (peraggstate->numInputs == 1) ? diff --git a/src/backend/executor/nodeMergeAppend.c b/src/backend/executor/nodeMergeAppend.c index 4e200a8e346..0c814f0e72d 100644 --- a/src/backend/executor/nodeMergeAppend.c +++ b/src/backend/executor/nodeMergeAppend.c @@ -137,6 +137,15 @@ ExecInitMergeAppend(MergeAppend *node, EState *estate, int eflags) sortKey->ssup_nulls_first = node->nullsFirst[i]; sortKey->ssup_attno = node->sortColIdx[i]; + /* + * It isn't feasible to perform abbreviated key conversion, since + * tuples are pulled into mergestate's binary heap as needed. It would + * likely be counter-productive to convert tuples into an abbreviated + * representation as they're pulled up, so opt out of that additional + * optimization entirely. + */ + sortKey->abbreviate = false; + PrepareSortSupportFromOrderingOp(node->sortOperators[i], sortKey); } diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c index 2a5a7acf945..15742c574ad 100644 --- a/src/backend/executor/nodeMergejoin.c +++ b/src/backend/executor/nodeMergejoin.c @@ -229,6 +229,14 @@ MJExamineQuals(List *mergeclauses, elog(ERROR, "cannot merge using non-equality operator %u", qual->opno); + /* + * sortsupport routine must know if abbreviation optimization is + * applicable in principle. It is never applicable for merge joins + * because there is no convenient opportunity to convert to alternative + * representation. + */ + clause->ssup.abbreviate = false; + /* And get the matching support or comparison function */ Assert(clause->ssup.comparator == NULL); sortfunc = get_opfamily_proc(opfamily, diff --git a/src/backend/lib/Makefile b/src/backend/lib/Makefile index 949ee5e41d0..fe4781a8e88 100644 --- a/src/backend/lib/Makefile +++ b/src/backend/lib/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/lib top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS = ilist.o binaryheap.o pairingheap.o rbtree.o stringinfo.o +OBJS = ilist.o binaryheap.o hyperloglog.o pairingheap.o rbtree.o stringinfo.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/lib/hyperloglog.c b/src/backend/lib/hyperloglog.c new file mode 100644 index 00000000000..1157e9ad763 --- /dev/null +++ b/src/backend/lib/hyperloglog.c @@ -0,0 +1,228 @@ +/*------------------------------------------------------------------------- + * + * hyperloglog.c + * HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014, PostgreSQL Global Development Group + * + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "postgres.h" + +#include <math.h> + +#include "lib/hyperloglog.h" + +#define POW_2_32 (4294967296.0) +#define NEG_POW_2_32 (-4294967296.0) + +static inline uint8 rho(uint32 x, uint8 b); + +/* + * Initialize HyperLogLog track state + * + * bwidth is bit width (so register size will be 2 to the power of bwidth). + * Must be between 4 and 16 inclusive. + */ +void +initHyperLogLog(hyperLogLogState *cState, uint8 bwidth) +{ + double alpha; + + if (bwidth < 4 || bwidth > 16) + elog(ERROR, "bit width must be between 4 and 16 inclusive"); + + cState->registerWidth = bwidth; + cState->nRegisters = 1 << bwidth; + cState->arrSize = sizeof(uint8) * cState->nRegisters + 1; + + /* + * Initialize hashes array to zero, not negative infinity, per discussion + * of the coupon collector problem in the HyperLogLog paper + */ + cState->hashesArr = palloc0(cState->arrSize); + + /* + * "alpha" is a value that for each possible number of registers (m) is + * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z + * is "the indicator function" through which we finally compute E, + * estimated cardinality). + */ + switch (cState->nRegisters) + { + case 16: + alpha = 0.673; + break; + case 32: + alpha = 0.697; + break; + case 64: + alpha = 0.709; + break; + default: + alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters); + } + + /* + * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog + * estimate E + */ + cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters; +} + +/* + * Adds element to the estimator, from caller-supplied hash. + * + * It is critical that the hash value passed be an actual hash value, typically + * generated using hash_any(). The algorithm relies on a specific bit-pattern + * observable in conjunction with stochastic averaging. There must be a + * uniform distribution of bits in hash values for each distinct original value + * observed. + */ +void +addHyperLogLog(hyperLogLogState *cState, uint32 hash) +{ + uint8 count; + uint32 index; + + /* Use the first "k" (registerWidth) bits as a zero based index */ + index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth); + + /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ + count = rho(hash << cState->registerWidth, + BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth); + + cState->hashesArr[index] = Max(count, cState->hashesArr[index]); +} + +/* + * Estimates cardinality, based on elements added so far + */ +double +estimateHyperLogLog(hyperLogLogState *cState) +{ + double result; + double sum = 0.0; + int i; + + for (i = 0; i < cState->nRegisters; i++) + { + sum += 1.0 / pow(2.0, cState->hashesArr[i]); + } + + /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ + result = cState->alphaMM / sum; + + if (result <= (5.0 / 2.0) * cState->nRegisters) + { + /* Small range correction */ + int zero_count = 0; + + for (i = 0; i < cState->nRegisters; i++) + { + if (cState->hashesArr[i] == 0) + zero_count++; + } + + if (zero_count != 0) + result = cState->nRegisters * log((double) cState->nRegisters / + zero_count); + } + else if (result > (1.0 / 30.0) * POW_2_32) + { + /* Large range correction */ + result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); + } + + return result; +} + +/* + * Merges the estimate from one HyperLogLog state to another, returning the + * estimate of their union. + * + * The number of registers in each must match. + */ +void +mergeHyperLogLog(hyperLogLogState *cState, const hyperLogLogState *oState) +{ + int r; + + if (cState->nRegisters != oState->nRegisters) + elog(ERROR, "number of registers mismatch: %zu != %zu", + cState->nRegisters, oState->nRegisters); + + for (r = 0; r < cState->nRegisters; ++r) + { + cState->hashesArr[r] = Max(cState->hashesArr[r], oState->hashesArr[r]); + } +} + + +/* + * Worker for addHyperLogLog(). + * + * Calculates the position of the first set bit in first b bits of x argument + * starting from the first, reading from most significant to least significant + * bits. + * + * Example (when considering fist 10 bits of x): + * + * rho(x = 0b1000000000) returns 1 + * rho(x = 0b0010000000) returns 3 + * rho(x = 0b0000000000) returns b + 1 + * + * "The binary address determined by the first b bits of x" + * + * Return value "j" used to index bit pattern to watch. + */ +static inline uint8 +rho(uint32 x, uint8 b) +{ + uint8 j = 1; + + while (j <= b && !(x & 0x80000000)) + { + j++; + x <<= 1; + } + + return j; +} diff --git a/src/backend/utils/adt/orderedsetaggs.c b/src/backend/utils/adt/orderedsetaggs.c index 869a83b185a..f9a5f7f93fa 100644 --- a/src/backend/utils/adt/orderedsetaggs.c +++ b/src/backend/utils/adt/orderedsetaggs.c @@ -266,7 +266,13 @@ ordered_set_startup(FunctionCallInfo fcinfo, bool use_tuples) osastate->qstate = qstate; osastate->gcontext = gcontext; - /* Initialize tuplesort object */ + /* + * Initialize tuplesort object. + * + * In the future, we should consider forcing the tuplesort_begin_heap() + * case when the abbreviated key optimization can thereby be used, even + * when !use_tuples. + */ if (use_tuples) osastate->sortstate = tuplesort_begin_heap(qstate->tupdesc, qstate->numSortCols, diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index e95ed88366f..71d47380ac6 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -17,9 +17,11 @@ #include <ctype.h> #include <limits.h> +#include "access/hash.h" #include "access/tuptoaster.h" #include "catalog/pg_collation.h" #include "catalog/pg_type.h" +#include "lib/hyperloglog.h" #include "libpq/md5.h" #include "libpq/pqformat.h" #include "miscadmin.h" @@ -32,6 +34,9 @@ #include "utils/pg_locale.h" #include "utils/sortsupport.h" +#ifdef DEBUG_ABBREV_KEYS +#define DEBUG_elog_output DEBUG1 +#endif /* GUC variable */ int bytea_output = BYTEA_OUTPUT_HEX; @@ -54,10 +59,12 @@ typedef struct typedef struct { - char *buf1; /* 1st string */ - char *buf2; /* 2nd string */ + char *buf1; /* 1st string, or abbreviation original string buf */ + char *buf2; /* 2nd string, or abbreviation strxfrm() buf */ int buflen1; int buflen2; + hyperLogLogState abbr_card; /* Abbreviated key cardinality state */ + hyperLogLogState full_card; /* Full key cardinality state */ #ifdef HAVE_LOCALE_T pg_locale_t locale; #endif @@ -78,6 +85,9 @@ typedef struct static void btsortsupport_worker(SortSupport ssup, Oid collid); static int bttextfastcmp_c(Datum x, Datum y, SortSupport ssup); static int bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup); +static int bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup); +static Datum bttext_abbrev_convert(Datum original, SortSupport ssup); +static bool bttext_abbrev_abort(int memtupcount, SortSupport ssup); static int32 text_length(Datum str); static text *text_catenate(text *t1, text *t2); static text *text_substring(Datum str, @@ -1736,26 +1746,53 @@ btsortsupport_worker(SortSupport ssup, Oid collid) TextSortSupport *tss; /* - * If LC_COLLATE = C, we can make things quite a bit faster by using - * memcmp() rather than strcoll(). To minimize the per-comparison - * overhead, we make this decision just once for the whole sort. - */ - if (lc_collate_is_c(collid)) - { - ssup->comparator = bttextfastcmp_c; - return; - } - - /* * WIN32 requires complex hacks when the database encoding is UTF-8 (except * when using the "C" collation). For now, we don't optimize that case. */ #ifdef WIN32 - if (GetDatabaseEncoding() == PG_UTF8) + if (GetDatabaseEncoding() == PG_UTF8 && !lc_collate_is_c(collid)) return; #endif /* + * On platforms where the abbreviated key for text optimization might have + * bad worst case performance, it may be useful to avoid it entirely by + * disabling it at compile time. Having only 4 byte datums could make + * worst-case performance drastically more likely, for example. Moreover, + * Darwin's strxfrm() implementations is known to not effectively + * concentrate a significant amount of entropy from the original string in + * earlier transformed blobs. It's possible that other supported platforms + * are similarly encumbered. + * + * Any reasonable implementation will pack primary weights into the start + * of returned blobs. The canonical algorithm's implementation is + * discussed by Unicode Technical Standard #10 ("UNICODE COLLATION + * ALGORITHM"), section 4, "Main algorithm". Section 4.3, "Form Sort Key" + * is of particular interest: + * + * http://www.unicode.org/reports/tr10/#Step_3 + * + * The collation algorithm standard goes on to state: + * + * "By default, the algorithm makes use of three fully-customizable levels. + * For the Latin script, these levels correspond roughly to: + * + * alphabetic ordering + * + * diacritic ordering + * + * case ordering. + * + * A final level may be used for tie-breaking between strings not otherwise + * distinguished." + * + * It is generally expected that most non-equal keys will have their + * comparisons resolved at the primary level. If enough comparisons can be + * resolved with just 4 or 8 byte abbreviated keys, this optimization is + * very effective (although if there are many tie-breakers that largely + * only perform cheap memcmp() calls, that is also much faster than the + * unoptimized case - see bttext_abbrev_abort()). + * * We may need a collation-sensitive comparison. To make things faster, * we'll figure out the collation based on the locale id and cache the * result. Also, since strxfrm()/strcoll() require NUL-terminated inputs, @@ -1788,13 +1825,47 @@ btsortsupport_worker(SortSupport ssup, Oid collid) #endif } - tss->buf1 = palloc(TEXTBUFLEN); - tss->buflen1 = TEXTBUFLEN; - tss->buf2 = palloc(TEXTBUFLEN); - tss->buflen2 = TEXTBUFLEN; + /* + * If LC_COLLATE = C, we can make things quite a bit faster by using + * memcmp() rather than strcoll(). To minimize the per-comparison + * overhead, we make this decision just once for the whole sort. + * + * There is no reason to not at least perform fmgr elision on builds where + * abbreviation is disabled. + */ + if (lc_collate_is_c(collid)) + ssup->abbrev_full_comparator = ssup->comparator = bttextfastcmp_c; + else + ssup->abbrev_full_comparator = ssup->comparator = bttextfastcmp_locale; + + if (!lc_collate_is_c(collid) || ssup->abbreviate) + { + /* + * Abbreviated case requires temp buffers for strxfrm() copying. + * bttextfastcmp_locale() also uses these buffers (even if abbreviation + * isn't used), while bttextfast_c() does not. + */ + tss->buf1 = palloc(TEXTBUFLEN); + tss->buflen1 = TEXTBUFLEN; + tss->buf2 = palloc(TEXTBUFLEN); + tss->buflen2 = TEXTBUFLEN; + ssup->ssup_extra = tss; + } + + if (!ssup->abbreviate) + return; - ssup->ssup_extra = tss; - ssup->comparator = bttextfastcmp_locale; + initHyperLogLog(&tss->abbr_card, 10); + initHyperLogLog(&tss->full_card, 10); + + /* + * Change comparator to be abbreviation-based -- abbreviated version will + * probably ultimately be used during sorting proper, but core code may + * switch back to authoritative comparator should abbreviation be aborted + */ + ssup->comparator = bttextcmp_abbrev; + ssup->abbrev_converter = bttext_abbrev_convert; + ssup->abbrev_abort = bttext_abbrev_abort; } /* @@ -1903,6 +1974,225 @@ done: return result; } +/* + * Abbreviated key comparison func + */ +static int +bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup) +{ + char *a = (char *) &x; + char *b = (char *) &y; + int result; + + result = memcmp(a, b, sizeof(Datum)); + + /* + * When result = 0, the core system will call bttextfastcmp_c() or + * bttextfastcmp_locale(). Even a strcmp() on two non-truncated strxfrm() + * blobs cannot indicate *equality* authoritatively, for the same reason + * that there is a strcoll() tie-breaker call to strcmp() in varstr_cmp(). + */ + return result; +} + +/* + * Conversion routine for sortsupport. Converts original text to abbreviated + * key representation. Our encoding strategy is simple -- pack the first 8 + * bytes of a strxfrm() blob into a Datum. + */ +static Datum +bttext_abbrev_convert(Datum original, SortSupport ssup) +{ + TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra; + text *authoritative = DatumGetTextPP(original); + + /* working state */ + Datum res; + char *pres; + int len; + Size bsize; + uint32 hash; + + /* + * Abbreviated key representation is a pass-by-value Datum that is treated + * as a char array by the specialized comparator bttextcmp_abbrev(). + */ + pres = (char *) &res; + /* memset(), so any non-overwritten bytes are NUL */ + memset(pres, 0, sizeof(Datum)); + len = VARSIZE_ANY_EXHDR(authoritative); + + /* By convention, we use buffer 1 to store and NUL-terminate text */ + if (len >= tss->buflen1) + { + pfree(tss->buf1); + tss->buflen1 = Max(len + 1, Min(tss->buflen1 * 2, MaxAllocSize)); + tss->buf1 = palloc(tss->buflen1); + } + + /* Just like strcoll(), strxfrm() expects a NUL-terminated string */ + memcpy(tss->buf1, VARDATA_ANY(authoritative), len); + tss->buf1[len] = '\0'; + + /* Don't leak memory here */ + if (PointerGetDatum(authoritative) != original) + pfree(authoritative); + +retry: + + /* + * There is no special handling of the C locale here, unlike with + * varstr_cmp(). strxfrm() is used indifferently. + */ +#ifdef HAVE_LOCALE_T + if (tss->locale) + bsize = strxfrm_l(tss->buf2, tss->buf1, tss->buflen2, tss->locale); + else +#endif + bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2); + + if (bsize >= tss->buflen2) + { + /* + * The C standard states that the contents of the buffer is now + * unspecified. Grow buffer, and retry. + */ + pfree(tss->buf2); + tss->buflen2 = Max(bsize + 1, Min(tss->buflen2 * 2, MaxAllocSize)); + tss->buf2 = palloc(tss->buflen2); + goto retry; + } + + /* + * Maintain approximate cardinality of both abbreviated keys and original, + * authoritative keys using HyperLogLog. Used as cheap insurance against + * the worst case, where we do many string transformations for no saving in + * full strcoll()-based comparisons. These statistics are used by + * bttext_abbrev_abort(). + * + * First, Hash key proper, or a significant fraction of it. Mix in length + * in order to compensate for cases where differences are past + * CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing. + */ + hash = hash_any((unsigned char *) tss->buf1, Min(len, PG_CACHE_LINE_SIZE)); + + if (len > PG_CACHE_LINE_SIZE) + hash ^= DatumGetUInt32(hash_uint32((uint32) len)); + + addHyperLogLog(&tss->full_card, hash); + + memcpy(pres, tss->buf2, Min(sizeof(Datum), bsize)); + + /* Hash abbreviated key */ +#if SIZEOF_DATUM == 8 + { + uint32 lohalf, + hihalf; + + lohalf = (uint32) res; + hihalf = (uint32) (res >> 32); + hash = hash_uint32(lohalf ^ hihalf); + } +#else /* SIZEOF_DATUM != 8 */ + hash = hash_uint32((uint32) res); +#endif + + addHyperLogLog(&tss->abbr_card, hash); + + /* + * Every Datum byte is always compared. This is safe because the strxfrm() + * blob is itself NUL terminated, leaving no danger of misinterpreting any + * NUL bytes not intended to be interpreted as logically representing + * termination. + */ + return res; +} + +/* + * Callback for estimating effectiveness of abbreviated key optimization, using + * heuristic rules. Returns value indicating if the abbreviation optimization + * should be aborted, based on its projected effectiveness. + */ +static bool +bttext_abbrev_abort(int memtupcount, SortSupport ssup) +{ + TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra; + double abbrev_distinct, key_distinct; + + Assert(ssup->abbreviate); + + /* Have a little patience */ + if (memtupcount < 20) + return false; + + abbrev_distinct = estimateHyperLogLog(&tss->abbr_card); + key_distinct = estimateHyperLogLog(&tss->full_card); + + /* + * Clamp cardinality estimates to at least one distinct value. While NULLs + * are generally disregarded, if only NULL values were seen so far, that + * might misrepresent costs if we failed to clamp. + */ + if (abbrev_distinct <= 1.0) + abbrev_distinct = 1.0; + + if (key_distinct <= 1.0) + key_distinct = 1.0; + + /* + * In the worst case all abbreviated keys are identical, while at the same + * time there are differences within full key strings not captured in + * abbreviations. + */ +#ifdef DEBUG_ABBREV_KEYS + { + double norm_abbrev_card = abbrev_distinct / (double) memtupcount; + + elog(DEBUG_elog_output, "abbrev_distinct after %d: %f (key_distinct: %f, norm_abbrev_card: %f)", + memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card); + } +#endif + + /* + * If the number of distinct abbreviated keys approximately matches the + * number of distinct authoritative original keys, that's reason enough to + * proceed. We can win even with a very low cardinality set if most + * tie-breakers only memcmp(). This is by far the most important + * consideration. + * + * While comparisons that are resolved at the abbreviated key level are + * considerably cheaper than tie-breakers resolved with memcmp(), both of + * those two outcomes are so much cheaper than a full strcoll() once + * sorting is underway that it doesn't seem worth it to weigh abbreviated + * cardinality against the overall size of the set in order to more + * accurately model costs. Assume that an abbreviated comparison, and an + * abbreviated comparison with a cheap memcmp()-based authoritative + * resolution are equivalent. + */ + if (abbrev_distinct > key_distinct * 0.05) + return false; + + /* + * Abort abbreviation strategy. + * + * The worst case, where all abbreviated keys are identical while all + * original strings differ will typically only see a regression of about + * 10% in execution time for small to medium sized lists of strings. + * Whereas on modern CPUs where cache stalls are the dominant cost, we can + * often expect very large improvements, particularly with sets of strings + * of moderately high to high abbreviated cardinality. There is little to + * lose but much to gain, which our strategy reflects. + */ +#ifdef DEBUG_ABBREV_KEYS + elog(DEBUG_elog_output, "would have aborted abbreviation due to worst-case at %d. abbrev_distinct: %f, key_distinct: %f", + memtupcount, abbrev_distinct, key_distinct); + /* Actually abort only when debugging is disabled */ + return false; +#endif + + return true; +} + Datum text_larger(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 4aa39728416..6d3aa889bc5 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -150,7 +150,10 @@ bool optimize_bounded_sort = true; * When sorting single Datums, the data value is represented directly by * datum1/isnull1. If the datatype is pass-by-reference and isnull1 is false, * then datum1 points to a separately palloc'd data value that is also pointed - * to by the "tuple" pointer; otherwise "tuple" is NULL. + * to by the "tuple" pointer; otherwise "tuple" is NULL. There is one special + * case: when the sort support infrastructure provides an "abbreviated key" + * representation, where the key is (typically) a pass by value proxy for a + * pass by reference type. * * While building initial runs, tupindex holds the tuple's run number. During * merge passes, we re-use it to hold the input tape number that each tuple in @@ -347,6 +350,14 @@ struct Tuplesortstate SortSupport onlyKey; /* + * Additional state for managing "abbreviated key" sortsupport routines + * (which currently may be used by all cases except the Datum sort case and + * hash index case). Tracks the intervals at which the optimization's + * effectiveness is tested. + */ + int64 abbrevNext; /* Tuple # at which to next check applicability */ + + /* * These variables are specific to the CLUSTER case; they are set by * tuplesort_begin_cluster. */ @@ -442,6 +453,7 @@ struct Tuplesortstate static Tuplesortstate *tuplesort_begin_common(int workMem, bool randomAccess); static void puttuple_common(Tuplesortstate *state, SortTuple *tuple); +static bool consider_abort_common(Tuplesortstate *state); static void inittapes(Tuplesortstate *state); static void selectnewtape(Tuplesortstate *state); static void mergeruns(Tuplesortstate *state); @@ -619,6 +631,7 @@ tuplesort_begin_heap(TupleDesc tupDesc, state->readtup = readtup_heap; state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + state->abbrevNext = 10; /* Prepare SortSupport data for each column */ state->sortKeys = (SortSupport) palloc0(nkeys * sizeof(SortSupportData)); @@ -634,11 +647,19 @@ tuplesort_begin_heap(TupleDesc tupDesc, sortKey->ssup_collation = sortCollations[i]; sortKey->ssup_nulls_first = nullsFirstFlags[i]; sortKey->ssup_attno = attNums[i]; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey); } - if (nkeys == 1) + /* + * The "onlyKey" optimization cannot be used with abbreviated keys, since + * tie-breaker comparisons may be required. Typically, the optimization is + * only of value to pass-by-value types anyway, whereas abbreviated keys + * are typically only of value to pass-by-reference types. + */ + if (nkeys == 1 && !state->sortKeys->abbrev_converter) state->onlyKey = state->sortKeys; MemoryContextSwitchTo(oldcontext); @@ -680,6 +701,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, state->copytup = copytup_cluster; state->writetup = writetup_cluster; state->readtup = readtup_cluster; + state->abbrevNext = 10; state->indexInfo = BuildIndexInfo(indexRel); @@ -719,6 +741,8 @@ tuplesort_begin_cluster(TupleDesc tupDesc, sortKey->ssup_nulls_first = (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); AssertState(sortKey->ssup_attno != 0); @@ -768,6 +792,7 @@ tuplesort_begin_index_btree(Relation heapRel, state->copytup = copytup_index; state->writetup = writetup_index; state->readtup = readtup_index; + state->abbrevNext = 10; state->heapRel = heapRel; state->indexRel = indexRel; @@ -791,6 +816,8 @@ tuplesort_begin_index_btree(Relation heapRel, sortKey->ssup_nulls_first = (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; sortKey->ssup_attno = scanKey->sk_attno; + /* Convey if abbreviation optimization is applicable in principle */ + sortKey->abbreviate = (i == 0); AssertState(sortKey->ssup_attno != 0); @@ -883,6 +910,13 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation, state->onlyKey->ssup_cxt = CurrentMemoryContext; state->onlyKey->ssup_collation = sortCollation; state->onlyKey->ssup_nulls_first = nullsFirstFlag; + /* + * Conversion to abbreviated representation infeasible in the Datum case. + * It must be possible to subsequently fetch original datum values within + * tuplesort_getdatum(), which would require special-case preservation of + * original values. + */ + state->onlyKey->abbreviate = false; PrepareSortSupportFromOrderingOp(sortOperator, state->onlyKey); @@ -928,6 +962,19 @@ tuplesort_set_bound(Tuplesortstate *state, int64 bound) state->bounded = true; state->bound = (int) bound; + + /* + * Bounded sorts are not an effective target for abbreviated key + * optimization. Disable by setting state to be consistent with no + * abbreviation support. + */ + state->sortKeys->abbrev_converter = NULL; + if (state->sortKeys->abbrev_full_comparator) + state->sortKeys->comparator = state->sortKeys->abbrev_full_comparator; + + /* Not strictly necessary, but be tidy */ + state->sortKeys->abbrev_abort = NULL; + state->sortKeys->abbrev_full_comparator = NULL; } /* @@ -1186,15 +1233,63 @@ tuplesort_putindextuplevalues(Tuplesortstate *state, Relation rel, { MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); SortTuple stup; + Datum original; + IndexTuple tuple; stup.tuple = index_form_tuple(RelationGetDescr(rel), values, isnull); - ((IndexTuple) stup.tuple)->t_tid = *self; + tuple = ((IndexTuple) stup.tuple); + tuple->t_tid = *self; USEMEM(state, GetMemoryChunkSpace(stup.tuple)); /* set up first-column key value */ - stup.datum1 = index_getattr((IndexTuple) stup.tuple, - 1, - RelationGetDescr(state->indexRel), - &stup.isnull1); + original = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + + if (!state->sortKeys->abbrev_converter || stup.isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to "void" representation (to be + * consistent). + */ + stup.datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup.datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup.datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just handled). + * Note that we rely on all tuples copied so far actually being + * contained within memtuples array. + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup.isnull1); + } + } + puttuple_common(state, &stup); MemoryContextSwitchTo(oldcontext); @@ -1359,6 +1454,47 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple) } } +static bool +consider_abort_common(Tuplesortstate *state) +{ + Assert(state->sortKeys[0].abbrev_converter != NULL); + Assert(state->sortKeys[0].abbrev_abort != NULL); + Assert(state->sortKeys[0].abbrev_full_comparator != NULL); + + /* + * Check effectiveness of abbreviation optimization. Consider aborting + * when still within memory limit. + */ + if (state->status == TSS_INITIAL && + state->memtupcount >= state->abbrevNext) + { + state->abbrevNext *= 2; + + /* + * Check opclass-supplied abbreviation abort routine. It may + * indicate that abbreviation should not proceed. + */ + if (!state->sortKeys->abbrev_abort(state->memtupcount, + state->sortKeys)) + return false; + + /* + * Finally, restore authoritative comparator, and indicate that + * abbreviation is not in play by setting abbrev_converter to NULL + */ + state->sortKeys[0].comparator = state->sortKeys[0].abbrev_full_comparator; + state->sortKeys[0].abbrev_converter = NULL; + /* Not strictly necessary, but be tidy */ + state->sortKeys[0].abbrev_abort = NULL; + state->sortKeys[0].abbrev_full_comparator = NULL; + + /* Give up - expect original pass-by-value representation */ + return true; + } + + return false; +} + /* * All tuples have been provided; finish the sort. */ @@ -2853,6 +2989,12 @@ comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) TupleDesc tupDesc; int nkey; int32 compare; + AttrNumber attno; + Datum datum1, + datum2; + bool isnull1, + isnull2; + /* Compare the leading sort key */ compare = ApplySortComparator(a->datum1, a->isnull1, @@ -2867,14 +3009,25 @@ comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); tupDesc = state->tupDesc; + + if (sortKey->abbrev_converter) + { + attno = sortKey->ssup_attno; + + datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); + datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + sortKey++; for (nkey = 1; nkey < state->nKeys; nkey++, sortKey++) { - AttrNumber attno = sortKey->ssup_attno; - Datum datum1, - datum2; - bool isnull1, - isnull2; + attno = sortKey->ssup_attno; datum1 = heap_getattr(<up, attno, tupDesc, &isnull1); datum2 = heap_getattr(&rtup, attno, tupDesc, &isnull2); @@ -2897,6 +3050,7 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) * MinimalTuple using the exported interface for that. */ TupleTableSlot *slot = (TupleTableSlot *) tup; + Datum original; MinimalTuple tuple; HeapTupleData htup; @@ -2907,10 +3061,58 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) /* set up first-column key value */ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); - stup->datum1 = heap_getattr(&htup, - state->sortKeys[0].ssup_attno, - state->tupDesc, - &stup->isnull1); + original = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to "void" representation (to be + * consistent). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just handled). + * Note that we rely on all tuples copied so far actually being + * contained within memtuples array. + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + htup.t_len = ((MinimalTuple) mtup->tuple)->t_len + + MINIMAL_TUPLE_OFFSET; + htup.t_data = (HeapTupleHeader) ((char *) mtup->tuple - + MINIMAL_TUPLE_OFFSET); + + mtup->datum1 = heap_getattr(&htup, + state->sortKeys[0].ssup_attno, + state->tupDesc, + &mtup->isnull1); + } + } } static void @@ -2980,13 +3182,35 @@ comparetup_cluster(const SortTuple *a, const SortTuple *b, TupleDesc tupDesc; int nkey; int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + AttrNumber leading = state->indexInfo->ii_KeyAttrNumbers[0]; + + /* Be prepared to compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + tupDesc = state->tupDesc; /* Compare the leading sort key, if it's simple */ - if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) + if (leading != 0) { compare = ApplySortComparator(a->datum1, a->isnull1, b->datum1, b->isnull1, sortKey); + if (compare != 0) + return compare; + + if (sortKey->abbrev_converter) + { + datum1 = heap_getattr(ltup, leading, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, leading, tupDesc, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + } if (compare != 0 || state->nKeys == 1) return compare; /* Compare additional columns the hard way */ @@ -2999,22 +3223,13 @@ comparetup_cluster(const SortTuple *a, const SortTuple *b, nkey = 0; } - /* Compare additional sort keys */ - ltup = (HeapTuple) a->tuple; - rtup = (HeapTuple) b->tuple; - if (state->indexInfo->ii_Expressions == NULL) { /* If not expression index, just compare the proper heap attrs */ - tupDesc = state->tupDesc; for (; nkey < state->nKeys; nkey++, sortKey++) { AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[nkey]; - Datum datum1, - datum2; - bool isnull1, - isnull2; datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); @@ -3072,17 +3287,67 @@ static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) { HeapTuple tuple = (HeapTuple) tup; + Datum original; /* copy the tuple into sort storage */ tuple = heap_copytuple(tuple); stup->tuple = (void *) tuple; USEMEM(state, GetMemoryChunkSpace(tuple)); - /* set up first-column key value, if it's a simple column */ - if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) - stup->datum1 = heap_getattr(tuple, - state->indexInfo->ii_KeyAttrNumbers[0], - state->tupDesc, - &stup->isnull1); + /* + * set up first-column key value, and potentially abbreviate, if it's a + * simple column + */ + if (state->indexInfo->ii_KeyAttrNumbers[0] == 0) + return; + + original = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to "void" representation (to be + * consistent). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just handled). + * Note that we rely on all tuples copied so far actually being + * contained within memtuples array. + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (HeapTuple) mtup->tuple; + mtup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); + } + } } static void @@ -3162,6 +3427,11 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b, bool equal_hasnull = false; int nkey; int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + /* Compare the leading sort key */ compare = ApplySortComparator(a->datum1, a->isnull1, @@ -3170,23 +3440,31 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b, if (compare != 0) return compare; - /* they are equal, so we only need to examine one null flag */ - if (a->isnull1) - equal_hasnull = true; - /* Compare additional sort keys */ tuple1 = (IndexTuple) a->tuple; tuple2 = (IndexTuple) b->tuple; keysz = state->nKeys; tupDes = RelationGetDescr(state->indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + sortKey++; for (nkey = 2; nkey <= keysz; nkey++, sortKey++) { - Datum datum1, - datum2; - bool isnull1, - isnull2; - datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); @@ -3313,6 +3591,7 @@ copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) IndexTuple tuple = (IndexTuple) tup; unsigned int tuplen = IndexTupleSize(tuple); IndexTuple newtuple; + Datum original; /* copy the tuple into sort storage */ newtuple = (IndexTuple) palloc(tuplen); @@ -3320,10 +3599,54 @@ copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup) USEMEM(state, GetMemoryChunkSpace(newtuple)); stup->tuple = (void *) newtuple; /* set up first-column key value */ - stup->datum1 = index_getattr(newtuple, - 1, - RelationGetDescr(state->indexRel), - &stup->isnull1); + original = index_getattr(newtuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); + + if (!state->sortKeys->abbrev_converter || stup->isnull1) + { + /* + * Store ordinary Datum representation, or NULL value. If there is a + * converter it won't expect NULL values, and cost model is not + * required to account for NULL, so in that case we avoid calling + * converter and just set datum1 to "void" representation (to be + * consistent). + */ + stup->datum1 = original; + } + else if (!consider_abort_common(state)) + { + /* Store abbreviated key representation */ + stup->datum1 = state->sortKeys->abbrev_converter(original, + state->sortKeys); + } + else + { + /* Abort abbreviation */ + int i; + + stup->datum1 = original; + + /* + * Set state to be consistent with never trying abbreviation. + * + * Alter datum1 representation in already-copied tuples, so as to + * ensure a consistent representation (current tuple was just handled). + * Note that we rely on all tuples copied so far actually being + * contained within memtuples array. + */ + for (i = 0; i < state->memtupcount; i++) + { + SortTuple *mtup = &state->memtuples[i]; + + tuple = (IndexTuple) mtup->tuple; + mtup->datum1 = index_getattr(tuple, + 1, + RelationGetDescr(state->indexRel), + &stup->isnull1); + } + } } static void |