Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit c66e4f1

Browse files
committed
Improve GiST range-contained-by searches by adding a flag for empty ranges.
In the original implementation, a range-contained-by search had to scan the entire index because an empty range could be lurking anywhere. Improve that by adding a flag to upper GiST entries that says whether the represented subtree contains any empty ranges. Also, make a simple mod to the penalty function to discourage empty ranges from getting pushed into subtrees without any. This needs more work, and the picksplit function should be taught about it too, but that code can be improved without causing an on-disk compatibility break; so we'll leave it for another day. Since we're breaking on-disk compatibility of range values anyway, I took the opportunity to reorganize the range flags bits; the unused RANGE_xB_NULL bits are now adjacent, which might open the door for using them in some other way later. In passing, remove the GiST range opclass entry for <>, which doesn't seem like it can really be indexed usefully. Alexander Korotkov, with some editorializing by Tom
1 parent 08da2d2 commit c66e4f1

File tree

6 files changed

+118
-56
lines changed

6 files changed

+118
-56
lines changed

src/backend/utils/adt/rangetypes.c

+18
Original file line numberDiff line numberDiff line change
@@ -1622,6 +1622,24 @@ range_get_flags(RangeType *range)
16221622
return *((char *) range + VARSIZE(range) - 1);
16231623
}
16241624

1625+
/*
1626+
* range_set_contain_empty: set the RANGE_CONTAIN_EMPTY bit in the value.
1627+
*
1628+
* This is only needed in GiST operations, so we don't include a provision
1629+
* for setting it in range_serialize; rather, this function must be applied
1630+
* afterwards.
1631+
*/
1632+
void
1633+
range_set_contain_empty(RangeType *range)
1634+
{
1635+
char *flagsp;
1636+
1637+
/* flag byte is datum's last byte */
1638+
flagsp = (char *) range + VARSIZE(range) - 1;
1639+
1640+
*flagsp |= RANGE_CONTAIN_EMPTY;
1641+
}
1642+
16251643
/*
16261644
* This both serializes and canonicalizes (if applicable) the range.
16271645
* This should be used by most callers.

src/backend/utils/adt/rangetypes_gist.c

+85-44
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "access/gist.h"
1818
#include "access/skey.h"
1919
#include "utils/builtins.h"
20+
#include "utils/datum.h"
2021
#include "utils/rangetypes.h"
2122

2223

@@ -32,7 +33,11 @@
3233
#define RANGESTRAT_CONTAINED_BY 8
3334
#define RANGESTRAT_CONTAINS_ELEM 16
3435
#define RANGESTRAT_EQ 18
35-
#define RANGESTRAT_NE 19
36+
37+
/* Copy a RangeType datum (hardwires typbyval and typlen for ranges...) */
38+
#define rangeCopy(r) \
39+
((RangeType *) DatumGetPointer(datumCopy(PointerGetDatum(r), \
40+
false, -1)))
3641

3742
/*
3843
* Auxiliary structure for picksplit method.
@@ -145,6 +150,16 @@ range_gist_penalty(PG_FUNCTION_ARGS)
145150

146151
subtype_diff = &typcache->rng_subdiff_finfo;
147152

153+
/*
154+
* If new is or contains empty, and orig doesn't, apply infinite penalty.
155+
* We really don't want to pollute an empty-free subtree with empties.
156+
*/
157+
if (RangeIsOrContainsEmpty(new) && !RangeIsOrContainsEmpty(orig))
158+
{
159+
*penalty = get_float4_infinity();
160+
PG_RETURN_POINTER(penalty);
161+
}
162+
148163
/*
149164
* We want to compare the size of "orig" to size of "orig union new".
150165
* The penalty will be the sum of the reduction in the lower bound plus
@@ -163,30 +178,9 @@ range_gist_penalty(PG_FUNCTION_ARGS)
163178
}
164179
else if (empty1)
165180
{
166-
if (lower2.infinite || upper2.infinite)
167-
{
168-
/* from empty to infinite */
169-
*penalty = get_float4_infinity();
170-
PG_RETURN_POINTER(penalty);
171-
}
172-
else if (OidIsValid(subtype_diff->fn_oid))
173-
{
174-
/* from empty to upper2-lower2 */
175-
*penalty = DatumGetFloat8(FunctionCall2Coll(subtype_diff,
176-
typcache->rng_collation,
177-
upper2.val,
178-
lower2.val));
179-
/* upper2 must be >= lower2 */
180-
if (*penalty < 0)
181-
*penalty = 0; /* subtype_diff is broken */
182-
PG_RETURN_POINTER(penalty);
183-
}
184-
else
185-
{
186-
/* wild guess */
187-
*penalty = 1.0;
188-
PG_RETURN_POINTER(penalty);
189-
}
181+
/* infinite penalty for pushing non-empty into all-empty subtree */
182+
*penalty = get_float4_infinity();
183+
PG_RETURN_POINTER(penalty);
190184
}
191185

192186
/* if orig isn't empty, s_union can't be either */
@@ -334,15 +328,27 @@ range_gist_picksplit(PG_FUNCTION_ARGS)
334328
Datum
335329
range_gist_same(PG_FUNCTION_ARGS)
336330
{
337-
/* Datum r1 = PG_GETARG_DATUM(0); */
338-
/* Datum r2 = PG_GETARG_DATUM(1); */
331+
RangeType *r1 = PG_GETARG_RANGE(0);
332+
RangeType *r2 = PG_GETARG_RANGE(1);
339333
bool *result = (bool *) PG_GETARG_POINTER(2);
340334

341335
/*
342-
* We can safely call range_eq using our fcinfo directly; it won't notice
343-
* the third argument. This allows it to use fn_extra for caching.
336+
* range_eq will ignore the RANGE_CONTAIN_EMPTY flag, so we have to
337+
* check that for ourselves. More generally, if the entries have been
338+
* properly normalized, then unequal flags bytes must mean unequal ranges
339+
* ... so let's just test all the flag bits at once.
344340
*/
345-
*result = DatumGetBool(range_eq(fcinfo));
341+
if (range_get_flags(r1) != range_get_flags(r2))
342+
*result = false;
343+
else
344+
{
345+
/*
346+
* We can safely call range_eq using our fcinfo directly; it won't
347+
* notice the third argument. This allows it to use fn_extra for
348+
* caching.
349+
*/
350+
*result = DatumGetBool(range_eq(fcinfo));
351+
}
346352

347353
PG_RETURN_POINTER(result);
348354
}
@@ -356,27 +362,53 @@ range_gist_same(PG_FUNCTION_ARGS)
356362
/*
357363
* Return the smallest range that contains r1 and r2
358364
*
359-
* XXX would it be better to redefine range_union as working this way?
365+
* This differs from regular range_union in two critical ways:
366+
* 1. It won't throw an error for non-adjacent r1 and r2, but just absorb
367+
* the intervening values into the result range.
368+
* 2. We track whether any empty range has been union'd into the result,
369+
* so that contained_by searches can be indexed. Note that this means
370+
* that *all* unions formed within the GiST index must go through here.
360371
*/
361372
static RangeType *
362373
range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2)
363374
{
375+
RangeType *result;
364376
RangeBound lower1,
365377
lower2;
366378
RangeBound upper1,
367379
upper2;
368380
bool empty1,
369381
empty2;
382+
char flags1,
383+
flags2;
370384
RangeBound *result_lower;
371385
RangeBound *result_upper;
372386

373387
range_deserialize(typcache, r1, &lower1, &upper1, &empty1);
374388
range_deserialize(typcache, r2, &lower2, &upper2, &empty2);
389+
flags1 = range_get_flags(r1);
390+
flags2 = range_get_flags(r2);
375391

376392
if (empty1)
393+
{
394+
/* We can return r2 as-is if it already is or contains empty */
395+
if (flags2 & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY))
396+
return r2;
397+
/* Else we'd better copy it (modify-in-place isn't safe) */
398+
r2 = rangeCopy(r2);
399+
range_set_contain_empty(r2);
377400
return r2;
401+
}
378402
if (empty2)
403+
{
404+
/* We can return r1 as-is if it already is or contains empty */
405+
if (flags1 & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY))
406+
return r1;
407+
/* Else we'd better copy it (modify-in-place isn't safe) */
408+
r1 = rangeCopy(r1);
409+
range_set_contain_empty(r1);
379410
return r1;
411+
}
380412

381413
if (range_cmp_bounds(typcache, &lower1, &lower2) <= 0)
382414
result_lower = &lower1;
@@ -389,12 +421,19 @@ range_super_union(TypeCacheEntry *typcache, RangeType * r1, RangeType * r2)
389421
result_upper = &upper2;
390422

391423
/* optimization to avoid constructing a new range */
392-
if (result_lower == &lower1 && result_upper == &upper1)
424+
if (result_lower == &lower1 && result_upper == &upper1 &&
425+
((flags1 & RANGE_CONTAIN_EMPTY) || !(flags2 & RANGE_CONTAIN_EMPTY)))
393426
return r1;
394-
if (result_lower == &lower2 && result_upper == &upper2)
427+
if (result_lower == &lower2 && result_upper == &upper2 &&
428+
((flags2 & RANGE_CONTAIN_EMPTY) || !(flags1 & RANGE_CONTAIN_EMPTY)))
395429
return r2;
396430

397-
return make_range(typcache, result_lower, result_upper, false);
431+
result = make_range(typcache, result_lower, result_upper, false);
432+
433+
if ((flags1 & RANGE_CONTAIN_EMPTY) || (flags2 & RANGE_CONTAIN_EMPTY))
434+
range_set_contain_empty(result);
435+
436+
return result;
398437
}
399438

400439
/*
@@ -484,21 +523,26 @@ range_gist_consistent_int(FmgrInfo *flinfo, StrategyNumber strategy,
484523
break;
485524
case RANGESTRAT_CONTAINED_BY:
486525
/*
487-
* Ideally we'd apply range_overlaps here, but at present it
488-
* might fail to find empty ranges in the index, which should
489-
* be reported as being contained by anything. This needs work.
526+
* Empty ranges are contained by anything, so if key is or
527+
* contains any empty ranges, we must descend into it. Otherwise,
528+
* descend only if key overlaps the query.
490529
*/
491-
return true;
530+
if (RangeIsOrContainsEmpty(key))
531+
return true;
532+
proc = range_overlaps;
492533
break;
493534
case RANGESTRAT_CONTAINS_ELEM:
494535
proc = range_contains_elem;
495536
break;
496537
case RANGESTRAT_EQ:
538+
/*
539+
* If query is empty, descend only if the key is or contains any
540+
* empty ranges. Otherwise, descend if key contains query.
541+
*/
542+
if (RangeIsEmpty(DatumGetRangeType(query)))
543+
return RangeIsOrContainsEmpty(key);
497544
proc = range_contains;
498545
break;
499-
case RANGESTRAT_NE:
500-
return true;
501-
break;
502546
default:
503547
elog(ERROR, "unrecognized range strategy: %d", strategy);
504548
proc = NULL; /* keep compiler quiet */
@@ -555,9 +599,6 @@ range_gist_consistent_leaf(FmgrInfo *flinfo, StrategyNumber strategy,
555599
case RANGESTRAT_EQ:
556600
proc = range_eq;
557601
break;
558-
case RANGESTRAT_NE:
559-
proc = range_ne;
560-
break;
561602
default:
562603
elog(ERROR, "unrecognized range strategy: %d", strategy);
563604
proc = NULL; /* keep compiler quiet */

src/include/catalog/catversion.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,6 @@
5353
*/
5454

5555
/* yyyymmddN */
56-
#define CATALOG_VERSION_NO 201111241
56+
#define CATALOG_VERSION_NO 201111271
5757

5858
#endif

src/include/catalog/pg_amop.h

-1
Original file line numberDiff line numberDiff line change
@@ -736,6 +736,5 @@ DATA(insert ( 3919 3831 3831 7 s 3890 783 0 ));
736736
DATA(insert ( 3919 3831 3831 8 s 3892 783 0 ));
737737
DATA(insert ( 3919 3831 2283 16 s 3889 783 0 ));
738738
DATA(insert ( 3919 3831 3831 18 s 3882 783 0 ));
739-
DATA(insert ( 3919 3831 3831 19 s 3883 783 0 ));
740739

741740
#endif /* PG_AMOP_H */

src/include/utils/rangetypes.h

+13-8
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,15 @@ typedef struct
3333
#define RangeTypeGetOid(r) ((r)->rangetypid)
3434

3535
/* A range's flags byte contains these bits: */
36-
#define RANGE_EMPTY 0x01 /* range is empty */
37-
#define RANGE_LB_INC 0x02 /* lower bound is inclusive (vs exclusive) */
38-
#define RANGE_LB_NULL 0x04 /* lower bound is null (NOT CURRENTLY USED) */
39-
#define RANGE_LB_INF 0x08 /* lower bound is +/- infinity */
40-
#define RANGE_UB_INC 0x10 /* upper bound is inclusive (vs exclusive) */
41-
#define RANGE_UB_NULL 0x20 /* upper bound is null (NOT CURRENTLY USED) */
42-
#define RANGE_UB_INF 0x40 /* upper bound is +/- infinity */
36+
#define RANGE_EMPTY 0x01 /* range is empty */
37+
#define RANGE_LB_INC 0x02 /* lower bound is inclusive */
38+
#define RANGE_UB_INC 0x04 /* upper bound is inclusive */
39+
#define RANGE_LB_INF 0x08 /* lower bound is -infinity */
40+
#define RANGE_UB_INF 0x10 /* upper bound is +infinity */
41+
#define RANGE_LB_NULL 0x20 /* lower bound is null (NOT USED) */
42+
#define RANGE_UB_NULL 0x40 /* upper bound is null (NOT USED) */
43+
#define RANGE_CONTAIN_EMPTY 0x80 /* marks a GiST internal-page entry whose
44+
* subtree contains some empty ranges */
4345

4446
#define RANGE_HAS_LBOUND(flags) (!((flags) & (RANGE_EMPTY | \
4547
RANGE_LB_NULL | \
@@ -49,7 +51,9 @@ typedef struct
4951
RANGE_UB_NULL | \
5052
RANGE_UB_INF)))
5153

52-
#define RangeIsEmpty(r) (range_get_flags(r) & RANGE_EMPTY)
54+
#define RangeIsEmpty(r) ((range_get_flags(r) & RANGE_EMPTY) != 0)
55+
#define RangeIsOrContainsEmpty(r) \
56+
((range_get_flags(r) & (RANGE_EMPTY | RANGE_CONTAIN_EMPTY)) != 0)
5357

5458

5559
/* Internal representation of either bound of a range (not what's on disk) */
@@ -152,6 +156,7 @@ extern void range_deserialize(TypeCacheEntry *typcache, RangeType *range,
152156
RangeBound *lower, RangeBound *upper,
153157
bool *empty);
154158
extern char range_get_flags(RangeType *range);
159+
extern void range_set_contain_empty(RangeType *range);
155160
extern RangeType *make_range(TypeCacheEntry *typcache, RangeBound *lower,
156161
RangeBound *upper, bool empty);
157162
extern int range_cmp_bounds(TypeCacheEntry *typcache, RangeBound *b1,

src/test/regress/expected/opr_sanity.out

+1-2
Original file line numberDiff line numberDiff line change
@@ -1041,7 +1041,6 @@ ORDER BY 1, 2, 3;
10411041
783 | 15 | <->
10421042
783 | 16 | @>
10431043
783 | 18 | =
1044-
783 | 19 | <>
10451044
783 | 27 | @>
10461045
783 | 28 | <@
10471046
783 | 47 | @>
@@ -1054,7 +1053,7 @@ ORDER BY 1, 2, 3;
10541053
2742 | 2 | @@@
10551054
2742 | 3 | <@
10561055
2742 | 4 | =
1057-
(44 rows)
1056+
(43 rows)
10581057

10591058
-- Check that all opclass search operators have selectivity estimators.
10601059
-- This is not absolutely required, but it seems a reasonable thing

0 commit comments

Comments
 (0)