Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 656beff

Browse files
committed
Adjust string comparison so that only bitwise-equal strings are considered equal: if strcoll() claims two strings are equal, verify that with strcmp(), and if they are not identical, sort them according to strcmp().
This fixes inconsistent behavior under glibc's hu_HU locale, and probably under some other locales as well.
Also, take advantage of the now-well-defined behavior to speed up texteq, textne, bpchareq, and bpcharne: they may as well just do a bitwise comparison and not bother with strcoll() at all.
NOTE: affected databases may need to REINDEX indexes on text columns to be sure they are self-consistent.
1 parent 7b53b45 commit 656beff

File tree

3 files changed

+40
-15
lines changed

3 files changed

+40
-15
lines changed

src/backend/access/hash/hashfunc.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.45 2005/10/15 02:49:08 momjian Exp $
11+
* $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.46 2005/12/22 22:50:00 tgl Exp $
1212
*
1313
* NOTES
1414
* These functions are stored in pg_amproc. For each operator class
@@ -138,9 +138,9 @@ hashtext(PG_FUNCTION_ARGS)
138138
Datum result;
139139

140140
/*
141-
* Note: this is currently identical in behavior to hashvarlena, but it
142-
* seems likely that we may need to do something different in non-C
143-
* locales. (See also hashbpchar, if so.)
141+
* Note: this is currently identical in behavior to hashvarlena, but
142+
* keep it as a separate function in case we someday want to do something
143+
* different in non-C locales. (See also hashbpchar, if so.)
144144
*/
145145
result = hash_any((unsigned char *) VARDATA(key),
146146
VARSIZE(key) - VARHDRSZ);

src/backend/utils/adt/varchar.c

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/utils/adt/varchar.c,v 1.113 2005/10/15 02:49:30 momjian Exp $
11+
* $PostgreSQL: pgsql/src/backend/utils/adt/varchar.c,v 1.114 2005/12/22 22:50:00 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -614,11 +614,14 @@ bpchareq(PG_FUNCTION_ARGS)
614614
len1 = bcTruelen(arg1);
615615
len2 = bcTruelen(arg2);
616616

617-
/* fast path for different-length inputs */
617+
/*
618+
* Since we only care about equality or not-equality, we can avoid all
619+
* the expense of strcoll() here, and just do bitwise comparison.
620+
*/
618621
if (len1 != len2)
619622
result = false;
620623
else
621-
result = (varstr_cmp(VARDATA(arg1), len1, VARDATA(arg2), len2) == 0);
624+
result = (strncmp(VARDATA(arg1), VARDATA(arg2), len1) == 0);
622625

623626
PG_FREE_IF_COPY(arg1, 0);
624627
PG_FREE_IF_COPY(arg2, 1);
@@ -638,11 +641,14 @@ bpcharne(PG_FUNCTION_ARGS)
638641
len1 = bcTruelen(arg1);
639642
len2 = bcTruelen(arg2);
640643

641-
/* fast path for different-length inputs */
644+
/*
645+
* Since we only care about equality or not-equality, we can avoid all
646+
* the expense of strcoll() here, and just do bitwise comparison.
647+
*/
642648
if (len1 != len2)
643649
result = true;
644650
else
645-
result = (varstr_cmp(VARDATA(arg1), len1, VARDATA(arg2), len2) != 0);
651+
result = (strncmp(VARDATA(arg1), VARDATA(arg2), len1) != 0);
646652

647653
PG_FREE_IF_COPY(arg1, 0);
648654
PG_FREE_IF_COPY(arg2, 1);
@@ -789,7 +795,9 @@ bpchar_smaller(PG_FUNCTION_ARGS)
789795
* bpchar needs a specialized hash function because we want to ignore
790796
* trailing blanks in comparisons.
791797
*
792-
* XXX is there any need for locale-specific behavior here?
798+
* Note: currently there is no need for locale-specific behavior here,
799+
* but if we ever change the semantics of bpchar comparison to trust
800+
* strcoll() completely, we'd need to do something different in non-C locales.
793801
*/
794802
Datum
795803
hashbpchar(PG_FUNCTION_ARGS)

src/backend/utils/adt/varlena.c

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.141 2005/11/22 18:17:23 momjian Exp $
11+
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.142 2005/12/22 22:50:00 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -938,6 +938,15 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2)
938938

939939
result = strcoll(a1p, a2p);
940940

941+
/*
942+
* In some locales strcoll() can claim that nonidentical strings are
943+
* equal. Believing that would be bad news for a number of reasons,
944+
* so we follow Perl's lead and sort "equal" strings according to
945+
* strcmp().
946+
*/
947+
if (result == 0)
948+
result = strcmp(a1p, a2p);
949+
941950
if (a1p != a1buf)
942951
pfree(a1p);
943952
if (a2p != a2buf)
@@ -984,11 +993,15 @@ texteq(PG_FUNCTION_ARGS)
984993
text *arg2 = PG_GETARG_TEXT_P(1);
985994
bool result;
986995

987-
/* fast path for different-length inputs */
996+
/*
997+
* Since we only care about equality or not-equality, we can avoid all
998+
* the expense of strcoll() here, and just do bitwise comparison.
999+
*/
9881000
if (VARSIZE(arg1) != VARSIZE(arg2))
9891001
result = false;
9901002
else
991-
result = (text_cmp(arg1, arg2) == 0);
1003+
result = (strncmp(VARDATA(arg1), VARDATA(arg2),
1004+
VARSIZE(arg1) - VARHDRSZ) == 0);
9921005

9931006
PG_FREE_IF_COPY(arg1, 0);
9941007
PG_FREE_IF_COPY(arg2, 1);
@@ -1003,11 +1016,15 @@ textne(PG_FUNCTION_ARGS)
10031016
text *arg2 = PG_GETARG_TEXT_P(1);
10041017
bool result;
10051018

1006-
/* fast path for different-length inputs */
1019+
/*
1020+
* Since we only care about equality or not-equality, we can avoid all
1021+
* the expense of strcoll() here, and just do bitwise comparison.
1022+
*/
10071023
if (VARSIZE(arg1) != VARSIZE(arg2))
10081024
result = true;
10091025
else
1010-
result = (text_cmp(arg1, arg2) != 0);
1026+
result = (strncmp(VARDATA(arg1), VARDATA(arg2),
1027+
VARSIZE(arg1) - VARHDRSZ) != 0);
10111028

10121029
PG_FREE_IF_COPY(arg1, 0);
10131030
PG_FREE_IF_COPY(arg2, 1);

0 commit comments

Comments
 (0)