postgrespro
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
Lines changed: 2 additions & 2 deletions
diff --git a/doc/src/sgml/xfunc.sgml b/doc/src/sgml/xfunc.sgml
Lines changed: 5 additions & 6 deletions
diff --git a/doc/src/sgml/xoper.sgml b/doc/src/sgml/xoper.sgml
Lines changed: 23 additions & 35 deletions
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
Lines changed: 38 additions & 6 deletions
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
Lines changed: 51 additions & 72 deletions
@@ -1,6 +1,6 @@
 <!--
  Documentation of the system catalogs, directed toward PostgreSQL developers
- $Header: /cvsroot/pgsql/doc/src/sgml/catalogs.sgml,v 2.71 2003/05/28 16:03:55 tgl Exp $
+ $Header: /cvsroot/pgsql/doc/src/sgml/catalogs.sgml,v 2.72 2003/06/22 22:04:54 tgl Exp $
  -->
 
 <chapter id="catalogs">
@@ -2525,7 +2525,7 @@
       <entry><structfield>oprcanhash</structfield></entry>
       <entry><type>bool</type></entry>
       <entry></entry>
-      <entry>This operator supports hash joins.</entry>
+      <entry>This operator supports hash joins</entry>
      </row>
 
      <row>
 
@@ -1,5 +1,5 @@
 <!--
-$Header: /cvsroot/pgsql/doc/src/sgml/xfunc.sgml,v 1.68 2003/05/29 20:40:36 tgl Exp $
+$Header: /cvsroot/pgsql/doc/src/sgml/xfunc.sgml,v 1.69 2003/06/22 22:04:54 tgl Exp $
 -->
 
  <sect1 id="xfunc">
@@ -1442,11 +1442,10 @@ concat_text(PG_FUNCTION_ARGS)
       <listitem>
        <para>
         Always zero the bytes of your structures using
-        <function>memset</function> or <function>bzero</function>.
-        Several routines (such as the hash access method, hash joins,
-        and the sort algorithm) compute functions of the raw bits
-        contained in your structure.  Even if you initialize all
-        fields of your structure, there may be several bytes of
+	<function>memset</function>.  Without this, it's difficult to
+	support hash indexes or hash joins, as you must pick out only
+	the significant bits of your data structure to compute a hash.
+        Even if you initialize all fields of your structure, there may be
         alignment padding (holes in the structure) that may contain
         garbage values.
        </para>
 
@@ -1,5 +1,5 @@
 <!--
-$Header: /cvsroot/pgsql/doc/src/sgml/xoper.sgml,v 1.23 2003/04/10 01:22:45 petere Exp $
+$Header: /cvsroot/pgsql/doc/src/sgml/xoper.sgml,v 1.24 2003/06/22 22:04:54 tgl Exp $
 -->
 
  <sect1 id="xoper">
@@ -315,46 +315,34 @@ table1.column1 OP table2.column2
      same hash code.  If two values get put in different hash buckets, the
      join will never compare them at all, implicitly assuming that the
      result of the join operator must be false.  So it never makes sense
-     to specify <literal>HASHES</literal> for operators that do not represent equality.
+     to specify <literal>HASHES</literal> for operators that do not represent
+     equality.
     </para>
 
     <para>
-     In fact, logical equality is not good enough either; the operator
-     had better represent pure bitwise equality, because the hash
-     function will be computed on the memory representation of the
-     values regardless of what the bits mean.  For example, the
-     polygon operator <literal>~=</literal>, which checks whether two
-     polygons are the same, is not bitwise equality, because two
-     polygons can be considered the same even if their vertices are
-     specified in a different order.  What this means is that a join
-     using <literal>~=</literal> between polygon fields would yield
-     different results if implemented as a hash join than if
-     implemented another way, because a large fraction of the pairs
-     that should match will hash to different values and will never be
-     compared by the hash join.  But if the optimizer chooses to use a
-     different kind of join, all the pairs that the operator
-     <literal>~=</literal> says are the same will be found.  We don't
-     want that kind of inconsistency, so we don't mark the polygon
-     operator <literal>~=</literal> as hashable.
+     To be marked <literal>HASHES</literal>, the join operator must appear
+     in a hash index operator class.  This is not enforced when you create
+     the operator, since of course the referencing operator class couldn't
+     exist yet.  But attempts to use the operator in hash joins will fail
+     at runtime if no such operator class exists.  The system needs the
+     operator class to find the data-type-specific hash function for the
+     operator's input data type.  Of course, you must also supply a suitable
+     hash function before you can create the operator class.
     </para>
 
     <para>
-     There are also machine-dependent ways in which a hash join might fail
-     to do the right thing.  For example, if your data type
-     is a structure in which there may be uninteresting pad bits, it's unsafe
-     to mark the equality operator <literal>HASHES</>.  (Unless you write
-     your other operators and functions to ensure that the unused bits are always zero, which is the recommended strategy.)
-     Another example is that the floating-point data types are unsafe for hash
-     joins.  On machines that meet the <acronym>IEEE</> floating-point standard, negative
-     zero and positive zero are different values (different bit patterns) but
-     they are defined to compare equal.  So, if the equality operator on floating-point data types were marked
-     <literal>HASHES</>, a negative zero and a positive zero would probably not be matched up
-     by a hash join, but they would be matched up by any other join process.
-    </para>
-
-    <para>
-     The bottom line is that you should probably only use <literal>HASHES</literal> for
-     equality operators that are (or could be) implemented by <function>memcmp()</function>.
+     Care should be exercised when preparing a hash function, because there
+     are machine-dependent ways in which it might fail to do the right thing.
+     For example, if your data type is a structure in which there may be
+     uninteresting pad bits, you can't simply pass the whole structure to
+     <function>hash_any</> unless you write your other operators and
+     functions to ensure that the unused bits are always zero (which is the
+     recommended strategy).
+     Another example is that on machines that meet the <acronym>IEEE</>
+     floating-point standard, negative zero and positive zero are different
+     values (different bit patterns) but they are defined to compare equal.
+     If a float value might contain negative zero, then extra steps are needed
+     to ensure it generates the same hash value as positive zero.
     </para>
 
     <note>
 
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashfunc.c,v 1.35 2002/09/04 20:31:09 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashfunc.c,v 1.36 2003/06/22 22:04:54 tgl Exp $
  *
  * NOTES
  *	  These functions are stored in pg_amproc.	For each operator class
@@ -22,6 +22,7 @@
 #include "access/hash.h"
 
 
+/* Note: this is used for both "char" and boolean datatypes */
 Datum
 hashchar(PG_FUNCTION_ARGS)
 {
@@ -58,6 +59,14 @@ hashfloat4(PG_FUNCTION_ARGS)
 {
 	float4		key = PG_GETARG_FLOAT4(0);
 
+	/*
+	 * On IEEE-float machines, minus zero and zero have different bit patterns
+	 * but should compare as equal.  We must ensure that they have the same
+	 * hash value, which is most easily done this way:
+	 */
+	if (key == (float4) 0)
+		PG_RETURN_UINT32(0);
+
 	return hash_any((unsigned char *) &key, sizeof(key));
 }
 
@@ -66,6 +75,14 @@ hashfloat8(PG_FUNCTION_ARGS)
 {
 	float8		key = PG_GETARG_FLOAT8(0);
 
+	/*
+	 * On IEEE-float machines, minus zero and zero have different bit patterns
+	 * but should compare as equal.  We must ensure that they have the same
+	 * hash value, which is most easily done this way:
+	 */
+	if (key == (float8) 0)
+		PG_RETURN_UINT32(0);
+
 	return hash_any((unsigned char *) &key, sizeof(key));
 }
 
@@ -77,11 +94,6 @@ hashoidvector(PG_FUNCTION_ARGS)
 	return hash_any((unsigned char *) key, INDEX_MAX_KEYS * sizeof(Oid));
 }
 
-/*
- * Note: hashint2vector currently can't be used as a user hash table
- * hash function, because it has no pg_proc entry.	We only need it
- * for catcache indexing.
- */
 Datum
 hashint2vector(PG_FUNCTION_ARGS)
 {
@@ -102,6 +114,26 @@ hashname(PG_FUNCTION_ARGS)
 	return hash_any((unsigned char *) key, keylen);
 }
 
+Datum
+hashtext(PG_FUNCTION_ARGS)
+{
+	text	   *key = PG_GETARG_TEXT_P(0);
+	Datum		result;
+
+	/*
+	 * Note: this is currently identical in behavior to hashvarlena,
+	 * but it seems likely that we may need to do something different
+	 * in non-C locales.  (See also hashbpchar, if so.)
+	 */
+	result = hash_any((unsigned char *) VARDATA(key),
+					  VARSIZE(key) - VARHDRSZ);
+
+	/* Avoid leaking memory for toasted inputs */
+	PG_FREE_IF_COPY(key, 0);
+
+	return result;
+}
+
 /*
  * hashvarlena() can be used for any varlena datatype in which there are
  * no non-significant bits, ie, distinct bitpatterns never compare as equal.
 
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/execGrouping.c,v 1.2 2003/01/12 04:03:34 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/execGrouping.c,v 1.3 2003/06/22 22:04:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,8 @@
 #include "executor/executor.h"
 #include "parser/parse_oper.h"
 #include "utils/memutils.h"
+#include "utils/lsyscache.h"
+#include "utils/syscache.h"
 
 
 /*****************************************************************************
@@ -213,76 +215,46 @@ execTuplesMatchPrepare(TupleDesc tupdesc,
 	return eqfunctions;
 }
 
-
-/*****************************************************************************
- *		Utility routines for hashing
- *****************************************************************************/
-
 /*
- * ComputeHashFunc
+ * execTuplesHashPrepare
+ *		Look up the equality and hashing functions needed for a TupleHashTable.
  *
- *		the hash function for hash joins (also used for hash aggregation)
- *
- *		XXX this probably ought to be replaced with datatype-specific
- *		hash functions, such as those already implemented for hash indexes.
+ * This is similar to execTuplesMatchPrepare, but we also need to find the
+ * hash functions associated with the equality operators.  *eqfunctions and
+ * *hashfunctions receive the palloc'd result arrays.
  */
-uint32
-ComputeHashFunc(Datum key, int typLen, bool byVal)
+void
+execTuplesHashPrepare(TupleDesc tupdesc,
+					  int numCols,
+					  AttrNumber *matchColIdx,
+					  FmgrInfo **eqfunctions,
+					  FmgrInfo **hashfunctions)
 {
-	unsigned char *k;
+	int			i;
 
-	if (byVal)
-	{
-		/*
-		 * If it's a by-value data type, just hash the whole Datum value.
-		 * This assumes that datatypes narrower than Datum are
-		 * consistently padded (either zero-extended or sign-extended, but
-		 * not random bits) to fill Datum; see the XXXGetDatum macros in
-		 * postgres.h. NOTE: it would not work to do hash_any(&key, len)
-		 * since this would get the wrong bytes on a big-endian machine.
-		 */
-		k = (unsigned char *) &key;
-		typLen = sizeof(Datum);
-	}
-	else
+	*eqfunctions = (FmgrInfo *) palloc(numCols * sizeof(FmgrInfo));
+	*hashfunctions = (FmgrInfo *) palloc(numCols * sizeof(FmgrInfo));
+
+	for (i = 0; i < numCols; i++)
 	{
-		if (typLen > 0)
-		{
-			/* fixed-width pass-by-reference type */
-			k = (unsigned char *) DatumGetPointer(key);
-		}
-		else if (typLen == -1)
-		{
-			/*
-			 * It's a varlena type, so 'key' points to a "struct varlena".
-			 * NOTE: VARSIZE returns the "real" data length plus the
-			 * sizeof the "vl_len" attribute of varlena (the length
-			 * information). 'key' points to the beginning of the varlena
-			 * struct, so we have to use "VARDATA" to find the beginning
-			 * of the "real" data.	Also, we have to be careful to detoast
-			 * the datum if it's toasted.  (We don't worry about freeing
-			 * the detoasted copy; that happens for free when the
-			 * per-tuple memory context is reset in ExecHashGetBucket.)
-			 */
-			struct varlena *vkey = PG_DETOAST_DATUM(key);
-
-			typLen = VARSIZE(vkey) - VARHDRSZ;
-			k = (unsigned char *) VARDATA(vkey);
-		}
-		else if (typLen == -2)
-		{
-			/* It's a null-terminated C string */
-			typLen = strlen(DatumGetCString(key)) + 1;
-			k = (unsigned char *) DatumGetPointer(key);
-		}
-		else
-		{
-			elog(ERROR, "ComputeHashFunc: Invalid typLen %d", typLen);
-			k = NULL;			/* keep compiler quiet */
-		}
+		AttrNumber	att = matchColIdx[i];
+		Oid			typid = tupdesc->attrs[att - 1]->atttypid;
+		Operator	optup;
+		Oid			eq_opr;
+		Oid			eq_function;
+		Oid			hash_function;
+
+		optup = equality_oper(typid, false);
+		eq_opr = oprid(optup);
+		eq_function = oprfuncid(optup);
+		ReleaseSysCache(optup);
+		hash_function = get_op_hash_function(eq_opr);
+		if (!OidIsValid(hash_function))
+			elog(ERROR, "Could not find hash function for hash operator %u",
+				 eq_opr);
+		fmgr_info(eq_function, &(*eqfunctions)[i]);
+		fmgr_info(hash_function, &(*hashfunctions)[i]);
 	}
-
-	return DatumGetUInt32(hash_any(k, typLen));
 }
 
 
@@ -299,19 +271,21 @@ ComputeHashFunc(Datum key, int typLen, bool byVal)
  *
  *	numCols, keyColIdx: identify the tuple fields to use as lookup key
  *	eqfunctions: equality comparison functions to use
+ *	hashfunctions: datatype-specific hashing functions to use
  *	nbuckets: number of buckets to make
  *	entrysize: size of each entry (at least sizeof(TupleHashEntryData))
  *	tablecxt: memory context in which to store table and table entries
  *	tempcxt: short-lived context for evaluating hash and comparison functions
  *
- * The eqfunctions array may be made with execTuplesMatchPrepare().
+ * The function arrays may be made with execTuplesHashPrepare().
  *
- * Note that keyColIdx and eqfunctions must be allocated in storage that
- * will live as long as the hashtable does.
+ * Note that keyColIdx, eqfunctions, and hashfunctions must be allocated in
+ * storage that will live as long as the hashtable does.
  */
 TupleHashTable
 BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
 					FmgrInfo *eqfunctions,
+					FmgrInfo *hashfunctions,
 					int nbuckets, Size entrysize,
 					MemoryContext tablecxt, MemoryContext tempcxt)
 {
@@ -328,6 +302,7 @@ BuildTupleHashTable(int numCols, AttrNumber *keyColIdx,
 	hashtable->numCols = numCols;
 	hashtable->keyColIdx = keyColIdx;
 	hashtable->eqfunctions = eqfunctions;
+	hashtable->hashfunctions = hashfunctions;
 	hashtable->tablecxt = tablecxt;
 	hashtable->tempcxt = tempcxt;
 	hashtable->entrysize = entrysize;
@@ -375,11 +350,15 @@ LookupTupleHashEntry(TupleHashTable hashtable, TupleTableSlot *slot,
 		hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0);
 
 		attr = heap_getattr(tuple, att, tupdesc, &isNull);
-		if (isNull)
-			continue;			/* treat nulls as having hash key 0 */
-		hashkey ^= ComputeHashFunc(attr,
-								   (int) tupdesc->attrs[att - 1]->attlen,
-								   tupdesc->attrs[att - 1]->attbyval);
+
+		if (!isNull)			/* treat nulls as having hash key 0 */
+		{
+			uint32		hkey;
+
+			hkey = DatumGetUInt32(FunctionCall1(&hashtable->hashfunctions[i],
+												attr));
+			hashkey ^= hkey;
+		}
 	}
 	bucketno = hashkey % (uint32) hashtable->nbuckets;