Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 866bad9

Browse files
committed
Add a rank/(rank+1) normalization option to ts_rank() and ts_rank_cd().
While the usefulness of this seems a bit marginal, if it's useful enough to be shown in the manual, then we probably ought to support doing it without double evaluation of the ranking function. Per my proposal earlier today.
1 parent 5858990 commit 866bad9

File tree

2 files changed

+32
-15
lines changed

2 files changed

+32
-15
lines changed

doc/src/sgml/textsearch.sgml

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.33 2007/11/14 18:36:37 tgl Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.34 2007/11/14 23:43:27 tgl Exp $ -->
22

33
<chapter id="textsearch">
44
<title id="textsearch-title">Full Text Search</title>
@@ -940,6 +940,7 @@ SELECT plainto_tsquery('english', 'The Fat &amp; Rats:C');
940940
<listitem>
941941
<para>
942942
4 divides the rank by the mean harmonic distance between extents
943+
(this is implemented only by <function>ts_rank_cd</>)
943944
</para>
944945
</listitem>
945946
<listitem>
@@ -953,17 +954,24 @@ SELECT plainto_tsquery('english', 'The Fat &amp; Rats:C');
953954
of unique words in document
954955
</para>
955956
</listitem>
957+
<listitem>
958+
<para>
959+
32 divides the rank by itself + 1
960+
</para>
961+
</listitem>
956962
</itemizedlist>
957963

964+
If more than one flag bit is specified, the transformations are
965+
applied in the order listed.
958966
</para>
959967

960968
<para>
961969
It is important to note that the ranking functions do not use any global
962-
information so it is impossible to produce a fair normalization to 1% or
963-
100%, as sometimes desired. However, a simple technique like
964-
<literal>rank/(rank+1)</literal> can be applied. Of course, this is just
965-
a cosmetic change, i.e., the ordering of the search results will not
966-
change.
970+
information, so it is impossible to produce a fair normalization to 1% or
971+
100% as sometimes desired. Normalization option 32
972+
(<literal>rank/(rank+1)</literal>) can be applied to scale all ranks
973+
into the range zero to one, but of course this is just a cosmetic change;
974+
it will not affect the ordering of the search results.
967975
</para>
968976

969977
<para>
@@ -991,7 +999,7 @@ ORDER BY rank DESC LIMIT 10;
991999
This is the same example using normalized ranking:
9921000

9931001
<programlisting>
994-
SELECT title, ts_rank_cd(textsearch, query)/(ts_rank_cd(textsearch, query) + 1) AS rank
1002+
SELECT title, ts_rank_cd(textsearch, query, 32 /* rank/(rank+1) */ ) AS rank
9951003
FROM apod, to_tsquery('neutrino|(dark &amp; matter)') query
9961004
WHERE query @@ textsearch
9971005
ORDER BY rank DESC LIMIT 10;

src/backend/utils/adt/tsrank.c

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.8 2007/09/20 18:10:57 teodor Exp $
10+
* $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.9 2007/11/14 23:43:27 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -25,13 +25,14 @@ static float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
2525

2626
#define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
2727

28-
#define RANK_NO_NORM 0x00
28+
#define RANK_NO_NORM 0x00
2929
#define RANK_NORM_LOGLENGTH 0x01
30-
#define RANK_NORM_LENGTH 0x02
31-
#define RANK_NORM_EXTDIST 0x04
32-
#define RANK_NORM_UNIQ 0x08
33-
#define RANK_NORM_LOGUNIQ 0x10
34-
#define DEF_NORM_METHOD RANK_NO_NORM
30+
#define RANK_NORM_LENGTH 0x02
31+
#define RANK_NORM_EXTDIST 0x04
32+
#define RANK_NORM_UNIQ 0x08
33+
#define RANK_NORM_LOGUNIQ 0x10
34+
#define RANK_NORM_RDIVRPLUS1 0x20
35+
#define DEF_NORM_METHOD RANK_NO_NORM
3536

3637
static float calc_rank_or(float *w, TSVector t, TSQuery q);
3738
static float calc_rank_and(float *w, TSVector t, TSQuery q);
@@ -348,12 +349,17 @@ calc_rank(float *w, TSVector t, TSQuery q, int4 method)
348349
res /= (float) len;
349350
}
350351

352+
/* RANK_NORM_EXTDIST not applicable */
353+
351354
if ((method & RANK_NORM_UNIQ) && t->size > 0)
352355
res /= (float) (t->size);
353356

354357
if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
355358
res /= log((double) (t->size + 1)) / log(2.0);
356359

360+
if (method & RANK_NORM_RDIVRPLUS1)
361+
res /= (res + 1);
362+
357363
return res;
358364
}
359365

@@ -762,7 +768,7 @@ calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method)
762768
Wdoc /= (double) len;
763769
}
764770

765-
if ((method & RANK_NORM_EXTDIST) && SumDist > 0)
771+
if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
766772
Wdoc /= ((double) NExtent) / SumDist;
767773

768774
if ((method & RANK_NORM_UNIQ) && txt->size > 0)
@@ -771,6 +777,9 @@ calc_rank_cd(float4 *arrdata, TSVector txt, TSQuery query, int method)
771777
if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
772778
Wdoc /= log((double) (txt->size + 1)) / log(2.0);
773779

780+
if (method & RANK_NORM_RDIVRPLUS1)
781+
Wdoc /= (Wdoc + 1);
782+
774783
pfree(doc);
775784

776785
pfree( qr.operandexist );

0 commit comments

Comments
 (0)