Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 22505f4

Browse files
committed
Add a thesaurus dictionary, which can replace N>0 lexemes with M>0 lexemes.
This required some changes to the lexize algorithm, but the dictionary interface stays compatible with old dictionaries. Funded by Georgia Public Library Service and LibLime, Inc.
1 parent 3b7ed9b commit 22505f4

13 files changed

+1257
-129
lines changed

contrib/tsearch2/Makefile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
1-
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.14 2006/05/02 11:28:54 teodor Exp $
1+
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.15 2006/05/31 14:05:31 teodor Exp $
22

33
MODULE_big = tsearch2
44
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o \
5-
dict_snowball.o dict_ispell.o dict_syn.o \
5+
dict_snowball.o dict_ispell.o dict_syn.o dict_thesaurus.o \
66
wparser.o wparser_def.o \
77
ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o \
88
tsvector_op.o rank.o ts_stat.o \
99
query_util.o query_support.o query_rewrite.o query_gist.o \
10-
ts_locale.o ginidx.o
10+
ts_locale.o ts_lexize.o ginidx.o
1111

1212
SUBDIRS := snowball ispell wordparser
1313
SUBDIROBJS := $(SUBDIRS:%=%/SUBSYS.o)
@@ -16,7 +16,7 @@ OBJS += $(SUBDIROBJS)
1616

1717
PG_CPPFLAGS = -I$(srcdir)/snowball -I$(srcdir)/ispell -I$(srcdir)/wordparser
1818

19-
DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8
19+
DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8 thesaurus
2020
DATA_built = tsearch2.sql untsearch2.sql
2121
DOCS = README.tsearch2
2222
REGRESS = tsearch2

contrib/tsearch2/common.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "catalog/pg_proc.h"
66
#include "catalog/pg_namespace.h"
77
#include "utils/syscache.h"
8+
#include "miscadmin.h"
89

910
#include "ts_cfg.h"
1011
#include "dict.h"
@@ -163,3 +164,23 @@ get_oidnamespace(Oid funcoid)
163164

164165
return nspoid;
165166
}
167+
168+
/*
 * Return a usable path for filename.
 *
 * If filename is already an absolute path, it is returned unchanged (the
 * same pointer). Otherwise it is taken as relative to the share directory
 * that get_share_path() reports for my_exec_path, and a newly palloc'd
 * string of the form "<sharepath><delim><filename>" is returned. The
 * caller's original string is not modified.
 */
169+
char *
170+
to_absfilename(char *filename) {
171+
if (!is_absolute_path(filename)) {
172+
char sharepath[MAXPGPATH];
173+
char *absfn;
174+
#ifdef WIN32
175+
char delim = '\\';
176+
#else
177+
char delim = '/';
178+
#endif
179+
get_share_path(my_exec_path, sharepath);
180+
/* +2: one byte for the delimiter and one for the terminating NUL */
absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
181+
sprintf(absfn, "%s%c%s", sharepath, delim, filename);
182+
filename = absfn;
183+
}
184+
185+
return filename;
186+
}

contrib/tsearch2/common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ text *mtextdup(text *in);
1616

1717
int text_cmp(text *a, text *b);
1818

19+
char * to_absfilename(char *filename);
20+
1921
#define NEXTVAL(x) ( (text*)( (char*)(x) + INTALIGN( VARSIZE(x) ) ) )
2022
#define ARRNELEMS(x) ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x))
2123

contrib/tsearch2/dict.c

Lines changed: 50 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.11 2006/03/11 04:38:30 momjian Exp $ */
1+
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.12 2006/05/31 14:05:31 teodor Exp $ */
22

33
/*
44
* interface functions to dictionary
@@ -50,16 +50,19 @@ init_dict(Oid id, DictInfo * dict)
5050
Datum opt;
5151
Oid oid = InvalidOid;
5252

53+
/* setup dictlexize method */
54+
oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
55+
if (isnull || oid == InvalidOid)
56+
ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
57+
fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
58+
59+
/* set up and call the dictinit method, if one is defined */
5360
oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull));
5461
if (!(isnull || oid == InvalidOid))
5562
{
5663
opt = SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 2, &isnull);
5764
dict->dictionary = (void *) DatumGetPointer(OidFunctionCall1(oid, opt));
5865
}
59-
oid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 3, &isnull));
60-
if (isnull || oid == InvalidOid)
61-
ts_error(ERROR, "Null dict_lexize for dictonary %d", id);
62-
fmgr_info_cxt(oid, &(dict->lexize_info), TopMemoryContext);
6366
dict->dict_id = id;
6467
}
6568
else
@@ -98,6 +101,29 @@ comparedict(const void *a, const void *b)
98101
return (((DictInfo *) a)->dict_id < ((DictInfo *) b)->dict_id) ? -1 : 1;
99102
}
100103

104+
/*
 * Initialize the dictionary with the given id and add it to the DList cache.
 *
 * If the array is full it is grown first (doubling, starting at 16 entries);
 * ts_error(ERROR, "No memory") is raised when realloc fails. The new entry
 * is built in a local DictInfo, copied into the first free slot, and then
 * the whole array is re-sorted with comparedict. Because of the realloc and
 * the sort, pointers into DList.list taken before this call must not be
 * relied on afterwards.
 */
static void
105+
insertdict(Oid id) {
106+
DictInfo newdict;
107+
108+
if (DList.len == DList.reallen)
109+
{
110+
DictInfo *tmp;
111+
int reallen = (DList.reallen) ? 2 * DList.reallen : 16;
112+
113+
tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
114+
if (!tmp)
115+
ts_error(ERROR, "No memory");
116+
DList.reallen = reallen;
117+
DList.list = tmp;
118+
}
119+
/*
 * NOTE(review): presumably the entry is built in a local variable because
 * init_dict() may itself cause DList to change (for example a dictionary
 * that looks up other dictionaries while initializing) -- confirm.
 */
init_dict(id, &newdict);
120+
121+
DList.list[DList.len] = newdict;
122+
DList.len++;
123+
124+
/* keep the array ordered by comparedict; entry positions may change */
qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
125+
}
126+
101127
DictInfo *
102128
finddict(Oid id)
103129
{
@@ -117,23 +143,8 @@ finddict(Oid id)
117143
return DList.last_dict;
118144
}
119145

120-
/* last chance */
121-
if (DList.len == DList.reallen)
122-
{
123-
DictInfo *tmp;
124-
int reallen = (DList.reallen) ? 2 * DList.reallen : 16;
125-
126-
tmp = (DictInfo *) realloc(DList.list, sizeof(DictInfo) * reallen);
127-
if (!tmp)
128-
ts_error(ERROR, "No memory");
129-
DList.reallen = reallen;
130-
DList.list = tmp;
131-
}
132-
DList.last_dict = &(DList.list[DList.len]);
133-
init_dict(id, DList.last_dict);
134-
135-
DList.len++;
136-
qsort(DList.list, DList.len, sizeof(DictInfo), comparedict);
146+
/* insert new dictionary */
147+
insertdict(id);
137148
return finddict(id); /* qsort changed order!! */ ;
138149
}
139150

@@ -190,17 +201,32 @@ lexize(PG_FUNCTION_ARGS)
190201
*ptr;
191202
Datum *da;
192203
ArrayType *a;
204+
DictSubState dstate = { false, false, NULL };
193205

194206
SET_FUNCOID();
195207
dict = finddict(PG_GETARG_OID(0));
196208

197209
ptr = res = (TSLexeme *) DatumGetPointer(
198-
FunctionCall3(&(dict->lexize_info),
210+
FunctionCall4(&(dict->lexize_info),
211+
PointerGetDatum(dict->dictionary),
212+
PointerGetDatum(VARDATA(in)),
213+
Int32GetDatum(VARSIZE(in) - VARHDRSZ),
214+
PointerGetDatum(&dstate)
215+
)
216+
);
217+
218+
if (dstate.getnext) {
219+
dstate.isend = true;
220+
ptr = res = (TSLexeme *) DatumGetPointer(
221+
FunctionCall4(&(dict->lexize_info),
199222
PointerGetDatum(dict->dictionary),
200223
PointerGetDatum(VARDATA(in)),
201-
Int32GetDatum(VARSIZE(in) - VARHDRSZ)
224+
Int32GetDatum(VARSIZE(in) - VARHDRSZ),
225+
PointerGetDatum(&dstate)
202226
)
203227
);
228+
}
229+
204230
PG_FREE_IF_COPY(in, 1);
205231
if (!res)
206232
{

contrib/tsearch2/dict.h

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.6 2006/03/11 04:38:30 momjian Exp $ */
1+
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.7 2006/05/31 14:05:31 teodor Exp $ */
22

33
#ifndef __DICT_H__
44
#define __DICT_H__
55
#include "postgres.h"
66
#include "fmgr.h"
7+
#include "ts_cfg.h"
78

89
typedef struct
910
{
@@ -29,6 +30,11 @@ DictInfo *finddict(Oid id);
2930
Oid name2id_dict(text *name);
3031
void reset_dict(void);
3132

33+
/*
 * State exchanged between a caller and a dictionary's lexize function, for
 * dictionaries that may need more than one input lexeme to produce a result.
 */
typedef struct {
34+
bool isend; /* in: set by the caller to tell the lexize function that the end of the text has been reached */
35+
bool getnext; /* out: set by the dictionary when it wants the next lexeme */
36+
void *private; /* internal dictionary state kept between calls while getnext == true */
37+
} DictSubState;
3238

3339
/* simple parser of cfg string */
3440
typedef struct
@@ -45,17 +51,61 @@ typedef struct
4551
/*
4652
* number of the variant of a split word; for example, the word 'fotballklubber'
4753
* (Norwegian) has two variants of splitting: ( fotball, klubb ) and ( fot,
48-
* ball, klubb ). So, dictionary should return: nvariant lexeme 1
49-
* fotball 1 klubb 2 fot 2 ball 2 klubb
50-
*
54+
* ball, klubb ). So, dictionary should return:
55+
* nvariant lexeme
56+
* 1 fotball
57+
* 1 klubb
58+
* 2 fot
59+
* 2 ball
60+
* 2 klubb
5161
*/
5262
uint16 nvariant;
5363

54-
/* currently unused */
5564
uint16 flags;
5665

5766
/* C-string */
5867
char *lexeme;
5968
} TSLexeme;
6069

70+
#define TSL_ADDPOS 0x01
71+
72+
73+
/*
74+
* Lexize subsystem
75+
*/
76+
77+
/*
 * One parsed lexeme, stored as a node of a singly linked list (see next).
 */
typedef struct ParsedLex {
78+
int type; /* NOTE(review): presumably the token type assigned by the parser -- confirm */
79+
char *lemm; /* NOTE(review): presumably the lexeme text, with its length in lenlemm -- confirm */
80+
int lenlemm;
81+
bool resfollow; /* NOTE(review): meaning is not evident from this header -- please document */
82+
struct ParsedLex *next; /* next node in the list */
83+
} ParsedLex;
84+
85+
typedef struct ListParsedLex {
86+
ParsedLex *head;
87+
ParsedLex *tail;
88+
} ListParsedLex;
89+
90+
/*
 * Working state of the lexize subsystem; it is the object operated on by
 * LexizeInit(), LexizeAddLemm() and LexizeExec().
 */
typedef struct {
91+
TSCfgInfo *cfg;
92+
Oid curDictId; /* NOTE(review): presumably the dictionary currently being consulted -- confirm */
93+
int posDict;
94+
DictSubState dictState; /* state passed to a dictionary that asks for further lexemes */
95+
ParsedLex *curSub;
96+
ListParsedLex towork; /* current list of lexemes to work on */
97+
ListParsedLex waste; /* list of lexemes that have already been lexized */
98+
99+
/* fields that store the last variant to lexize (basically for a thesaurus,
100+
or a similar dictionary, which wants several lexemes) */
101+
102+
ParsedLex *lastRes;
103+
TSLexeme *tmpRes;
104+
} LexizeData;
105+
106+
107+
void LexizeInit(LexizeData *ld, TSCfgInfo *cfg);
108+
void LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm);
109+
TSLexeme* LexizeExec(LexizeData *ld, ParsedLex **correspondLexem);
110+
61111
#endif

0 commit comments

Comments
 (0)