Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 92bcb5a

Browse files
committed
Allow words in a substitution to be marked so that they are not lexized.
Docs will be submitted somewhat later; for now they are at http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
1 parent 63e464a commit 92bcb5a

File tree

2 files changed

+69
-30
lines changed

2 files changed

+69
-30
lines changed

contrib/tsearch2/dict_thesaurus.c

+60-23
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.4 2006/06/02 18:03:06 teodor Exp $ */
1+
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.5 2006/06/06 16:25:55 teodor Exp $ */
22

33
/*
44
* thesaurus
@@ -13,6 +13,11 @@
1313
#include "common.h"
1414
#include "ts_locale.h"
1515

16+
/*
17+
* Temporarily we use TSLexeme.flags for internal use...
18+
*/
19+
#define DT_USEASIS 0x1000
20+
1621
typedef struct LexemeInfo {
1722
uint16 idsubst; /* entry's number in DictThesaurus->subst */
1823
uint16 posinsubst; /* pos info in entry */
@@ -94,7 +99,7 @@ newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst
9499
}
95100

96101
static void
97-
addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst ) {
102+
addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis ) {
98103
static int nres=0;
99104
static int ntres = 0;
100105
TheSubstitute *ptr;
@@ -138,7 +143,10 @@ addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16
138143
ptr->res[ nres ].lexeme[e-b] = '\0';
139144

140145
ptr->res[ nres ].nvariant = nwrd;
141-
ptr->res[ nres ].flags = TSL_ADDPOS;
146+
if ( useasis )
147+
ptr->res[ nres ].flags = DT_USEASIS;
148+
else
149+
ptr->res[ nres ].flags = 0;
142150

143151
ptr->res[ ++nres ].lexeme = NULL;
144152
}
@@ -154,6 +162,7 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
154162
char str[BUFSIZ];
155163
int lineno=0;
156164
uint16 idsubst = 0;
165+
bool useasis=false;
157166

158167
fh = fopen(to_absfilename(filename), "r");
159168
if (!fh)
@@ -196,13 +205,24 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
196205
state = TR_WAITLEX;
197206
}
198207
} else if ( state == TR_WAITSUBS ) {
199-
if ( !t_isspace(ptr) ) {
208+
if ( t_iseq(ptr, '*') ) {
209+
useasis = true;
210+
state = TR_INSUBS;
211+
beginwrd = ptr + pg_mblen(ptr);
212+
} else if ( t_iseq(ptr, '\\') ) {
213+
useasis = false;
214+
state = TR_INSUBS;
215+
beginwrd = ptr + pg_mblen(ptr);
216+
} else if ( !t_isspace(ptr) ) {
217+
useasis = false;
200218
beginwrd = ptr;
201219
state = TR_INSUBS;
202220
}
203221
} else if ( state == TR_INSUBS ) {
204222
if ( t_isspace(ptr) ) {
205-
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
223+
if ( ptr == beginwrd )
224+
elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno);
225+
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis );
206226
state = TR_WAITSUBS;
207227
}
208228
} else
@@ -211,8 +231,11 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
211231
ptr += pg_mblen(ptr);
212232
}
213233

214-
if ( state == TR_INSUBS )
215-
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst );
234+
if ( state == TR_INSUBS ) {
235+
if ( ptr == beginwrd )
236+
elog(ERROR, "Thesaurus: Unexpected end of line or lexeme at %d line", lineno);
237+
addWrd( d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis );
238+
}
216239

217240
idsubst++;
218241

@@ -319,7 +342,9 @@ compileTheLexeme(DictThesaurus *d) {
319342
elog(ERROR,"Out of memory");
320343

321344
for(i=0;i<d->nwrds;i++) {
322-
TSLexeme *ptr = (TSLexeme*) DatumGetPointer(
345+
TSLexeme *ptr;
346+
347+
ptr = (TSLexeme*) DatumGetPointer(
323348
FunctionCall4(
324349
&(d->subdict.lexize_info),
325350
PointerGetDatum(d->subdict.dictionary),
@@ -331,9 +356,11 @@ compileTheLexeme(DictThesaurus *d) {
331356

332357
if ( !(ptr && ptr->lexeme) ) {
333358
if ( !ptr )
334-
elog(ERROR,"Thesaurus: word '%s' isn't recognized by subdictionary", d->wrds[i].lexeme);
359+
elog(ERROR,"Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)",
360+
d->wrds[i].lexeme, d->wrds[i].entries->idsubst+1 );
335361
else
336-
elog(NOTICE,"Thesaurus: word '%s' is recognized as stop-word, assign any stop-word", d->wrds[i].lexeme);
362+
elog(NOTICE,"Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)",
363+
d->wrds[i].lexeme, d->wrds[i].entries->idsubst+1);
337364

338365
newwrds = addCompiledLexeme( newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
339366
} else {
@@ -413,17 +440,25 @@ compileTheSubstitute(DictThesaurus *d) {
413440
inptr = rem;
414441

415442
while( inptr && inptr->lexeme ) {
416-
TSLexeme *reml, *lexized = (TSLexeme*) DatumGetPointer(
417-
FunctionCall4(
418-
&(d->subdict.lexize_info),
419-
PointerGetDatum(d->subdict.dictionary),
420-
PointerGetDatum(inptr->lexeme),
421-
Int32GetDatum(strlen(inptr->lexeme)),
422-
PointerGetDatum(NULL)
423-
)
424-
);
443+
TSLexeme *lexized, tmplex[2];
444+
445+
if ( inptr->flags & DT_USEASIS ) { /* do not lexize */
446+
tmplex[0] = *inptr;
447+
tmplex[0].flags = 0;
448+
tmplex[1].lexeme = NULL;
449+
lexized = tmplex;
450+
} else {
451+
lexized = (TSLexeme*) DatumGetPointer(
452+
FunctionCall4(
453+
&(d->subdict.lexize_info),
454+
PointerGetDatum(d->subdict.dictionary),
455+
PointerGetDatum(inptr->lexeme),
456+
Int32GetDatum(strlen(inptr->lexeme)),
457+
PointerGetDatum(NULL)
458+
)
459+
);
460+
}
425461

426-
reml = lexized;
427462
if ( lexized && lexized->lexeme ) {
428463
int toset = (lexized->lexeme && outptr != d->subst[i].res ) ? (outptr - d->subst[i].res) : -1;
429464

@@ -447,8 +482,10 @@ compileTheSubstitute(DictThesaurus *d) {
447482

448483
if ( toset > 0)
449484
d->subst[i].res[toset].flags |= TSL_ADDPOS;
485+
} else if ( lexized ) {
486+
elog(NOTICE,"Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)", inptr->lexeme, i+1);
450487
} else {
451-
elog(NOTICE,"Thesaurus: word '%s' isn't recognized by subdictionary or it's a stop-word, ignored", inptr->lexeme);
488+
elog(ERROR,"Thesaurus: word '%s' in substition isn't recognized (rule %d)", inptr->lexeme, i+1);
452489
}
453490

454491
if ( inptr->lexeme )
@@ -457,7 +494,7 @@ compileTheSubstitute(DictThesaurus *d) {
457494
}
458495

459496
if ( outptr == d->subst[i].res )
460-
elog(ERROR,"Thesaurus: all words in subsitution aren't recognized by subdictionary");
497+
elog(ERROR,"Thesaurus: all words in subsitution are stop word (rule %d)", i+1);
461498

462499
d->subst[i].reslen = outptr - d->subst[i].res;
463500

@@ -717,7 +754,7 @@ thesaurus_lexize(PG_FUNCTION_ARGS)
717754

718755
infos = (LexemeInfo**)palloc(sizeof(LexemeInfo*)*nlex);
719756
for(i=0;i<nlex;i++)
720-
if ( (infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL )
757+
if ( (infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL )
721758
break;
722759

723760
if ( i<nlex ) {

contrib/tsearch2/thesaurus

+9-7
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
#
22
# Thesaurus config file. Character ':' splits
3-
# string to part:
4-
# to be substituted string
5-
# substituting string
3+
# string into parts, example:
4+
# sample-words : substitute-words
65
#
6+
# Any substitute-word can be marked with a preceding '*' character,
7+
# which means that this word must not be lexized
8+
# Docs: http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
79

8-
#one two three : 123
9-
#one two : 12
10-
#one : 1
11-
#two : 2
10+
#one two three : *123
11+
#one two : *12
12+
#one : *1
13+
#two : *2
1214

1315
#foo bar : blah blah
1416
#f bar : fbar

0 commit comments

Comments
 (0)