1
- /* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.4 2006/06/02 18:03:06 teodor Exp $ */
1
+ /* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.5 2006/06/06 16:25:55 teodor Exp $ */
2
2
3
3
/*
4
4
* thesaurus
13
13
#include "common.h"
14
14
#include "ts_locale.h"
15
15
16
+ /*
17
+ * Temporarily, we use TSLexeme.flags for internal purposes...
18
+ */
19
+ #define DT_USEASIS 0x1000
20
+
16
21
typedef struct LexemeInfo {
17
22
uint16 idsubst ; /* entry's number in DictThesaurus->subst */
18
23
uint16 posinsubst ; /* pos info in entry */
@@ -94,7 +99,7 @@ newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst
94
99
}
95
100
96
101
static void
97
- addWrd ( DictThesaurus * d , char * b , char * e , uint16 idsubst , uint16 nwrd , uint16 posinsubst ) {
102
+ addWrd ( DictThesaurus * d , char * b , char * e , uint16 idsubst , uint16 nwrd , uint16 posinsubst , bool useasis ) {
98
103
static int nres = 0 ;
99
104
static int ntres = 0 ;
100
105
TheSubstitute * ptr ;
@@ -138,7 +143,10 @@ addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16
138
143
ptr -> res [ nres ].lexeme [e - b ] = '\0' ;
139
144
140
145
ptr -> res [ nres ].nvariant = nwrd ;
141
- ptr -> res [ nres ].flags = TSL_ADDPOS ;
146
+ if ( useasis )
147
+ ptr -> res [ nres ].flags = DT_USEASIS ;
148
+ else
149
+ ptr -> res [ nres ].flags = 0 ;
142
150
143
151
ptr -> res [ ++ nres ].lexeme = NULL ;
144
152
}
@@ -154,6 +162,7 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
154
162
char str [BUFSIZ ];
155
163
int lineno = 0 ;
156
164
uint16 idsubst = 0 ;
165
+ bool useasis = false;
157
166
158
167
fh = fopen (to_absfilename (filename ), "r" );
159
168
if (!fh )
@@ -196,13 +205,24 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
196
205
state = TR_WAITLEX ;
197
206
}
198
207
} else if ( state == TR_WAITSUBS ) {
199
- if ( !t_isspace (ptr ) ) {
208
+ if ( t_iseq (ptr , '*' ) ) {
209
+ useasis = true;
210
+ state = TR_INSUBS ;
211
+ beginwrd = ptr + pg_mblen (ptr );
212
+ } else if ( t_iseq (ptr , '\\' ) ) {
213
+ useasis = false;
214
+ state = TR_INSUBS ;
215
+ beginwrd = ptr + pg_mblen (ptr );
216
+ } else if ( !t_isspace (ptr ) ) {
217
+ useasis = false;
200
218
beginwrd = ptr ;
201
219
state = TR_INSUBS ;
202
220
}
203
221
} else if ( state == TR_INSUBS ) {
204
222
if ( t_isspace (ptr ) ) {
205
- addWrd ( d , beginwrd , ptr , idsubst , nwrd ++ , posinsubst );
223
+ if ( ptr == beginwrd )
224
+ elog (ERROR , "Thesaurus: Unexpected end of line or lexeme at %d line" , lineno );
225
+ addWrd ( d , beginwrd , ptr , idsubst , nwrd ++ , posinsubst , useasis );
206
226
state = TR_WAITSUBS ;
207
227
}
208
228
} else
@@ -211,8 +231,11 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
211
231
ptr += pg_mblen (ptr );
212
232
}
213
233
214
- if ( state == TR_INSUBS )
215
- addWrd ( d , beginwrd , ptr , idsubst , nwrd ++ , posinsubst );
234
+ if ( state == TR_INSUBS ) {
235
+ if ( ptr == beginwrd )
236
+ elog (ERROR , "Thesaurus: Unexpected end of line or lexeme at %d line" , lineno );
237
+ addWrd ( d , beginwrd , ptr , idsubst , nwrd ++ , posinsubst , useasis );
238
+ }
216
239
217
240
idsubst ++ ;
218
241
@@ -319,7 +342,9 @@ compileTheLexeme(DictThesaurus *d) {
319
342
elog (ERROR ,"Out of memory" );
320
343
321
344
for (i = 0 ;i < d -> nwrds ;i ++ ) {
322
- TSLexeme * ptr = (TSLexeme * ) DatumGetPointer (
345
+ TSLexeme * ptr ;
346
+
347
+ ptr = (TSLexeme * ) DatumGetPointer (
323
348
FunctionCall4 (
324
349
& (d -> subdict .lexize_info ),
325
350
PointerGetDatum (d -> subdict .dictionary ),
@@ -331,9 +356,11 @@ compileTheLexeme(DictThesaurus *d) {
331
356
332
357
if ( !(ptr && ptr -> lexeme ) ) {
333
358
if ( !ptr )
334
- elog (ERROR ,"Thesaurus: word '%s' isn't recognized by subdictionary" , d -> wrds [i ].lexeme );
359
+ elog (ERROR ,"Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)" ,
360
+ d -> wrds [i ].lexeme , d -> wrds [i ].entries -> idsubst + 1 );
335
361
else
336
- elog (NOTICE ,"Thesaurus: word '%s' is recognized as stop-word, assign any stop-word" , d -> wrds [i ].lexeme );
362
+ elog (NOTICE ,"Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)" ,
363
+ d -> wrds [i ].lexeme , d -> wrds [i ].entries -> idsubst + 1 );
337
364
338
365
newwrds = addCompiledLexeme ( newwrds , & nnw , & tnm , NULL , d -> wrds [i ].entries , 0 );
339
366
} else {
@@ -413,17 +440,25 @@ compileTheSubstitute(DictThesaurus *d) {
413
440
inptr = rem ;
414
441
415
442
while ( inptr && inptr -> lexeme ) {
416
- TSLexeme * reml , * lexized = (TSLexeme * ) DatumGetPointer (
417
- FunctionCall4 (
418
- & (d -> subdict .lexize_info ),
419
- PointerGetDatum (d -> subdict .dictionary ),
420
- PointerGetDatum (inptr -> lexeme ),
421
- Int32GetDatum (strlen (inptr -> lexeme )),
422
- PointerGetDatum (NULL )
423
- )
424
- );
443
+ TSLexeme * lexized , tmplex [2 ];
444
+
445
+ if ( inptr -> flags & DT_USEASIS ) { /* do not lexize */
446
+ tmplex [0 ] = * inptr ;
447
+ tmplex [0 ].flags = 0 ;
448
+ tmplex [1 ].lexeme = NULL ;
449
+ lexized = tmplex ;
450
+ } else {
451
+ lexized = (TSLexeme * ) DatumGetPointer (
452
+ FunctionCall4 (
453
+ & (d -> subdict .lexize_info ),
454
+ PointerGetDatum (d -> subdict .dictionary ),
455
+ PointerGetDatum (inptr -> lexeme ),
456
+ Int32GetDatum (strlen (inptr -> lexeme )),
457
+ PointerGetDatum (NULL )
458
+ )
459
+ );
460
+ }
425
461
426
- reml = lexized ;
427
462
if ( lexized && lexized -> lexeme ) {
428
463
int toset = (lexized -> lexeme && outptr != d -> subst [i ].res ) ? (outptr - d -> subst [i ].res ) : -1 ;
429
464
@@ -447,8 +482,10 @@ compileTheSubstitute(DictThesaurus *d) {
447
482
448
483
if ( toset > 0 )
449
484
d -> subst [i ].res [toset ].flags |= TSL_ADDPOS ;
485
+ } else if ( lexized ) {
486
+ elog (NOTICE ,"Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)" , inptr -> lexeme , i + 1 );
450
487
} else {
451
- elog (NOTICE ,"Thesaurus: word '%s' isn't recognized by subdictionary or it's a stop-word, ignored " , inptr -> lexeme );
488
+ elog (ERROR ,"Thesaurus: word '%s' in substition isn't recognized (rule %d) " , inptr -> lexeme , i + 1 );
452
489
}
453
490
454
491
if ( inptr -> lexeme )
@@ -457,7 +494,7 @@ compileTheSubstitute(DictThesaurus *d) {
457
494
}
458
495
459
496
if ( outptr == d -> subst [i ].res )
460
- elog (ERROR ,"Thesaurus: all words in subsitution aren't recognized by subdictionary" );
497
+ elog (ERROR ,"Thesaurus: all words in subsitution are stop word (rule %d)" , i + 1 );
461
498
462
499
d -> subst [i ].reslen = outptr - d -> subst [i ].res ;
463
500
@@ -717,7 +754,7 @@ thesaurus_lexize(PG_FUNCTION_ARGS)
717
754
718
755
infos = (LexemeInfo * * )palloc (sizeof (LexemeInfo * )* nlex );
719
756
for (i = 0 ;i < nlex ;i ++ )
720
- if ( (infos [i ] = findTheLexeme (d , basevar [i ].lexeme )) == NULL )
757
+ if ( (infos [i ] = findTheLexeme (d , basevar [i ].lexeme )) == NULL )
721
758
break ;
722
759
723
760
if ( i < nlex ) {
0 commit comments