Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit dde9457

Browse files
committed
Fix and improve compound word support. These changes cannot be applied to
a previous version without recreating tsvector fields. Thanks to Alexander Presber <aljoscha@weisshuhn.de> for discovering the problem.
1 parent 21e2544 commit dde9457

File tree

1 file changed

+75
-56
lines changed

1 file changed

+75
-56
lines changed

contrib/tsearch2/ispell/spell.c

+75-56
Original file line numberDiff line numberDiff line change
@@ -737,9 +737,9 @@ NISortAffixes(IspellDict * Conf)
737737
{
738738
if (firstsuffix < 0)
739739
firstsuffix = i;
740-
if (Affix->flagflags & FF_COMPOUNDONLYAFX)
740+
if ((Affix->flagflags & FF_COMPOUNDONLYAFX) && Affix->replen>0 )
741741
{
742-
if (!ptr->affix ||
742+
if (ptr == Conf->CompoundAffix ||
743743
strbncmp((const unsigned char *) (ptr - 1)->affix,
744744
(const unsigned char *) Affix->repl,
745745
(ptr - 1)->len))
@@ -1024,17 +1024,31 @@ typedef struct SplitVar
10241024
} SplitVar;
10251025

10261026
static int
1027-
CheckCompoundAffixes(CMPDAffix ** ptr, char *word, int len)
1027+
CheckCompoundAffixes(CMPDAffix ** ptr, char *word, int len, bool CheckInPlace)
10281028
{
1029-
while ((*ptr)->affix)
1030-
{
1031-
if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
1029+
if ( CheckInPlace ) {
1030+
while ((*ptr)->affix)
1031+
{
1032+
if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
1033+
{
1034+
len = (*ptr)->len;
1035+
(*ptr)++;
1036+
return len;
1037+
}
1038+
(*ptr)++;
1039+
}
1040+
} else {
1041+
char *affbegin;
1042+
while ((*ptr)->affix)
10321043
{
1033-
len = (*ptr)->len;
1044+
if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
1045+
{
1046+
len = (*ptr)->len + (affbegin-word);
1047+
(*ptr)++;
1048+
return len;
1049+
}
10341050
(*ptr)++;
1035-
return len;
10361051
}
1037-
(*ptr)++;
10381052
}
10391053
return 0;
10401054
}
@@ -1078,26 +1092,11 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
10781092
memset(notprobed, 1, wordlen);
10791093
var = CopyVar(orig, 1);
10801094

1081-
while (node && level < wordlen)
1095+
while (level < wordlen)
10821096
{
1083-
StopLow = node->data;
1084-
StopHigh = node->data + node->length;
1085-
while (StopLow < StopHigh)
1086-
{
1087-
StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1088-
if (StopMiddle->val == ((uint8 *) (word))[level])
1089-
break;
1090-
else if (StopMiddle->val < ((uint8 *) (word))[level])
1091-
StopLow = StopMiddle + 1;
1092-
else
1093-
StopHigh = StopMiddle;
1094-
}
1095-
if (StopLow >= StopHigh)
1096-
break;
1097-
1098-
/* find word with epenthetic */
1097+
/* find word with epenthetic or/and compound suffix */
10991098
caff = Conf->CompoundAffix;
1100-
while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level)) > 0)
1099+
while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) > 0)
11011100
{
11021101
/*
11031102
* there is one of compound suffixes, so check word for existings
@@ -1143,41 +1142,61 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
11431142
}
11441143
}
11451144

1146-
/* find infinitive */
1147-
if (StopMiddle->isword && StopMiddle->compoundallow && notprobed[level])
1145+
if ( !node )
1146+
break;
1147+
1148+
StopLow = node->data;
1149+
StopHigh = node->data + node->length;
1150+
while (StopLow < StopHigh)
11481151
{
1149-
/* ok, we found full compoundallowed word */
1150-
if (level > minpos)
1152+
StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1153+
if (StopMiddle->val == ((uint8 *) (word))[level])
1154+
break;
1155+
else if (StopMiddle->val < ((uint8 *) (word))[level])
1156+
StopLow = StopMiddle + 1;
1157+
else
1158+
StopHigh = StopMiddle;
1159+
}
1160+
1161+
if (StopLow < StopHigh) {
1162+
1163+
/* find infinitive */
1164+
if (StopMiddle->isword && StopMiddle->compoundallow && notprobed[level])
11511165
{
1152-
/* and its length more than minimal */
1153-
if (wordlen == level + 1)
1154-
{
1155-
/* well, it was last word */
1156-
var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
1157-
var->nstem++;
1158-
pfree(notprobed);
1159-
return var;
1160-
}
1161-
else
1166+
/* ok, we found full compoundallowed word */
1167+
if (level > minpos)
11621168
{
1163-
/* then we will search more big word at the same point */
1164-
SplitVar *ptr = var;
1165-
1166-
while (ptr->next)
1167-
ptr = ptr->next;
1168-
ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
1169-
/* we can find next word */
1170-
level++;
1171-
var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
1172-
var->nstem++;
1173-
node = Conf->Dictionary;
1174-
startpos = level;
1175-
continue;
1169+
/* and its length more than minimal */
1170+
if (wordlen == level + 1)
1171+
{
1172+
/* well, it was last word */
1173+
var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
1174+
var->nstem++;
1175+
pfree(notprobed);
1176+
return var;
1177+
}
1178+
else
1179+
{
1180+
/* then we will search more big word at the same point */
1181+
SplitVar *ptr = var;
1182+
1183+
while (ptr->next)
1184+
ptr = ptr->next;
1185+
ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
1186+
/* we can find next word */
1187+
level++;
1188+
var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
1189+
var->nstem++;
1190+
node = Conf->Dictionary;
1191+
startpos = level;
1192+
continue;
1193+
}
11761194
}
11771195
}
1178-
}
1196+
node = StopMiddle->node;
1197+
} else
1198+
node = NULL;
11791199
level++;
1180-
node = StopMiddle->node;
11811200
}
11821201

11831202
var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);

0 commit comments

Comments
 (0)