Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 51e78ab

Browse files
committed
Avoid use of sscanf() to parse ispell dictionary files.
It turns out that on FreeBSD-derived platforms (including OS X), the *scanf() family of functions is pretty much brain-dead about multibyte characters. In particular it will apply isspace() to individual bytes of input even when those bytes are part of a multibyte character, thus allowing false recognition of a field-terminating space.

We appear to have little alternative other than instituting a coding rule that *scanf() is not to be used if the input string might contain multibyte characters. (There was some discussion of relying on "%ls", but that probably just moves the portability problem somewhere else, and besides it doesn't fully prevent BSD *scanf() from using isspace().)

This patch is a down payment on that: it gets rid of use of sscanf() to parse ispell dictionary files, which are certainly at great risk of having a problem. The code is cleaner this way anyway, though a bit longer. In passing, improve a few comments.

Report and patch by Artur Zakirov, reviewed and somewhat tweaked by me. Back-patch to all supported branches.
1 parent c5e9b77 commit 51e78ab

File tree

1 file changed

+153
-13
lines changed

1 file changed

+153
-13
lines changed

src/backend/tsearch/spell.c

+153-13
Original file line numberDiff line numberDiff line change
@@ -457,13 +457,149 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
457457
Conf->naffixes++;
458458
}
459459

460+
461+
/* Parsing states for parse_affentry() and friends */
460462
#define PAE_WAIT_MASK 0
461-
#define PAE_INMASK 1
463+
#define PAE_INMASK 1
462464
#define PAE_WAIT_FIND 2
463-
#define PAE_INFIND 3
465+
#define PAE_INFIND 3
464466
#define PAE_WAIT_REPL 4
465-
#define PAE_INREPL 5
467+
#define PAE_INREPL 5
468+
#define PAE_WAIT_TYPE 6
469+
#define PAE_WAIT_FLAG 7
466470

471+
/*
 * Parse the next space-separated field of an .affix file line.
 *
 * *str is the input pointer.  It is advanced over any leading whitespace and
 * over the field itself; on return it points at the whitespace character that
 * ended the field, at the '#' that starts a comment, or at the terminating
 * NUL of the line.
 * next is where to copy the field value to, with null termination.
 *
 * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
 * Truncation happens on whole characters, and once one character has failed
 * to fit no later character is copied either, so the stored value is always
 * a prefix of the field (never a string with a character missing from the
 * middle).  The rest of an over-long field is still consumed from *str.
 *
 * Returns true if we found a nonempty field.  Returns false if only
 * whitespace, a '#' comment, or end of line remained; "next" is then set to
 * an empty string.
 */
static bool
get_nextfield(char **str, char *next)
{
	bool		in_field = false;	/* have we seen the field's first char? */
	int			avail = BUFSIZ; /* bytes left in "next", incl. terminator */

	for (; **str != '\0'; *str += pg_mblen(*str))
	{
		int			clen;

		if (!in_field)
		{
			/* A comment before any field data means there is no field */
			if (t_iseq(*str, '#'))
			{
				*next = '\0';
				return false;
			}

			/* Skip leading whitespace */
			if (t_isspace(*str))
				continue;

			in_field = true;
		}
		else if (t_isspace(*str))
		{
			/* Whitespace ends the field; leave *str pointing at it */
			*next = '\0';
			return true;
		}

		/*
		 * Copy the current character if it fits while still leaving room for
		 * the terminating NUL.  Otherwise stop copying for good, so that a
		 * later, narrower character cannot sneak in after a skipped one.
		 */
		clen = pg_mblen(*str);
		if (clen < avail)
		{
			COPYCHAR(next, *str);
			next += clen;
			avail -= clen;
		}
		else
			avail = 0;
	}

	*next = '\0';

	return in_field;			/* OK if we got a nonempty field */
}
532+
533+
/*
 * Parses entry of an .affix file of MySpell or Hunspell format.
 *
 * An .affix file entry has the following format:
 * - header
 *	 <type> <flag> <cross_flag> <flag_count>
 * - fields after header:
 *	 <type> <flag> <find> <replace> <mask>
 *
 * str is the input line
 * field values are returned to type etc, which must be buffers of size BUFSIZ.
 * The fields are filled strictly in the order type, flag, find, repl, mask.
 *
 * Parsing stops at the first of: end of line, get_nextfield() reporting that
 * no further field is present, or all five fields having been read.
 *
 * Returns number of fields found; any omitted fields are set to empty strings.
 */
static int
parse_ooaffentry(char *str, char *type, char *flag, char *find,
				 char *repl, char *mask)
{
	char	   *fields[5];
	const int	nfields = (int) (sizeof(fields) / sizeof(fields[0]));
	int			fields_read = 0;
	int			i;

	/* Output buffers, in the order the fields appear on the line */
	fields[0] = type;
	fields[1] = flag;
	fields[2] = find;
	fields[3] = repl;
	fields[4] = mask;

	/* Start with every field empty, so omitted ones are well-defined */
	for (i = 0; i < nfields; i++)
		*fields[i] = '\0';

	/* Pull fields one at a time until the line or the field list runs out */
	while (fields_read < nfields &&
		   *str != '\0' &&
		   get_nextfield(&str, fields[fields_read]))
		fields_read++;

	return fields_read;
}
596+
597+
/*
598+
* Parses entry of an .affix file of Ispell format
599+
*
600+
* An .affix file entry has the following format:
601+
* <mask> > [-<find>,]<replace>
602+
*/
467603
static bool
468604
parse_affentry(char *str, char *mask, char *find, char *repl)
469605
{
@@ -618,8 +754,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
618754
int flag = 0;
619755
char flagflags = 0;
620756
tsearch_readline_state trst;
621-
int scanread = 0;
622-
char scanbuf[BUFSIZ];
623757
char *recoded;
624758

625759
/* read file to find any flag */
@@ -682,8 +816,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
682816
}
683817
tsearch_readline_end(&trst);
684818

685-
sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
686-
687819
if (!tsearch_readline_begin(&trst, filename))
688820
ereport(ERROR,
689821
(errcode(ERRCODE_CONFIG_FILE_ERROR),
@@ -692,18 +824,21 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
692824

693825
while ((recoded = tsearch_readline(&trst)) != NULL)
694826
{
827+
int fields_read;
828+
695829
if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
696830
goto nextline;
697831

698-
scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
832+
fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
699833

700834
if (ptype)
701835
pfree(ptype);
702836
ptype = lowerstr_ctx(Conf, type);
703-
if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
837+
if (fields_read < 4 ||
838+
(STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
704839
goto nextline;
705840

706-
if (scanread == 4)
841+
if (fields_read == 4)
707842
{
708843
if (strlen(sflag) != 1)
709844
goto nextline;
@@ -722,9 +857,13 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
722857
if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
723858
goto nextline;
724859
prepl = lowerstr_ctx(Conf, repl);
725-
/* affix flag */
860+
/* Find position of '/' in lowercased string "prepl" */
726861
if ((ptr = strchr(prepl, '/')) != NULL)
727862
{
863+
/*
864+
* Here we use non-lowercased string "repl". We need position
865+
* of '/' in "repl".
866+
*/
728867
*ptr = '\0';
729868
ptr = repl + (ptr - prepl) + 1;
730869
while (*ptr)
@@ -800,11 +939,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
800939

801940
if (STRNCMP(pstr, "compoundwords") == 0)
802941
{
942+
/* Find position in lowercased string "pstr" */
803943
s = findchar(pstr, 'l');
804944
if (s)
805945
{
806-
s = recoded + (s - pstr); /* we need non-lowercased
807-
* string */
946+
/* Here we use non-lowercased string "recoded" */
947+
s = recoded + (s - pstr);
808948
while (*s && !t_isspace(s))
809949
s += pg_mblen(s);
810950
while (*s && t_isspace(s))

0 commit comments

Comments
 (0)