Avoid use of sscanf() to parse ispell dictionary files.

tglsfdc · tglsfdc · commit 51e78ab4ff32 · 2016-02-10T19:30:11.000-05:00
It turns out that on FreeBSD-derived platforms (including OS X), the
*scanf() family of functions is pretty much brain-dead about multibyte
characters.  In particular it will apply isspace() to individual bytes
of input even when those bytes are part of a multibyte character, thus
allowing false recognition of a field-terminating space.

We appear to have little alternative other than instituting a coding
rule that *scanf() is not to be used if the input string might contain
multibyte characters.  (There was some discussion of relying on "%ls",
but that probably just moves the portability problem somewhere else,
and besides it doesn't fully prevent BSD *scanf() from using isspace().)

This patch is a down payment on that: it gets rid of use of sscanf()
to parse ispell dictionary files, which are certainly at great risk
of having a problem.  The code is cleaner this way anyway, though
a bit longer.

In passing, improve a few comments.

Report and patch by Artur Zakirov, reviewed and somewhat tweaked by me.
Back-patch to all supported branches.
diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c
@@ -457,13 +457,149 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
 	Conf->naffixes++;
 }
 
+
+/* Parsing states for parse_affentry() and friends */
 #define PAE_WAIT_MASK	0
-#define PAE_INMASK	1
+#define PAE_INMASK		1
 #define PAE_WAIT_FIND	2
-#define PAE_INFIND	3
+#define PAE_INFIND		3
 #define PAE_WAIT_REPL	4
-#define PAE_INREPL	5
+#define PAE_INREPL		5
+#define PAE_WAIT_TYPE	6
+#define PAE_WAIT_FLAG	7
 
+/*
+ * Parse next space-separated field of an .affix file line.
+ *
+ * *str is the input pointer (will be advanced past field)
+ * next is where to copy the field value to, with null termination
+ *
+ * The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
+ *
+ * Returns TRUE if we found a field, FALSE if not.
+ */
+static bool
+get_nextfield(char **str, char *next)
+{
+	int			state = PAE_WAIT_MASK;
+	int			avail = BUFSIZ;
+
+	while (**str)
+	{
+		if (state == PAE_WAIT_MASK)
+		{
+			if (t_iseq(*str, '#'))
+				return false;
+			else if (!t_isspace(*str))
+			{
+				int			clen = pg_mblen(*str);
+
+				if (clen < avail)
+				{
+					COPYCHAR(next, *str);
+					next += clen;
+					avail -= clen;
+				}
+				state = PAE_INMASK;
+			}
+		}
+		else	/* state == PAE_INMASK */
+		{
+			if (t_isspace(*str))
+			{
+				*next = '\0';
+				return true;
+			}
+			else
+			{
+				int			clen = pg_mblen(*str);
+
+				if (clen < avail)
+				{
+					COPYCHAR(next, *str);
+					next += clen;
+					avail -= clen;
+				}
+			}
+		}
+		*str += pg_mblen(*str);
+	}
+
+	*next = '\0';
+
+	return (state == PAE_INMASK);		/* OK if we got a nonempty field */
+}
+
+/*
+ * Parses entry of an .affix file of MySpell or Hunspell format.
+ *
+ * An .affix file entry has the following format:
+ * - header
+ *	 <type>  <flag>  <cross_flag>  <flag_count>
+ * - fields after header:
+ *	 <type>  <flag>  <find>  <replace>	<mask>
+ *
+ * str is the input line
+ * field values are returned to type etc, which must be buffers of size BUFSIZ.
+ *
+ * Returns number of fields found; any omitted fields are set to empty strings.
+ */
+static int
+parse_ooaffentry(char *str, char *type, char *flag, char *find,
+				 char *repl, char *mask)
+{
+	int			state = PAE_WAIT_TYPE;
+	int			fields_read = 0;
+	bool		valid = false;
+
+	*type = *flag = *find = *repl = *mask = '\0';
+
+	while (*str)
+	{
+		switch (state)
+		{
+			case PAE_WAIT_TYPE:
+				valid = get_nextfield(&str, type);
+				state = PAE_WAIT_FLAG;
+				break;
+			case PAE_WAIT_FLAG:
+				valid = get_nextfield(&str, flag);
+				state = PAE_WAIT_FIND;
+				break;
+			case PAE_WAIT_FIND:
+				valid = get_nextfield(&str, find);
+				state = PAE_WAIT_REPL;
+				break;
+			case PAE_WAIT_REPL:
+				valid = get_nextfield(&str, repl);
+				state = PAE_WAIT_MASK;
+				break;
+			case PAE_WAIT_MASK:
+				valid = get_nextfield(&str, mask);
+				state = -1;		/* force loop exit */
+				break;
+			default:
+				elog(ERROR, "unrecognized state in parse_ooaffentry: %d",
+					 state);
+				break;
+		}
+		if (valid)
+			fields_read++;
+		else
+			break;				/* early EOL */
+		if (state < 0)
+			break;				/* got all fields */
+	}
+
+	return fields_read;
+}
+
+/*
+ * Parses entry of an .affix file of Ispell format
+ *
+ * An .affix file entry has the following format:
+ * <mask>  >  [-<find>,]<replace>
+ */
 static bool
 parse_affentry(char *str, char *mask, char *find, char *repl)
 {
@@ -618,8 +754,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 	int			flag = 0;
 	char		flagflags = 0;
 	tsearch_readline_state trst;
-	int			scanread = 0;
-	char		scanbuf[BUFSIZ];
 	char	   *recoded;
 
 	/* read file to find any flag */
@@ -682,8 +816,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 	}
 	tsearch_readline_end(&trst);
 
-	sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
-
 	if (!tsearch_readline_begin(&trst, filename))
 		ereport(ERROR,
 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
@@ -692,18 +824,21 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 
 	while ((recoded = tsearch_readline(&trst)) != NULL)
 	{
+		int			fields_read;
+
 		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
 			goto nextline;
 
-		scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
+		fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
 
 		if (ptype)
 			pfree(ptype);
 		ptype = lowerstr_ctx(Conf, type);
-		if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
+		if (fields_read < 4 ||
+			(STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
 			goto nextline;
 
-		if (scanread == 4)
+		if (fields_read == 4)
 		{
 			if (strlen(sflag) != 1)
 				goto nextline;
@@ -722,9 +857,13 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
 			if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
 				goto nextline;
 			prepl = lowerstr_ctx(Conf, repl);
-			/* affix flag */
+			/* Find position of '/' in lowercased string "prepl" */
 			if ((ptr = strchr(prepl, '/')) != NULL)
 			{
+				/*
+				 * Here we use non-lowercased string "repl". We need position
+				 * of '/' in "repl".
+				 */
 				*ptr = '\0';
 				ptr = repl + (ptr - prepl) + 1;
 				while (*ptr)
@@ -800,11 +939,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
 
 		if (STRNCMP(pstr, "compoundwords") == 0)
 		{
+			/* Find position in lowercased string "pstr" */
 			s = findchar(pstr, 'l');
 			if (s)
 			{
-				s = recoded + (s - pstr);		/* we need non-lowercased
-												 * string */
+				/* Here we use non-lowercased string "recoded" */
+				s = recoded + (s - pstr);
 				while (*s && !t_isspace(s))
 					s += pg_mblen(s);
 				while (*s && t_isspace(s))