postgrespro
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
+121-64
diff --git a/src/backend/tsearch/ts_utils.c b/src/backend/tsearch/ts_utils.c
+2-2
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
+3-3
@@ -1,13 +1,13 @@
 /*-------------------------------------------------------------------------
  *
  * ts_locale.c
- *		locale compatiblility layer for tsearch
+ *		locale compatibility layer for tsearch
  *
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.3 2007/11/09 22:37:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -16,113 +16,174 @@
 #include "tsearch/ts_locale.h"
 #include "tsearch/ts_public.h"
 
-#ifdef TS_USE_WIDE
 
-#ifdef WIN32
+#ifdef TS_USE_WIDE
 
+/*
+ * wchar2char --- convert wide characters to multibyte format
+ *
+ * This has the same API as the standard wcstombs() function; in particular,
+ * tolen is the maximum number of bytes to store at *to, and *from should be
+ * zero-terminated.  The output will be zero-terminated iff there is room.
+ */
 size_t
-wchar2char(char *to, const wchar_t *from, size_t len)
+wchar2char(char *to, const wchar_t *from, size_t tolen)
 {
-	if (len == 0)
+	if (tolen == 0)
 		return 0;
 
+#ifdef WIN32
 	if (GetDatabaseEncoding() == PG_UTF8)
 	{
 		int			r;
 
-		r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
+		r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
 								NULL, NULL);
 
-		if (r == 0)
-			ereport(ERROR,
-					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-					 errmsg("UTF-16 to UTF-8 translation failed: %lu",
-							GetLastError())));
-		Assert(r <= len);
+		if (r <= 0)
+			return (size_t) -1;
+
+		Assert(r <= tolen);
 
-		return r;
+		/* Microsoft counts the zero terminator in the result */
+		return r-1;
 	}
+#endif   /* WIN32 */
 
-	return wcstombs(to, from, len);
+	return wcstombs(to, from, tolen);
 }
-#endif   /* WIN32 */
 
+/*
+ * char2wchar --- convert multibyte characters to wide characters
+ *
+ * This has almost the API of mbstowcs(), except that *from need not be
+ * null-terminated; instead, the number of input bytes is specified as
+ * fromlen.  Also, we ereport() rather than returning -1 for invalid
+ * input encoding.  tolen is the maximum number of wchar_t's to store at *to.
+ * The output will be zero-terminated iff there is room.
+ */
 size_t
-char2wchar(wchar_t *to, const char *from, size_t len)
+char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
 {
-	if (len == 0)
+	if (tolen == 0)
 		return 0;
 
 #ifdef WIN32
 	if (GetDatabaseEncoding() == PG_UTF8)
 	{
 		int			r;
 
-		r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
+		r = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen);
 
-		if (!r)
+		if (r <= 0)
 		{
-			pg_verifymbstr(from, len, false);
+			pg_verifymbstr(from, fromlen, false);
 			ereport(ERROR,
 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
 					 errmsg("invalid multibyte character for locale"),
 					 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
 		}
 
-		Assert(r <= len);
+		Assert(r <= tolen);
 
-		return r;
+		/* Microsoft counts the zero terminator in the result */
+		return r-1;
 	}
-	else
 #endif   /* WIN32 */
+
 	if (lc_ctype_is_c())
 	{
 		/*
 		 * pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
 		 * allocated with sufficient space
 		 */
-		return pg_mb2wchar_with_len(from, (pg_wchar *) to, len);
+		return pg_mb2wchar_with_len(from, (pg_wchar *) to, fromlen);
 	}
 	else
 	{
 		/*
-		 * mbstowcs require ending '\0'
+		 * mbstowcs requires ending '\0'
 		 */
-		char	   *str = pnstrdup(from, len);
-		size_t		tolen;
+		char	   *str = pnstrdup(from, fromlen);
+		size_t		result;
+
+		result = mbstowcs(to, str, tolen);
 
-		tolen = mbstowcs(to, str, len);
 		pfree(str);
 
-		return tolen;
+		if (result == (size_t) -1)
+		{
+			pg_verifymbstr(from, fromlen, false);
+			ereport(ERROR,
+					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+					 errmsg("invalid multibyte character for locale"),
+					 errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
+		}
+
+		if (result < tolen)
+			to[result] = 0;
+
+		return result;
 	}
 }
 
+
 int
-_t_isalpha(const char *ptr)
+t_isdigit(const char *ptr)
 {
+	int			clen = pg_mblen(ptr);
 	wchar_t		character[2];
 
-	if (lc_ctype_is_c())
+	if (clen == 1 || lc_ctype_is_c())
+		return isdigit(TOUCHAR(ptr));
+
+	char2wchar(character, 2, ptr, clen);
+
+	return iswdigit((wint_t) character[0]);
+}
+
+int
+t_isspace(const char *ptr)
+{
+	int			clen = pg_mblen(ptr);
+	wchar_t		character[2];
+
+	if (clen == 1 || lc_ctype_is_c())
+		return isspace(TOUCHAR(ptr));
+
+	char2wchar(character, 2, ptr, clen);
+
+	return iswspace((wint_t) character[0]);
+}
+
+int
+t_isalpha(const char *ptr)
+{
+	int			clen = pg_mblen(ptr);
+	wchar_t		character[2];
+
+	if (clen == 1 || lc_ctype_is_c())
 		return isalpha(TOUCHAR(ptr));
 
-	char2wchar(character, ptr, 1);
+	char2wchar(character, 2, ptr, clen);
 
-	return iswalpha((wint_t) *character);
+	return iswalpha((wint_t) character[0]);
 }
 
 int
-_t_isprint(const char *ptr)
+t_isprint(const char *ptr)
 {
+	int			clen = pg_mblen(ptr);
 	wchar_t		character[2];
 
-	if (lc_ctype_is_c())
+	if (clen == 1 || lc_ctype_is_c())
 		return isprint(TOUCHAR(ptr));
 
-	char2wchar(character, ptr, 1);
+	char2wchar(character, 2, ptr, clen);
 
-	return iswprint((wint_t) *character);
+	return iswprint((wint_t) character[0]);
 }
+
 #endif   /* TS_USE_WIDE */
 
 
@@ -168,19 +229,27 @@ t_readline(FILE *fp)
 	return recoded;
 }
 
+/*
+ * lowerstr --- fold null-terminated string to lower case
+ *
+ * Returned string is palloc'd
+ */
 char *
-lowerstr(char *str)
+lowerstr(const char *str)
 {
 	return lowerstr_with_len(str, strlen(str));
 }
 
 /*
+ * lowerstr_with_len --- fold string to lower case
+ *
+ * Input string need not be null-terminated.
+ *
  * Returned string is palloc'd
  */
 char *
-lowerstr_with_len(char *str, int len)
+lowerstr_with_len(const char *str, int len)
 {
-	char	   *ptr = str;
 	char	   *out;
 
 	if (len == 0)
@@ -202,23 +271,13 @@ lowerstr_with_len(char *str, int len)
 
 		/*
 		 * alloc number of wchar_t for worst case, len contains number of
-		 * bytes <= number of characters and alloc 1 wchar_t for 0, because
-		 * wchar2char(wcstombs in really) wants zero-terminated string
+		 * bytes >= number of characters and alloc 1 wchar_t for 0, because
+		 * wchar2char wants zero-terminated string
 		 */
 		wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
 
-		/*
-		 * str SHOULD be cstring, so wlen contains number of converted
-		 * character
-		 */
-		wlen = char2wchar(wstr, str, len);
-		if (wlen < 0)
-			ereport(ERROR,
-					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-			  errmsg("translation failed from server encoding to wchar_t")));
-
+		wlen = char2wchar(wstr, len+1, str, len);
 		Assert(wlen <= len);
-		wstr[wlen] = 0;
 
 		while (*wptr)
 		{
@@ -229,31 +288,29 @@ lowerstr_with_len(char *str, int len)
 		/*
 		 * Alloc result string for worst case + '\0'
 		 */
-		len = sizeof(char) * pg_database_encoding_max_length() *(wlen + 1);
+		len = pg_database_encoding_max_length() * wlen + 1;
 		out = (char *) palloc(len);
 
-		/*
-		 * wlen now is number of bytes which is always >= number of characters
-		 */
 		wlen = wchar2char(out, wstr, len);
+
 		pfree(wstr);
 
 		if (wlen < 0)
 			ereport(ERROR,
 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-					 errmsg("translation failed from wchar_t to server encoding %d", errno)));
-		Assert(wlen <= len);
-		out[wlen] = '\0';
+					 errmsg("translation from wchar_t to server encoding failed: %m")));
+		Assert(wlen < len);
 	}
 	else
-#endif
+#endif   /* TS_USE_WIDE */
 	{
+		const char *ptr = str;
 		char	   *outptr;
 
 		outptr = out = (char *) palloc(sizeof(char) * (len + 1));
-		while (*ptr && ptr - str < len)
+		while ((ptr - str) < len && *ptr)
 		{
-			*outptr++ = tolower(*(unsigned char *) ptr);
+			*outptr++ = tolower(TOUCHAR(ptr));
 			ptr++;
 		}
 		*outptr = '\0';
 
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.4 2007/09/04 02:16:56 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.5 2007/11/09 22:37:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -75,7 +75,7 @@ comparestr(const void *a, const void *b)
  * or palloc a new version.
  */
 void
-readstoplist(const char *fname, StopList *s, char *(*wordop) (char *))
+readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))
 {
 	char	  **stop = NULL;
 
 
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.7 2007/10/27 19:03:45 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.8 2007/11/09 22:37:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -294,12 +294,12 @@ TParserInit(char *str, int len)
 	/*
 	 * Use wide char code only when max encoding length > 1.
 	 */
-
 	if (prs->charmaxlen > 1)
 	{
 		prs->usewide = true;
 		prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
-		prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
+		prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
+								  prs->str, prs->lenstr);
 	}
 	else
 #endif
Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@`
`7`	`7`	`*`
`8`	`8`	`*`
`9`	`9`	`* IDENTIFICATION`
`10`		`- * $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.4 2007/09/04 02:16:56 tgl Exp $`
	`10`	`+ * $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.5 2007/11/09 22:37:35 tgl Exp $`
`11`	`11`	`*`
`12`	`12`	`*-------------------------------------------------------------------------`
`13`	`13`	`*/`
`@@ -75,7 +75,7 @@ comparestr(const void *a, const void *b)`
`75`	`75`	`* or palloc a new version.`
`76`	`76`	`*/`
`77`	`77`	`void`
`78`		`-readstoplist(const char *fname, StopList *s, char *(*wordop) (char *))`
	`78`	`+readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))`
`79`	`79`	`{`
`80`	`80`	`char **stop = NULL;`
`81`	`81`