Support 3 and 4-byte unicode characters.

bmomjian · bmomjian · commit 5955945828d9 · 2005-06-15T00:15:08.000Z
John Hansen
diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.52 2005/03/07 04:30:52 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.53 2005/06/15 00:15:08 momjian Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -361,12 +361,19 @@ UtfToLocal(unsigned char *utf, unsigned char *iso,
 			iutf = *utf++ << 8;
 			iutf |= *utf++;
 		}
-		else
+		else if (l == 3)
 		{
 			iutf = *utf++ << 16;
 			iutf |= *utf++ << 8;
 			iutf |= *utf++;
 		}
+		else if (l == 4)
+		{
+			iutf = *utf++ << 24;
+			iutf |= *utf++ << 16;
+			iutf |= *utf++ << 8;
+			iutf |= *utf++;
+		}
 		p = bsearch(&iutf, map, size,
 					sizeof(pg_utf_to_local), compare1);
 		if (p == NULL)
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
@@ -1,7 +1,7 @@
 /*
  * conversion functions between pg_wchar and multibyte streams.
  * Tatsuo Ishii
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.43 2005/03/14 18:31:20 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.44 2005/06/15 00:15:08 momjian Exp $
  *
  * WIN1250 client encoding updated by Pavel Behal
  *
@@ -406,8 +406,14 @@ pg_utf_mblen(const unsigned char *s)
 		len = 1;
 	else if ((*s & 0xe0) == 0xc0)
 		len = 2;
-	else if ((*s & 0xe0) == 0xe0)
-		len = 3;
+        else if ((*s & 0xf0) == 0xe0)
+                len = 3;
+        else if ((*s & 0xf8) == 0xf0)
+                len = 4;
+        else if ((*s & 0xfc) == 0xf8)
+                len = 5;
+        else if ((*s & 0xfe) == 0xfc)
+                len = 6;
 	return (len);
 }
 
@@ -721,7 +727,7 @@ pg_wchar_tbl pg_wchar_table[] = {
 	{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3},		/* 3; PG_EUC_KR */
 	{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3},		/* 4; PG_EUC_TW */
 	{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3},		/* 5; PG_JOHAB */
-	{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3},			/* 6; PG_UTF8 */
+	{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 4},		/* 6; PG_UTF8 */
 	{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, 		/* 7; PG_MULE_INTERNAL */
 	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 8; PG_LATIN1 */
 	{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},	/* 9; PG_LATIN2 */
@@ -800,6 +806,31 @@ pg_encoding_max_length(int encoding)
 
 #ifndef FRONTEND
 
+bool pg_utf8_islegal(const unsigned char *source, int length) {
+    unsigned char a;
+    const unsigned char *srcptr = source+length;
+    switch (length) {
+       default: return false;
+        /* Everything else falls through when "true"... */
+       case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+       case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+       case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+        switch (*source) {
+            /* no fall-through in this inner switch */
+            case 0xE0: if (a < 0xA0) return false; break;
+            case 0xED: if (a > 0x9F) return false; break;
+            case 0xF0: if (a < 0x90) return false; break;
+            case 0xF4: if (a > 0x8F) return false; break;
+            default:   if (a < 0x80) return false;
+        }
+
+    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+    }
+    if (*source > 0xF4) return false;
+    return true;
+}
+
+
 /*
  * Verify mbstr to make sure that it has a valid character sequence.
  * mbstr is not necessarily NULL terminated; length of mbstr is
@@ -823,51 +854,47 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
 
 	while (len > 0 && *mbstr)
 	{
-		/* special UTF8 check */
-		if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
-		{
-			if (noError)
-				return false;
-			ereport(ERROR,
-					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-					 errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
-		}
-
 		l = pg_mblen(mbstr);
-
-		for (i = 1; i < l; i++)
-		{
-			/*
-			 * we expect that every multibyte char consists of bytes
-			 * having the 8th bit set
-			 */
-			if (i >= len || (mbstr[i] & 0x80) == 0)
+		
+		/* special UTF-8 check */
+		if (encoding == PG_UTF8) {
+            		if(!pg_utf8_islegal(mbstr,l)) {
+                    		if (noError) return false;
+				ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near byte %c",*mbstr)));
+			}
+		} else {
+			for (i = 1; i < l; i++)
 			{
-				char		buf[8 * 2 + 1];
-				char	   *p = buf;
-				int			j,
+                        	/*
+                    		* we expect that every multibyte char consists of bytes
+                                * having the 8th bit set
+                                */
+                    		if (i >= len || (mbstr[i] & 0x80) == 0)
+                        	{
+                            		char		buf[8 * 2 + 1];
+                                        char		*p = buf;
+                                        int		j,
 							jlimit;
 
-				if (noError)
-					return false;
+					if (noError)
+						return false;
 
-				jlimit = Min(l, len);
-				jlimit = Min(jlimit, 8);		/* prevent buffer overrun */
+					jlimit = Min(l, len);
+					jlimit = Min(jlimit, 8);		/* prevent buffer overrun */
 
-				for (j = 0; j < jlimit; j++)
-					p += sprintf(p, "%02x", mbstr[j]);
+					for (j = 0; j < jlimit; j++)
+						p += sprintf(p, "%02x", mbstr[j]);
 
-				ereport(ERROR,
-						(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-				errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
-					   GetDatabaseEncodingName(), buf)));
+					ereport(ERROR,
+							(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+					errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
+						GetDatabaseEncodingName(), buf)));
+				}
 			}
 		}
-
 		len -= l;
 		mbstr += l;
 	}
-
 	return true;
 }
 
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
@@ -1,4 +1,4 @@
-/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.58 2005/03/14 18:31:24 momjian Exp $ */
+/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.59 2005/06/15 00:15:08 momjian Exp $ */
 
 #ifndef PG_WCHAR_H
 #define PG_WCHAR_H
@@ -340,4 +340,6 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc);
 extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
 extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
 
+extern bool pg_utf8_islegal(const unsigned char *source, int length);
+
 #endif   /* PG_WCHAR_H */

Original file line number	Diff line number	Diff line change
`@@ -6,7 +6,7 @@`
`6`	`6`	`* Portions Copyright (c) 1994, Regents of the University of California`
`7`	`7`	`*`
`8`	`8`	`* IDENTIFICATION`
`9`		`- * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.52 2005/03/07 04:30:52 momjian Exp $`
	`9`	`+ * $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.53 2005/06/15 00:15:08 momjian Exp $`
`10`	`10`	`*`
`11`	`11`	`*-------------------------------------------------------------------------`
`12`	`12`	`*/`
`@@ -361,12 +361,19 @@ UtfToLocal(unsigned char utf, unsigned char iso,`
`361`	`361`	`iutf = *utf++ << 8;`
`362`	`362`	`iutf \|= *utf++;`
`363`	`363`	`}`
`364`		`- else`
	`364`	`+ else if (l == 3)`
`365`	`365`	`{`
`366`	`366`	`iutf = *utf++ << 16;`
`367`	`367`	`iutf \|= *utf++ << 8;`
`368`	`368`	`iutf \|= *utf++;`
`369`	`369`	`}`
	`370`	`+ else if (l == 4)`
	`371`	`+ {`
	`372`	`+ iutf = *utf++ << 24;`
	`373`	`+ iutf \|= *utf++ << 16;`
	`374`	`+ iutf \|= *utf++ << 8;`
	`375`	`+ iutf \|= *utf++;`
	`376`	`+ }`
`370`	`377`	`p = bsearch(&iutf, map, size,`
`371`	`378`	`sizeof(pg_utf_to_local), compare1);`
`372`	`379`	`if (p == NULL)`