Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 5955945

Browse files
committed
Support 3 and 4-byte unicode characters.
John Hansen
1 parent f4c4f1c commit 5955945

File tree

3 files changed

+76
-40
lines changed

3 files changed

+76
-40
lines changed

src/backend/utils/mb/conv.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* Portions Copyright (c) 1994, Regents of the University of California
77
*
88
* IDENTIFICATION
9-
* $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.52 2005/03/07 04:30:52 momjian Exp $
9+
* $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.53 2005/06/15 00:15:08 momjian Exp $
1010
*
1111
*-------------------------------------------------------------------------
1212
*/
@@ -361,12 +361,19 @@ UtfToLocal(unsigned char *utf, unsigned char *iso,
361361
iutf = *utf++ << 8;
362362
iutf |= *utf++;
363363
}
364-
else
364+
else if (l == 3)
365365
{
366366
iutf = *utf++ << 16;
367367
iutf |= *utf++ << 8;
368368
iutf |= *utf++;
369369
}
370+
else if (l == 4)
371+
{
372+
iutf = *utf++ << 24;
373+
iutf |= *utf++ << 16;
374+
iutf |= *utf++ << 8;
375+
iutf |= *utf++;
376+
}
370377
p = bsearch(&iutf, map, size,
371378
sizeof(pg_utf_to_local), compare1);
372379
if (p == NULL)

src/backend/utils/mb/wchar.c

Lines changed: 64 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* conversion functions between pg_wchar and multibyte streams.
33
* Tatsuo Ishii
4-
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.43 2005/03/14 18:31:20 momjian Exp $
4+
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.44 2005/06/15 00:15:08 momjian Exp $
55
*
66
* WIN1250 client encoding updated by Pavel Behal
77
*
@@ -406,8 +406,14 @@ pg_utf_mblen(const unsigned char *s)
406406
len = 1;
407407
else if ((*s & 0xe0) == 0xc0)
408408
len = 2;
409-
else if ((*s & 0xe0) == 0xe0)
410-
len = 3;
409+
else if ((*s & 0xf0) == 0xe0)
410+
len = 3;
411+
else if ((*s & 0xf8) == 0xf0)
412+
len = 4;
413+
else if ((*s & 0xfc) == 0xf8)
414+
len = 5;
415+
else if ((*s & 0xfe) == 0xfc)
416+
len = 6;
411417
return (len);
412418
}
413419

@@ -721,7 +727,7 @@ pg_wchar_tbl pg_wchar_table[] = {
721727
{pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */
722728
{pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */
723729
{pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */
724-
{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UTF8 */
730+
{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 4}, /* 6; PG_UTF8 */
725731
{pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
726732
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */
727733
{pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */
@@ -800,6 +806,31 @@ pg_encoding_max_length(int encoding)
800806

801807
#ifndef FRONTEND
802808

809+
/*
 * pg_utf8_islegal
 *		Check whether the "length" bytes starting at "source" form a single
 *		legal UTF-8 encoded character.
 *
 * source: pointer to the first (lead) byte of the candidate sequence; at
 *		least "length" bytes must be readable there.
 * length: number of bytes in the candidate sequence.
 *
 * Returns true only for sequences of 1 to 4 bytes that are well formed:
 * every trailing byte is a continuation byte (0x80..0xBF), the lead byte is
 * neither a bare continuation byte nor an overlong 2-byte lead (0x80..0xC1),
 * the lead byte does not exceed 0xF4, and the second byte is narrowed
 * further for the lead bytes that would otherwise allow overlong forms
 * (0xE0, 0xF0), UTF-16 surrogates (0xED) or values above U+10FFFF (0xF4).
 * Any other length (including 0, negative, 5 and 6) yields false.
 *
 * Both bounds of the second byte are checked for every lead byte.  Checking
 * only the upper bound for 0xED and 0xF4 would let a second byte below 0x80
 * (e.g. ED 20 80) slip through as "legal".
 *
 * Note: this function does not itself verify that "length" is the length
 * implied by the lead byte; the caller is expected to pass a matching value.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char a;

	switch (length)
	{
		default:
			/* only 1..4 byte sequences can be legal */
			return false;

		case 4:
			a = source[3];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALL THRU */

		case 3:
			a = source[2];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALL THRU */

		case 2:
			a = source[1];
			/* the allowed range of the second byte depends on the lead byte */
			switch (*source)
			{
				case 0xE0:
					/* exclude overlong 3-byte forms */
					if (a < 0xA0 || a > 0xBF)
						return false;
					break;
				case 0xED:
					/* exclude UTF-16 surrogates U+D800..U+DFFF */
					if (a < 0x80 || a > 0x9F)
						return false;
					break;
				case 0xF0:
					/* exclude overlong 4-byte forms */
					if (a < 0x90 || a > 0xBF)
						return false;
					break;
				case 0xF4:
					/* exclude code points above U+10FFFF */
					if (a < 0x80 || a > 0x8F)
						return false;
					break;
				default:
					if (a < 0x80 || a > 0xBF)
						return false;
					break;
			}
			/* FALL THRU */

		case 1:
			a = *source;
			/* bare continuation bytes and overlong 2-byte leads */
			if (a >= 0x80 && a < 0xC2)
				return false;
			/* lead bytes that could only encode values above U+10FFFF */
			if (a > 0xF4)
				return false;
			break;
	}

	return true;
}
832+
833+
803834
/*
804835
* Verify mbstr to make sure that it has a valid character sequence.
805836
* mbstr is not necessarily NULL terminated; length of mbstr is
@@ -823,51 +854,47 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
823854

824855
while (len > 0 && *mbstr)
825856
{
826-
/* special UTF8 check */
827-
if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
828-
{
829-
if (noError)
830-
return false;
831-
ereport(ERROR,
832-
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
833-
errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
834-
}
835-
836857
l = pg_mblen(mbstr);
837-
838-
for (i = 1; i < l; i++)
839-
{
840-
/*
841-
* we expect that every multibyte char consists of bytes
842-
* having the 8th bit set
843-
*/
844-
if (i >= len || (mbstr[i] & 0x80) == 0)
858+
859+
/* special UTF-8 check */
860+
if (encoding == PG_UTF8) {
861+
if(!pg_utf8_islegal(mbstr,l)) {
862+
if (noError) return false;
863+
ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near byte %c",*mbstr)));
864+
}
865+
} else {
866+
for (i = 1; i < l; i++)
845867
{
846-
char buf[8 * 2 + 1];
847-
char *p = buf;
848-
int j,
868+
/*
869+
* we expect that every multibyte char consists of bytes
870+
* having the 8th bit set
871+
*/
872+
if (i >= len || (mbstr[i] & 0x80) == 0)
873+
{
874+
char buf[8 * 2 + 1];
875+
char *p = buf;
876+
int j,
849877
jlimit;
850878

851-
if (noError)
852-
return false;
879+
if (noError)
880+
return false;
853881

854-
jlimit = Min(l, len);
855-
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
882+
jlimit = Min(l, len);
883+
jlimit = Min(jlimit, 8); /* prevent buffer overrun */
856884

857-
for (j = 0; j < jlimit; j++)
858-
p += sprintf(p, "%02x", mbstr[j]);
885+
for (j = 0; j < jlimit; j++)
886+
p += sprintf(p, "%02x", mbstr[j]);
859887

860-
ereport(ERROR,
861-
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
862-
errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
863-
GetDatabaseEncodingName(), buf)));
888+
ereport(ERROR,
889+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
890+
errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
891+
GetDatabaseEncodingName(), buf)));
892+
}
864893
}
865894
}
866-
867895
len -= l;
868896
mbstr += l;
869897
}
870-
871898
return true;
872899
}
873900

src/include/mb/pg_wchar.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.58 2005/03/14 18:31:24 momjian Exp $ */
1+
/* $PostgreSQL: pgsql/src/include/mb/pg_wchar.h,v 1.59 2005/06/15 00:15:08 momjian Exp $ */
22

33
#ifndef PG_WCHAR_H
44
#define PG_WCHAR_H
@@ -340,4 +340,6 @@ extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc);
340340
extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
341341
extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
342342

343+
extern bool pg_utf8_islegal(const unsigned char *source, int length);
344+
343345
#endif /* PG_WCHAR_H */

0 commit comments

Comments
 (0)