1
1
/*
2
2
* conversion functions between pg_wchar and multibyte streams.
3
3
* Tatsuo Ishii
4
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.43 2005/03/14 18:31:20 momjian Exp $
4
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.44 2005/06/15 00:15:08 momjian Exp $
5
5
*
6
6
* WIN1250 client encoding updated by Pavel Behal
7
7
*
@@ -406,8 +406,14 @@ pg_utf_mblen(const unsigned char *s)
406
406
len = 1 ;
407
407
else if ((* s & 0xe0 ) == 0xc0 )
408
408
len = 2 ;
409
- else if ((* s & 0xe0 ) == 0xe0 )
410
- len = 3 ;
409
+ else if ((* s & 0xf0 ) == 0xe0 )
410
+ len = 3 ;
411
+ else if ((* s & 0xf8 ) == 0xf0 )
412
+ len = 4 ;
413
+ else if ((* s & 0xfc ) == 0xf8 )
414
+ len = 5 ;
415
+ else if ((* s & 0xfe ) == 0xfc )
416
+ len = 6 ;
411
417
return (len );
412
418
}
413
419
@@ -721,7 +727,7 @@ pg_wchar_tbl pg_wchar_table[] = {
721
727
{pg_euckr2wchar_with_len , pg_euckr_mblen , pg_euckr_dsplen , 3 }, /* 3; PG_EUC_KR */
722
728
{pg_euctw2wchar_with_len , pg_euctw_mblen , pg_euctw_dsplen , 3 }, /* 4; PG_EUC_TW */
723
729
{pg_johab2wchar_with_len , pg_johab_mblen , pg_johab_dsplen , 3 }, /* 5; PG_JOHAB */
724
- {pg_utf2wchar_with_len , pg_utf_mblen , pg_utf_dsplen , 3 }, /* 6; PG_UTF8 */
730
+ {pg_utf2wchar_with_len , pg_utf_mblen , pg_utf_dsplen , 4 }, /* 6; PG_UTF8 */
725
731
{pg_mule2wchar_with_len , pg_mule_mblen , pg_mule_dsplen , 3 }, /* 7; PG_MULE_INTERNAL */
726
732
{pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , 1 }, /* 8; PG_LATIN1 */
727
733
{pg_latin12wchar_with_len , pg_latin1_mblen , pg_latin1_dsplen , 1 }, /* 9; PG_LATIN2 */
@@ -800,6 +806,31 @@ pg_encoding_max_length(int encoding)
800
806
801
807
#ifndef FRONTEND
802
808
809
/*
 * pg_utf8_islegal
 *
 * Check whether the "length" bytes starting at "source" form one
 * well-formed UTF-8 encoded character.
 *
 * source: pointer to the first (lead) byte of the candidate sequence;
 *         exactly "length" bytes are examined.
 * length: number of bytes in the candidate sequence.  Only 1..4 can be
 *         legal; any other value is rejected without reading "source".
 *
 * Returns true if the sequence is acceptable, false otherwise.
 *
 * Rejected are: trail bytes outside 0x80..0xBF, a lead byte in 0x80..0xC1
 * or above 0xF4, and second bytes outside the narrower range required
 * after the lead bytes 0xE0, 0xED, 0xF0 and 0xF4.
 *
 * Note: this function does not verify that "length" matches the length
 * implied by the lead byte; the caller is expected to pass a length that
 * was derived from the lead byte.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char a;

	switch (length)
	{
		default:
			/* lengths below 1 or above 4 are never legal */
			return false;

		case 4:
			a = source[3];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALL THRU */

		case 3:
			a = source[2];
			if (a < 0x80 || a > 0xBF)
				return false;
			/* FALL THRU */

		case 2:
			a = source[1];

			/*
			 * The allowed range of the second byte depends on the lead
			 * byte.  Every case checks BOTH bounds explicitly, so that a
			 * second byte below 0x80 can never slip through for the lead
			 * bytes 0xED and 0xF4.
			 */
			switch (*source)
			{
				case 0xE0:
					if (a < 0xA0 || a > 0xBF)
						return false;
					break;
				case 0xED:
					if (a < 0x80 || a > 0x9F)
						return false;
					break;
				case 0xF0:
					if (a < 0x90 || a > 0xBF)
						return false;
					break;
				case 0xF4:
					if (a < 0x80 || a > 0x8F)
						return false;
					break;
				default:
					if (a < 0x80 || a > 0xBF)
						return false;
					break;
			}
			/* FALL THRU */

		case 1:
			a = *source;
			/* 0x80..0xC1 cannot start a legal sequence */
			if (a >= 0x80 && a < 0xC2)
				return false;
			/* lead bytes above 0xF4 are not accepted */
			if (a > 0xF4)
				return false;
			break;
	}

	return true;
}
832
+
833
+
803
834
/*
804
835
* Verify mbstr to make sure that it has a valid character sequence.
805
836
* mbstr is not necessarily NULL terminated; length of mbstr is
@@ -823,51 +854,47 @@ pg_verifymbstr(const unsigned char *mbstr, int len, bool noError)
823
854
824
855
while (len > 0 && * mbstr )
825
856
{
826
- /* special UTF8 check */
827
- if (encoding == PG_UTF8 && (* mbstr & 0xf8 ) == 0xf0 )
828
- {
829
- if (noError )
830
- return false;
831
- ereport (ERROR ,
832
- (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
833
- errmsg ("Unicode characters greater than or equal to 0x10000 are not supported" )));
834
- }
835
-
836
857
l = pg_mblen (mbstr );
837
-
838
- for (i = 1 ; i < l ; i ++ )
839
- {
840
- /*
841
- * we expect that every multibyte char consists of bytes
842
- * having the 8th bit set
843
- */
844
- if (i >= len || (mbstr [i ] & 0x80 ) == 0 )
858
+
859
+ /* special UTF-8 check */
860
+ if (encoding == PG_UTF8 ) {
861
+ if (!pg_utf8_islegal (mbstr ,l )) {
862
+ if (noError ) return false;
863
+ ereport (ERROR ,(errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),errmsg ("Invalid UNICODE byte sequence detected near byte %c" ,* mbstr )));
864
+ }
865
+ } else {
866
+ for (i = 1 ; i < l ; i ++ )
845
867
{
846
- char buf [8 * 2 + 1 ];
847
- char * p = buf ;
848
- int j ,
868
+ /*
869
+ * we expect that every multibyte char consists of bytes
870
+ * having the 8th bit set
871
+ */
872
+ if (i >= len || (mbstr [i ] & 0x80 ) == 0 )
873
+ {
874
+ char buf [8 * 2 + 1 ];
875
+ char * p = buf ;
876
+ int j ,
849
877
jlimit ;
850
878
851
- if (noError )
852
- return false;
879
+ if (noError )
880
+ return false;
853
881
854
- jlimit = Min (l , len );
855
- jlimit = Min (jlimit , 8 ); /* prevent buffer overrun */
882
+ jlimit = Min (l , len );
883
+ jlimit = Min (jlimit , 8 ); /* prevent buffer overrun */
856
884
857
- for (j = 0 ; j < jlimit ; j ++ )
858
- p += sprintf (p , "%02x" , mbstr [j ]);
885
+ for (j = 0 ; j < jlimit ; j ++ )
886
+ p += sprintf (p , "%02x" , mbstr [j ]);
859
887
860
- ereport (ERROR ,
861
- (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
862
- errmsg ("invalid byte sequence for encoding \"%s\": 0x%s" ,
863
- GetDatabaseEncodingName (), buf )));
888
+ ereport (ERROR ,
889
+ (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
890
+ errmsg ("invalid byte sequence for encoding \"%s\": 0x%s" ,
891
+ GetDatabaseEncodingName (), buf )));
892
+ }
864
893
}
865
894
}
866
-
867
895
len -= l ;
868
896
mbstr += l ;
869
897
}
870
-
871
898
return true;
872
899
}
873
900
0 commit comments