1
1
/*
2
2
* conversion functions between pg_wchar and multibyte streams.
3
3
* Tatsuo Ishii
4
- * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.52 2005/12/26 19:30:44 momjian Exp $
4
+ * $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.53 2006/02/10 00:39:04 momjian Exp $
5
5
*
6
6
* WIN1250 client encoding updated by Pavel Behal
7
7
*
23
23
* for the particular encoding. Note that if the encoding is only
24
24
* supported in the client, you don't need to define
25
25
* mb2wchar_with_len() function (SJIS is the case).
26
+ *
27
+ * Note: for the display output of psql to work properly, the return values
28
+ * of these functions must conform to the Unicode standard. In particular
29
+ * the NUL character is zero width and control characters are generally
30
+ * width -1. It is recommended that non-ASCII encodings refer their ASCII
31
+ * subset to the ASCII routines to ensure consistency.
32
+ *
26
33
*/
27
34
28
35
/*
@@ -53,6 +60,11 @@ pg_ascii_mblen(const unsigned char *s)
53
60
/*
 * Display width of the single ASCII byte at *s.
 *
 * Returns 0 for NUL, -1 for the control bytes below 0x20 and for DEL (0x7f),
 * and 1 for every other byte value.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;
	int			width = 1;

	if (ch == '\0')
		width = 0;				/* NUL occupies no columns */
	else if (ch < 0x20 || ch == 0x7f)
		width = -1;				/* control character */

	return width;
}
58
70
@@ -125,7 +137,7 @@ pg_euc_dsplen(const unsigned char *s)
125
137
else if (IS_HIGHBIT_SET (* s ))
126
138
len = 2 ;
127
139
else
128
- len = 1 ;
140
+ len = pg_ascii_dsplen ( s ) ;
129
141
return len ;
130
142
}
131
143
@@ -156,7 +168,7 @@ pg_eucjp_dsplen(const unsigned char *s)
156
168
else if (IS_HIGHBIT_SET (* s ))
157
169
len = 2 ;
158
170
else
159
- len = 1 ;
171
+ len = pg_ascii_dsplen ( s ) ;
160
172
return len ;
161
173
}
162
174
@@ -244,7 +256,7 @@ pg_euccn_dsplen(const unsigned char *s)
244
256
if (IS_HIGHBIT_SET (* s ))
245
257
len = 2 ;
246
258
else
247
- len = 1 ;
259
+ len = pg_ascii_dsplen ( s ) ;
248
260
return len ;
249
261
}
250
262
@@ -304,7 +316,7 @@ pg_euctw_mblen(const unsigned char *s)
304
316
else if (IS_HIGHBIT_SET (* s ))
305
317
len = 2 ;
306
318
else
307
- len = 1 ;
319
+ len = pg_ascii_dsplen ( s ) ;
308
320
return len ;
309
321
}
310
322
@@ -320,7 +332,7 @@ pg_euctw_dsplen(const unsigned char *s)
320
332
else if (IS_HIGHBIT_SET (* s ))
321
333
len = 2 ;
322
334
else
323
- len = 1 ;
335
+ len = pg_ascii_dsplen ( s ) ;
324
336
return len ;
325
337
}
326
338
@@ -419,10 +431,179 @@ pg_utf_mblen(const unsigned char *s)
419
431
return len ;
420
432
}
421
433
434
+ /*
435
+ * This is an implementation of wcwidth() and wcswidth() as defined in
436
+ * "The Single UNIX Specification, Version 2, The Open Group, 1997"
437
+ * <http://www.UNIX-systems.org/online.html>
438
+ *
439
+ * Markus Kuhn -- 2001-09-08 -- public domain
440
+ *
441
+ * customised for PostgreSQL
442
+ *
443
+ * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
444
+ */
445
+
446
/*
 * One closed range of code points, [first, last], as stored in the sorted
 * interval tables searched by mbbisearch().
 */
struct mbinterval
{
	unsigned short first;		/* lowest code point in the range */
	unsigned short last;		/* highest code point in the range, inclusive */
};
451
+
452
+ /* auxiliary function for binary search in interval table */
453
+ static int
454
+ mbbisearch (pg_wchar ucs , const struct mbinterval * table , int max )
455
+ {
456
+ int min = 0 ;
457
+ int mid ;
458
+
459
+ if (ucs < table [0 ].first || ucs > table [max ].last )
460
+ return 0 ;
461
+ while (max >= min )
462
+ {
463
+ mid = (min + max ) / 2 ;
464
+ if (ucs > table [mid ].last )
465
+ min = mid + 1 ;
466
+ else if (ucs < table [mid ].first )
467
+ max = mid - 1 ;
468
+ else
469
+ return 1 ;
470
+ }
471
+
472
+ return 0 ;
473
+ }
474
+
475
+
476
+ /* The following functions define the column width of an ISO 10646
477
+ * character as follows:
478
+ *
479
+ * - The null character (U+0000) has a column width of 0.
480
+ *
481
+ * - Other C0/C1 control characters and DEL will lead to a return
482
+ * value of -1.
483
+ *
484
+ * - Non-spacing and enclosing combining characters (general
485
+ * category code Mn or Me in the Unicode database) have a
486
+ * column width of 0.
487
+ *
488
+ * - Other format characters (general category code Cf in the Unicode
489
+ * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
490
+ *
491
+ * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
492
+ * have a column width of 0.
493
+ *
494
+ * - Spacing characters in the East Asian Wide (W) or East Asian
495
+ * FullWidth (F) category as defined in Unicode Technical
496
+ * Report #11 have a column width of 2.
497
+ *
498
+ * - All remaining characters (including all printable
499
+ * ISO 8859-1 and WGL4 characters, Unicode control characters,
500
+ * etc.) have a column width of 1.
501
+ *
502
+ * This implementation assumes that wchar_t characters are encoded
503
+ * in ISO 10646.
504
+ */
505
+
506
+ static int
507
+ ucs_wcwidth (pg_wchar ucs )
508
+ {
509
+ /* sorted list of non-overlapping intervals of non-spacing characters */
510
+ static const struct mbinterval combining [] = {
511
+ {0x0300 , 0x034E }, {0x0360 , 0x0362 }, {0x0483 , 0x0486 },
512
+ {0x0488 , 0x0489 }, {0x0591 , 0x05A1 }, {0x05A3 , 0x05B9 },
513
+ {0x05BB , 0x05BD }, {0x05BF , 0x05BF }, {0x05C1 , 0x05C2 },
514
+ {0x05C4 , 0x05C4 }, {0x064B , 0x0655 }, {0x0670 , 0x0670 },
515
+ {0x06D6 , 0x06E4 }, {0x06E7 , 0x06E8 }, {0x06EA , 0x06ED },
516
+ {0x070F , 0x070F }, {0x0711 , 0x0711 }, {0x0730 , 0x074A },
517
+ {0x07A6 , 0x07B0 }, {0x0901 , 0x0902 }, {0x093C , 0x093C },
518
+ {0x0941 , 0x0948 }, {0x094D , 0x094D }, {0x0951 , 0x0954 },
519
+ {0x0962 , 0x0963 }, {0x0981 , 0x0981 }, {0x09BC , 0x09BC },
520
+ {0x09C1 , 0x09C4 }, {0x09CD , 0x09CD }, {0x09E2 , 0x09E3 },
521
+ {0x0A02 , 0x0A02 }, {0x0A3C , 0x0A3C }, {0x0A41 , 0x0A42 },
522
+ {0x0A47 , 0x0A48 }, {0x0A4B , 0x0A4D }, {0x0A70 , 0x0A71 },
523
+ {0x0A81 , 0x0A82 }, {0x0ABC , 0x0ABC }, {0x0AC1 , 0x0AC5 },
524
+ {0x0AC7 , 0x0AC8 }, {0x0ACD , 0x0ACD }, {0x0B01 , 0x0B01 },
525
+ {0x0B3C , 0x0B3C }, {0x0B3F , 0x0B3F }, {0x0B41 , 0x0B43 },
526
+ {0x0B4D , 0x0B4D }, {0x0B56 , 0x0B56 }, {0x0B82 , 0x0B82 },
527
+ {0x0BC0 , 0x0BC0 }, {0x0BCD , 0x0BCD }, {0x0C3E , 0x0C40 },
528
+ {0x0C46 , 0x0C48 }, {0x0C4A , 0x0C4D }, {0x0C55 , 0x0C56 },
529
+ {0x0CBF , 0x0CBF }, {0x0CC6 , 0x0CC6 }, {0x0CCC , 0x0CCD },
530
+ {0x0D41 , 0x0D43 }, {0x0D4D , 0x0D4D }, {0x0DCA , 0x0DCA },
531
+ {0x0DD2 , 0x0DD4 }, {0x0DD6 , 0x0DD6 }, {0x0E31 , 0x0E31 },
532
+ {0x0E34 , 0x0E3A }, {0x0E47 , 0x0E4E }, {0x0EB1 , 0x0EB1 },
533
+ {0x0EB4 , 0x0EB9 }, {0x0EBB , 0x0EBC }, {0x0EC8 , 0x0ECD },
534
+ {0x0F18 , 0x0F19 }, {0x0F35 , 0x0F35 }, {0x0F37 , 0x0F37 },
535
+ {0x0F39 , 0x0F39 }, {0x0F71 , 0x0F7E }, {0x0F80 , 0x0F84 },
536
+ {0x0F86 , 0x0F87 }, {0x0F90 , 0x0F97 }, {0x0F99 , 0x0FBC },
537
+ {0x0FC6 , 0x0FC6 }, {0x102D , 0x1030 }, {0x1032 , 0x1032 },
538
+ {0x1036 , 0x1037 }, {0x1039 , 0x1039 }, {0x1058 , 0x1059 },
539
+ {0x1160 , 0x11FF }, {0x17B7 , 0x17BD }, {0x17C6 , 0x17C6 },
540
+ {0x17C9 , 0x17D3 }, {0x180B , 0x180E }, {0x18A9 , 0x18A9 },
541
+ {0x200B , 0x200F }, {0x202A , 0x202E }, {0x206A , 0x206F },
542
+ {0x20D0 , 0x20E3 }, {0x302A , 0x302F }, {0x3099 , 0x309A },
543
+ {0xFB1E , 0xFB1E }, {0xFE20 , 0xFE23 }, {0xFEFF , 0xFEFF },
544
+ {0xFFF9 , 0xFFFB }
545
+ };
546
+
547
+ /* test for 8-bit control characters */
548
+ if (ucs == 0 )
549
+ return 0 ;
550
+
551
+ if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0 ) || ucs > 0x0010ffff )
552
+ return -1 ;
553
+
554
+ /* binary search in table of non-spacing characters */
555
+ if (mbbisearch (ucs , combining ,
556
+ sizeof (combining ) / sizeof (struct mbinterval ) - 1 ))
557
+ return 0 ;
558
+
559
+ /*
560
+ * if we arrive here, ucs is not a combining or C0/C1 control character
561
+ */
562
+
563
+ return 1 +
564
+ (ucs >= 0x1100 &&
565
+ (ucs <= 0x115f || /* Hangul Jamo init. consonants */
566
+ (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011 ) != 0x300a &&
567
+ ucs != 0x303f ) || /* CJK ... Yi */
568
+ (ucs >= 0xac00 && ucs <= 0xd7a3 ) || /* Hangul Syllables */
569
+ (ucs >= 0xf900 && ucs <= 0xfaff ) || /* CJK Compatibility
570
+ * Ideographs */
571
+ (ucs >= 0xfe30 && ucs <= 0xfe6f ) || /* CJK Compatibility Forms */
572
+ (ucs >= 0xff00 && ucs <= 0xff5f ) || /* Fullwidth Forms */
573
+ (ucs >= 0xffe0 && ucs <= 0xffe6 ) ||
574
+ (ucs >= 0x20000 && ucs <= 0x2ffff )));
575
+ }
576
+
577
+ static pg_wchar
578
+ utf2ucs (const unsigned char * c )
579
+ {
580
+ /*
581
+ * one char version of pg_utf2wchar_with_len. no control here, c must
582
+ * point to a large enough string
583
+ */
584
+ if ((* c & 0x80 ) == 0 )
585
+ return (pg_wchar ) c [0 ];
586
+ else if ((* c & 0xe0 ) == 0xc0 )
587
+ return (pg_wchar ) (((c [0 ] & 0x1f ) << 6 ) |
588
+ (c [1 ] & 0x3f ));
589
+ else if ((* c & 0xf0 ) == 0xe0 )
590
+ return (pg_wchar ) (((c [0 ] & 0x0f ) << 12 ) |
591
+ ((c [1 ] & 0x3f ) << 6 ) |
592
+ (c [2 ] & 0x3f ));
593
+ else if ((* c & 0xf0 ) == 0xf0 )
594
+ return (pg_wchar ) (((c [0 ] & 0x07 ) << 18 ) |
595
+ ((c [1 ] & 0x3f ) << 12 ) |
596
+ ((c [2 ] & 0x3f ) << 6 ) |
597
+ (c [3 ] & 0x3f ));
598
+ else
599
+ /* that is an invalid code on purpose */
600
+ return 0xffffffff ;
601
+ }
602
+
422
603
static int
423
604
pg_utf_dsplen (const unsigned char * s )
424
605
{
425
- return 1 ; /* XXX fix me! */
606
+ return ucs_wcwidth ( utf2ucs ( s ));
426
607
}
427
608
428
609
/*
@@ -499,7 +680,7 @@ pg_mule_mblen(const unsigned char *s)
499
680
/*
 * Display width for the MULE encoding.
 *
 * XXX fix me!  For now this just returns whatever pg_ascii_dsplen() reports
 * for the byte at *s.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
504
685
505
686
/*
@@ -529,7 +710,7 @@ pg_latin1_mblen(const unsigned char *s)
529
710
/*
 * Display width for the LATIN1 encoding: returns whatever
 * pg_ascii_dsplen() reports for the byte at *s.
 */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
534
715
535
716
/*
@@ -559,7 +740,7 @@ pg_sjis_dsplen(const unsigned char *s)
559
740
else if (IS_HIGHBIT_SET (* s ))
560
741
len = 2 ; /* kanji? */
561
742
else
562
- len = 1 ; /* should be ASCII */
743
+ len = pg_ascii_dsplen ( s ) ; /* should be ASCII */
563
744
return len ;
564
745
}
565
746
@@ -586,7 +767,7 @@ pg_big5_dsplen(const unsigned char *s)
586
767
if (IS_HIGHBIT_SET (* s ))
587
768
len = 2 ; /* kanji? */
588
769
else
589
- len = 1 ; /* should be ASCII */
770
+ len = pg_ascii_dsplen ( s ) ; /* should be ASCII */
590
771
return len ;
591
772
}
592
773
@@ -613,7 +794,7 @@ pg_gbk_dsplen(const unsigned char *s)
613
794
if (IS_HIGHBIT_SET (* s ))
614
795
len = 2 ; /* kanji? */
615
796
else
616
- len = 1 ; /* should be ASCII */
797
+ len = pg_ascii_dsplen ( s ) ; /* should be ASCII */
617
798
return len ;
618
799
}
619
800
@@ -640,7 +821,7 @@ pg_uhc_dsplen(const unsigned char *s)
640
821
if (IS_HIGHBIT_SET (* s ))
641
822
len = 2 ; /* 2byte? */
642
823
else
643
- len = 1 ; /* should be ASCII */
824
+ len = pg_ascii_dsplen ( s ) ; /* should be ASCII */
644
825
return len ;
645
826
}
646
827
@@ -672,10 +853,10 @@ pg_gb18030_dsplen(const unsigned char *s)
672
853
{
673
854
int len ;
674
855
675
- if (!IS_HIGHBIT_SET (* s ))
676
- len = 1 ; /* ASCII */
677
- else
856
+ if (IS_HIGHBIT_SET (* s ))
678
857
len = 2 ;
858
+ else
859
+ len = pg_ascii_dsplen (s ); /* ASCII */
679
860
return len ;
680
861
}
681
862
0 commit comments