Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit c01999a

Browse files
committed
Allow psql multi-line column values to align in the proper columns
If the second output column value is 'a\nb', the 'b' should appear in the second display column, rather than the first column as it does now. Change libpq's PQdsplen() to return more useful values. Note: this changes the PQdsplen function; it can now return zero or minus one, which was not possible before. It doesn't appear anyone is actually using the functions other than psql, but it is a change. The functions are not actually documented anywhere, so it's not like we're breaking a defined interface. The new semantics follow the Unicode standard. BACKWARD COMPATIBLE CHANGE. The only user-visible change I saw in the regression tests is that a SELECT * on a table where all the columns have been dropped doesn't return a blank line like before. This seems like a step forward. Martijn van Oosterhout
1 parent 593763c commit c01999a

File tree

6 files changed

+712
-404
lines changed

6 files changed

+712
-404
lines changed

src/backend/utils/mb/wchar.c

+197-16
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* conversion functions between pg_wchar and multibyte streams.
33
* Tatsuo Ishii
4-
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.52 2005/12/26 19:30:44 momjian Exp $
4+
* $PostgreSQL: pgsql/src/backend/utils/mb/wchar.c,v 1.53 2006/02/10 00:39:04 momjian Exp $
55
*
66
* WIN1250 client encoding updated by Pavel Behal
77
*
@@ -23,6 +23,13 @@
2323
* for the particular encoding. Note that if the encoding is only
2424
* supported in the client, you don't need to define
2525
* mb2wchar_with_len() function (SJIS is the case).
26+
*
27+
* Note: for the display output of psql to work properly, the return values
28+
* of these functions must conform to the Unicode standard. In particular
29+
* the NUL character is zero width and control characters are generally
30+
* width -1. It is recommended that non-ASCII encodings refer their ASCII
31+
* subset to the ASCII routines to ensure consistency.
32+
*
2633
*/
2734

2835
/*
@@ -53,6 +60,11 @@ pg_ascii_mblen(const unsigned char *s)
5360
/*
 * Display width of the single byte at s, using the ASCII rules:
 * 0 for NUL, -1 for the other bytes below 0x20 and for DEL (0x7f),
 * and 1 for every remaining byte value.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
	const unsigned char ch = *s;

	if (ch == '\0')
		return 0;

	return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
5870

@@ -125,7 +137,7 @@ pg_euc_dsplen(const unsigned char *s)
125137
else if (IS_HIGHBIT_SET(*s))
126138
len = 2;
127139
else
128-
len = 1;
140+
len = pg_ascii_dsplen(s);
129141
return len;
130142
}
131143

@@ -156,7 +168,7 @@ pg_eucjp_dsplen(const unsigned char *s)
156168
else if (IS_HIGHBIT_SET(*s))
157169
len = 2;
158170
else
159-
len = 1;
171+
len = pg_ascii_dsplen(s);
160172
return len;
161173
}
162174

@@ -244,7 +256,7 @@ pg_euccn_dsplen(const unsigned char *s)
244256
if (IS_HIGHBIT_SET(*s))
245257
len = 2;
246258
else
247-
len = 1;
259+
len = pg_ascii_dsplen(s);
248260
return len;
249261
}
250262

@@ -304,7 +316,7 @@ pg_euctw_mblen(const unsigned char *s)
304316
else if (IS_HIGHBIT_SET(*s))
305317
len = 2;
306318
else
307-
len = 1;
319+
len = pg_ascii_dsplen(s);
308320
return len;
309321
}
310322

@@ -320,7 +332,7 @@ pg_euctw_dsplen(const unsigned char *s)
320332
else if (IS_HIGHBIT_SET(*s))
321333
len = 2;
322334
else
323-
len = 1;
335+
len = pg_ascii_dsplen(s);
324336
return len;
325337
}
326338

@@ -419,10 +431,179 @@ pg_utf_mblen(const unsigned char *s)
419431
return len;
420432
}
421433

434+
/*
435+
* This is an implementation of wcwidth() and wcswidth() as defined in
436+
* "The Single UNIX Specification, Version 2, The Open Group, 1997"
437+
* <http://www.UNIX-systems.org/online.html>
438+
*
439+
* Markus Kuhn -- 2001-09-08 -- public domain
440+
*
441+
* customised for PostgreSQL
442+
*
443+
* original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
444+
*/
445+
446+
/*
 * One entry of an interval table: the inclusive range of code points
 * first..last.  Both bounds are stored as unsigned short.
 */
struct mbinterval
{
	unsigned short first;		/* lowest code point of the range */
	unsigned short last;		/* highest code point of the range */
};
451+
452+
/* auxiliary function for binary search in interval table */
453+
static int
454+
mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
455+
{
456+
int min = 0;
457+
int mid;
458+
459+
if (ucs < table[0].first || ucs > table[max].last)
460+
return 0;
461+
while (max >= min)
462+
{
463+
mid = (min + max) / 2;
464+
if (ucs > table[mid].last)
465+
min = mid + 1;
466+
else if (ucs < table[mid].first)
467+
max = mid - 1;
468+
else
469+
return 1;
470+
}
471+
472+
return 0;
473+
}
474+
475+
476+
/* The following functions define the column width of an ISO 10646
477+
* character as follows:
478+
*
479+
* - The null character (U+0000) has a column width of 0.
480+
*
481+
* - Other C0/C1 control characters and DEL will lead to a return
482+
* value of -1.
483+
*
484+
* - Non-spacing and enclosing combining characters (general
485+
* category code Mn or Me in the Unicode database) have a
486+
* column width of 0.
487+
*
488+
* - Other format characters (general category code Cf in the Unicode
489+
* database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
490+
*
491+
* - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
492+
* have a column width of 0.
493+
*
494+
* - Spacing characters in the East Asian Wide (W) or East Asian
495+
* FullWidth (F) category as defined in Unicode Technical
496+
* Report #11 have a column width of 2.
497+
*
498+
* - All remaining characters (including all printable
499+
* ISO 8859-1 and WGL4 characters, Unicode control characters,
500+
* etc.) have a column width of 1.
501+
*
502+
* This implementation assumes that wchar_t characters are encoded
503+
* in ISO 10646.
504+
*/
505+
506+
static int
507+
ucs_wcwidth(pg_wchar ucs)
508+
{
509+
/* sorted list of non-overlapping intervals of non-spacing characters */
510+
static const struct mbinterval combining[] = {
511+
{0x0300, 0x034E}, {0x0360, 0x0362}, {0x0483, 0x0486},
512+
{0x0488, 0x0489}, {0x0591, 0x05A1}, {0x05A3, 0x05B9},
513+
{0x05BB, 0x05BD}, {0x05BF, 0x05BF}, {0x05C1, 0x05C2},
514+
{0x05C4, 0x05C4}, {0x064B, 0x0655}, {0x0670, 0x0670},
515+
{0x06D6, 0x06E4}, {0x06E7, 0x06E8}, {0x06EA, 0x06ED},
516+
{0x070F, 0x070F}, {0x0711, 0x0711}, {0x0730, 0x074A},
517+
{0x07A6, 0x07B0}, {0x0901, 0x0902}, {0x093C, 0x093C},
518+
{0x0941, 0x0948}, {0x094D, 0x094D}, {0x0951, 0x0954},
519+
{0x0962, 0x0963}, {0x0981, 0x0981}, {0x09BC, 0x09BC},
520+
{0x09C1, 0x09C4}, {0x09CD, 0x09CD}, {0x09E2, 0x09E3},
521+
{0x0A02, 0x0A02}, {0x0A3C, 0x0A3C}, {0x0A41, 0x0A42},
522+
{0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, {0x0A70, 0x0A71},
523+
{0x0A81, 0x0A82}, {0x0ABC, 0x0ABC}, {0x0AC1, 0x0AC5},
524+
{0x0AC7, 0x0AC8}, {0x0ACD, 0x0ACD}, {0x0B01, 0x0B01},
525+
{0x0B3C, 0x0B3C}, {0x0B3F, 0x0B3F}, {0x0B41, 0x0B43},
526+
{0x0B4D, 0x0B4D}, {0x0B56, 0x0B56}, {0x0B82, 0x0B82},
527+
{0x0BC0, 0x0BC0}, {0x0BCD, 0x0BCD}, {0x0C3E, 0x0C40},
528+
{0x0C46, 0x0C48}, {0x0C4A, 0x0C4D}, {0x0C55, 0x0C56},
529+
{0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
530+
{0x0D41, 0x0D43}, {0x0D4D, 0x0D4D}, {0x0DCA, 0x0DCA},
531+
{0x0DD2, 0x0DD4}, {0x0DD6, 0x0DD6}, {0x0E31, 0x0E31},
532+
{0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, {0x0EB1, 0x0EB1},
533+
{0x0EB4, 0x0EB9}, {0x0EBB, 0x0EBC}, {0x0EC8, 0x0ECD},
534+
{0x0F18, 0x0F19}, {0x0F35, 0x0F35}, {0x0F37, 0x0F37},
535+
{0x0F39, 0x0F39}, {0x0F71, 0x0F7E}, {0x0F80, 0x0F84},
536+
{0x0F86, 0x0F87}, {0x0F90, 0x0F97}, {0x0F99, 0x0FBC},
537+
{0x0FC6, 0x0FC6}, {0x102D, 0x1030}, {0x1032, 0x1032},
538+
{0x1036, 0x1037}, {0x1039, 0x1039}, {0x1058, 0x1059},
539+
{0x1160, 0x11FF}, {0x17B7, 0x17BD}, {0x17C6, 0x17C6},
540+
{0x17C9, 0x17D3}, {0x180B, 0x180E}, {0x18A9, 0x18A9},
541+
{0x200B, 0x200F}, {0x202A, 0x202E}, {0x206A, 0x206F},
542+
{0x20D0, 0x20E3}, {0x302A, 0x302F}, {0x3099, 0x309A},
543+
{0xFB1E, 0xFB1E}, {0xFE20, 0xFE23}, {0xFEFF, 0xFEFF},
544+
{0xFFF9, 0xFFFB}
545+
};
546+
547+
/* test for 8-bit control characters */
548+
if (ucs == 0)
549+
return 0;
550+
551+
if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
552+
return -1;
553+
554+
/* binary search in table of non-spacing characters */
555+
if (mbbisearch(ucs, combining,
556+
sizeof(combining) / sizeof(struct mbinterval) - 1))
557+
return 0;
558+
559+
/*
560+
* if we arrive here, ucs is not a combining or C0/C1 control character
561+
*/
562+
563+
return 1 +
564+
(ucs >= 0x1100 &&
565+
(ucs <= 0x115f || /* Hangul Jamo init. consonants */
566+
(ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
567+
ucs != 0x303f) || /* CJK ... Yi */
568+
(ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
569+
(ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility
570+
* Ideographs */
571+
(ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
572+
(ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
573+
(ucs >= 0xffe0 && ucs <= 0xffe6) ||
574+
(ucs >= 0x20000 && ucs <= 0x2ffff)));
575+
}
576+
577+
static pg_wchar
578+
utf2ucs(const unsigned char *c)
579+
{
580+
/*
581+
* one char version of pg_utf2wchar_with_len. no control here, c must
582+
* point to a large enough string
583+
*/
584+
if ((*c & 0x80) == 0)
585+
return (pg_wchar) c[0];
586+
else if ((*c & 0xe0) == 0xc0)
587+
return (pg_wchar) (((c[0] & 0x1f) << 6) |
588+
(c[1] & 0x3f));
589+
else if ((*c & 0xf0) == 0xe0)
590+
return (pg_wchar) (((c[0] & 0x0f) << 12) |
591+
((c[1] & 0x3f) << 6) |
592+
(c[2] & 0x3f));
593+
else if ((*c & 0xf0) == 0xf0)
594+
return (pg_wchar) (((c[0] & 0x07) << 18) |
595+
((c[1] & 0x3f) << 12) |
596+
((c[2] & 0x3f) << 6) |
597+
(c[3] & 0x3f));
598+
else
599+
/* that is an invalid code on purpose */
600+
return 0xffffffff;
601+
}
602+
422603
static int
423604
pg_utf_dsplen(const unsigned char *s)
424605
{
425-
return 1; /* XXX fix me! */
606+
return ucs_wcwidth(utf2ucs(s));
426607
}
427608

428609
/*
@@ -499,7 +680,7 @@ pg_mule_mblen(const unsigned char *s)
499680
/*
 * Display width for MULE internal code.
 *
 * XXX fix me!  For now this simply returns whatever pg_ascii_dsplen()
 * reports for s.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
504685

505686
/*
@@ -529,7 +710,7 @@ pg_latin1_mblen(const unsigned char *s)
529710
/*
 * Display width for LATIN1: returns whatever pg_ascii_dsplen() reports
 * for s.
 */
static int
pg_latin1_dsplen(const unsigned char *s)
{
	int			width = pg_ascii_dsplen(s);

	return width;
}
534715

535716
/*
@@ -559,7 +740,7 @@ pg_sjis_dsplen(const unsigned char *s)
559740
else if (IS_HIGHBIT_SET(*s))
560741
len = 2; /* kanji? */
561742
else
562-
len = 1; /* should be ASCII */
743+
len = pg_ascii_dsplen(s); /* should be ASCII */
563744
return len;
564745
}
565746

@@ -586,7 +767,7 @@ pg_big5_dsplen(const unsigned char *s)
586767
if (IS_HIGHBIT_SET(*s))
587768
len = 2; /* kanji? */
588769
else
589-
len = 1; /* should be ASCII */
770+
len = pg_ascii_dsplen(s); /* should be ASCII */
590771
return len;
591772
}
592773

@@ -613,7 +794,7 @@ pg_gbk_dsplen(const unsigned char *s)
613794
if (IS_HIGHBIT_SET(*s))
614795
len = 2; /* kanji? */
615796
else
616-
len = 1; /* should be ASCII */
797+
len = pg_ascii_dsplen(s); /* should be ASCII */
617798
return len;
618799
}
619800

@@ -640,7 +821,7 @@ pg_uhc_dsplen(const unsigned char *s)
640821
if (IS_HIGHBIT_SET(*s))
641822
len = 2; /* 2byte? */
642823
else
643-
len = 1; /* should be ASCII */
824+
len = pg_ascii_dsplen(s); /* should be ASCII */
644825
return len;
645826
}
646827

@@ -672,10 +853,10 @@ pg_gb18030_dsplen(const unsigned char *s)
672853
{
673854
int len;
674855

675-
if (!IS_HIGHBIT_SET(*s))
676-
len = 1; /* ASCII */
677-
else
856+
if (IS_HIGHBIT_SET(*s))
678857
len = 2;
858+
else
859+
len = pg_ascii_dsplen(s); /* ASCII */
679860
return len;
680861
}
681862

0 commit comments

Comments
 (0)