Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 0d32342

Browse files
committed
Teach the regular expression functions to do case-insensitive matching and
locale-dependent character classification properly when the database encoding is UTF8. The previous coding worked okay in single-byte encodings, or in any case for ASCII characters, but failed entirely on multibyte characters. The fix assumes that the <wctype.h> functions use Unicode code points as the wchar representation for Unicode, i.e., wchar matches pg_wchar. This is only a partial solution, since we're still stupid about non-ASCII characters in multibyte encodings other than UTF8. The practical effect of that is limited, however, since those cases are generally Far Eastern glyphs for which concepts like case-folding don't apply anyway. Certainly all or nearly all of the field reports of problems have been about UTF8. A more general solution would require switching to the platform's wchar representation for all regex operations, which is possible but would have substantial disadvantages. Let's try this and see if it's sufficient in practice.
1 parent ef51395 commit 0d32342

File tree

2 files changed

+117
-14
lines changed

2 files changed

+117
-14
lines changed

src/backend/regex/regc_locale.c

Lines changed: 105 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
* permission to use and distribute the software in accordance with the
4848
* terms specified in this license.
4949
*
50-
* $PostgreSQL: pgsql/src/backend/regex/regc_locale.c,v 1.9 2008/02/14 17:33:37 tgl Exp $
50+
* $PostgreSQL: pgsql/src/backend/regex/regc_locale.c,v 1.10 2009/12/01 21:00:24 tgl Exp $
5151
*/
5252

5353
/* ASCII character-name table */
@@ -349,75 +349,167 @@ static const struct cname
349349
}
350350
};
351351

352+
352353
/*
353-
* some ctype functions with non-ascii-char guard
354+
* ctype functions adapted to work on pg_wchar (a/k/a chr)
355+
*
356+
* When working in UTF8 encoding, we use the <wctype.h> functions if
357+
* available. This assumes that every platform uses Unicode codepoints
358+
* directly as the wchar_t representation of Unicode. On some platforms
359+
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
360+
*
361+
* In all other encodings, we use the <ctype.h> functions for pg_wchar
362+
* values up to 255, and punt for values above that. This is only 100%
363+
* correct in single-byte encodings such as LATINn. However, non-Unicode
364+
* multibyte encodings are mostly Far Eastern character sets for which the
365+
* properties being tested here aren't relevant for higher code values anyway.
366+
*
367+
* NB: the coding here assumes pg_wchar is an unsigned type.
354368
*/
369+
355370
static int
356371
pg_wc_isdigit(pg_wchar c)
357372
{
358-
return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c));
373+
#ifdef USE_WIDE_UPPER_LOWER
374+
if (GetDatabaseEncoding() == PG_UTF8)
375+
{
376+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
377+
return iswdigit((wint_t) c);
378+
}
379+
#endif
380+
return (c <= (pg_wchar) UCHAR_MAX && isdigit((unsigned char) c));
359381
}
360382

361383
static int
362384
pg_wc_isalpha(pg_wchar c)
363385
{
364-
return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c));
386+
#ifdef USE_WIDE_UPPER_LOWER
387+
if (GetDatabaseEncoding() == PG_UTF8)
388+
{
389+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
390+
return iswalpha((wint_t) c);
391+
}
392+
#endif
393+
return (c <= (pg_wchar) UCHAR_MAX && isalpha((unsigned char) c));
365394
}
366395

367396
static int
368397
pg_wc_isalnum(pg_wchar c)
369398
{
370-
return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c));
399+
#ifdef USE_WIDE_UPPER_LOWER
400+
if (GetDatabaseEncoding() == PG_UTF8)
401+
{
402+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
403+
return iswalnum((wint_t) c);
404+
}
405+
#endif
406+
return (c <= (pg_wchar) UCHAR_MAX && isalnum((unsigned char) c));
371407
}
372408

373409
static int
374410
pg_wc_isupper(pg_wchar c)
375411
{
376-
return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c));
412+
#ifdef USE_WIDE_UPPER_LOWER
413+
if (GetDatabaseEncoding() == PG_UTF8)
414+
{
415+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
416+
return iswupper((wint_t) c);
417+
}
418+
#endif
419+
return (c <= (pg_wchar) UCHAR_MAX && isupper((unsigned char) c));
377420
}
378421

379422
static int
380423
pg_wc_islower(pg_wchar c)
381424
{
382-
return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c));
425+
#ifdef USE_WIDE_UPPER_LOWER
426+
if (GetDatabaseEncoding() == PG_UTF8)
427+
{
428+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
429+
return iswlower((wint_t) c);
430+
}
431+
#endif
432+
return (c <= (pg_wchar) UCHAR_MAX && islower((unsigned char) c));
383433
}
384434

385435
static int
386436
pg_wc_isgraph(pg_wchar c)
387437
{
388-
return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c));
438+
#ifdef USE_WIDE_UPPER_LOWER
439+
if (GetDatabaseEncoding() == PG_UTF8)
440+
{
441+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
442+
return iswgraph((wint_t) c);
443+
}
444+
#endif
445+
return (c <= (pg_wchar) UCHAR_MAX && isgraph((unsigned char) c));
389446
}
390447

391448
static int
392449
pg_wc_isprint(pg_wchar c)
393450
{
394-
return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c));
451+
#ifdef USE_WIDE_UPPER_LOWER
452+
if (GetDatabaseEncoding() == PG_UTF8)
453+
{
454+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
455+
return iswprint((wint_t) c);
456+
}
457+
#endif
458+
return (c <= (pg_wchar) UCHAR_MAX && isprint((unsigned char) c));
395459
}
396460

397461
static int
398462
pg_wc_ispunct(pg_wchar c)
399463
{
400-
return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c));
464+
#ifdef USE_WIDE_UPPER_LOWER
465+
if (GetDatabaseEncoding() == PG_UTF8)
466+
{
467+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
468+
return iswpunct((wint_t) c);
469+
}
470+
#endif
471+
return (c <= (pg_wchar) UCHAR_MAX && ispunct((unsigned char) c));
401472
}
402473

403474
static int
404475
pg_wc_isspace(pg_wchar c)
405476
{
406-
return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c));
477+
#ifdef USE_WIDE_UPPER_LOWER
478+
if (GetDatabaseEncoding() == PG_UTF8)
479+
{
480+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
481+
return iswspace((wint_t) c);
482+
}
483+
#endif
484+
return (c <= (pg_wchar) UCHAR_MAX && isspace((unsigned char) c));
407485
}
408486

409487
static pg_wchar
410488
pg_wc_toupper(pg_wchar c)
411489
{
412-
if (c >= 0 && c <= UCHAR_MAX)
490+
#ifdef USE_WIDE_UPPER_LOWER
491+
if (GetDatabaseEncoding() == PG_UTF8)
492+
{
493+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
494+
return towupper((wint_t) c);
495+
}
496+
#endif
497+
if (c <= (pg_wchar) UCHAR_MAX)
413498
return toupper((unsigned char) c);
414499
return c;
415500
}
416501

417502
static pg_wchar
418503
pg_wc_tolower(pg_wchar c)
419504
{
420-
if (c >= 0 && c <= UCHAR_MAX)
505+
#ifdef USE_WIDE_UPPER_LOWER
506+
if (GetDatabaseEncoding() == PG_UTF8)
507+
{
508+
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
509+
return towlower((wint_t) c);
510+
}
511+
#endif
512+
if (c <= (pg_wchar) UCHAR_MAX)
421513
return tolower((unsigned char) c);
422514
return c;
423515
}

src/include/regex/regcustom.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
2626
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727
*
28-
* $PostgreSQL: pgsql/src/include/regex/regcustom.h,v 1.7 2008/02/14 17:33:37 tgl Exp $
28+
* $PostgreSQL: pgsql/src/include/regex/regcustom.h,v 1.8 2009/12/01 21:00:24 tgl Exp $
2929
*/
3030

3131
/* headers if any */
@@ -34,6 +34,17 @@
3434
#include <ctype.h>
3535
#include <limits.h>
3636

37+
/*
38+
* towlower() and friends should be in <wctype.h>, but some pre-C99 systems
39+
* declare them in <wchar.h>.
40+
*/
41+
#ifdef HAVE_WCHAR_H
42+
#include <wchar.h>
43+
#endif
44+
#ifdef HAVE_WCTYPE_H
45+
#include <wctype.h>
46+
#endif
47+
3748
#include "mb/pg_wchar.h"
3849

3950

0 commit comments

Comments
 (0)