Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 46e5441

Browse files
committed
Add unicode_strtitle() for Unicode Default Case Conversion.
This brings the titlecasing implementation for the builtin provider out of formatting.c and into unicode_case.c, alongside unicode_strlower() and unicode_strupper(). The new function accepts an arbitrary word boundary callback. It is simple for now, but can be extended to support the Unicode Default Case Conversion algorithm with full case mapping.

Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com
Reviewed-by: Peter Eisentraut
1 parent a96a8b1 commit 46e5441

File tree

3 files changed

+140
-48
lines changed

3 files changed

+140
-48
lines changed

src/backend/utils/adt/formatting.c

+67-40
Original file line numberDiff line numberDiff line change
@@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
19221922
return result;
19231923
}
19241924

1925+
struct WordBoundaryState
1926+
{
1927+
const char *str;
1928+
size_t len;
1929+
size_t offset;
1930+
bool init;
1931+
bool prev_alnum;
1932+
};
1933+
1934+
/*
1935+
* Simple word boundary iterator that draws boundaries each time the result of
1936+
* pg_u_isalnum() changes.
1937+
*/
1938+
static size_t
1939+
initcap_wbnext(void *state)
1940+
{
1941+
struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
1942+
1943+
while (wbstate->offset < wbstate->len &&
1944+
wbstate->str[wbstate->offset] != '\0')
1945+
{
1946+
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
1947+
wbstate->offset);
1948+
bool curr_alnum = pg_u_isalnum(u, true);
1949+
1950+
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
1951+
{
1952+
size_t prev_offset = wbstate->offset;
1953+
1954+
wbstate->init = true;
1955+
wbstate->offset += unicode_utf8len(u);
1956+
wbstate->prev_alnum = curr_alnum;
1957+
return prev_offset;
1958+
}
1959+
1960+
wbstate->offset += unicode_utf8len(u);
1961+
}
1962+
1963+
return wbstate->len;
1964+
}
1965+
19251966
/*
19261967
* collation-aware, wide-character-aware initcap function
19271968
*
@@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
19802021
#endif
19812022
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
19822023
{
1983-
const unsigned char *src = (unsigned char *) buff;
2024+
const char *src = buff;
19842025
size_t srclen = nbytes;
1985-
unsigned char *dst;
19862026
size_t dstsize;
1987-
int srcoff = 0;
1988-
int dstoff = 0;
2027+
char *dst;
2028+
size_t needed;
2029+
struct WordBoundaryState wbstate = {
2030+
.str = src,
2031+
.len = srclen,
2032+
.offset = 0,
2033+
.init = false,
2034+
.prev_alnum = false,
2035+
};
19892036

19902037
Assert(GetDatabaseEncoding() == PG_UTF8);
19912038

1992-
/* overflow paranoia */
1993-
if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
1994-
ereport(ERROR,
1995-
(errcode(ERRCODE_OUT_OF_MEMORY),
1996-
errmsg("out of memory")));
1997-
1998-
/* result is at most srclen codepoints plus terminating NUL */
1999-
dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
2000-
dst = (unsigned char *) palloc(dstsize);
2039+
/* first try buffer of equal size plus terminating NUL */
2040+
dstsize = srclen + 1;
2041+
dst = palloc(dstsize);
20012042

2002-
while (srcoff < nbytes)
2043+
needed = unicode_strtitle(dst, dstsize, src, srclen,
2044+
initcap_wbnext, &wbstate);
2045+
if (needed + 1 > dstsize)
20032046
{
2004-
pg_wchar u1 = utf8_to_unicode(src + srcoff);
2005-
pg_wchar u2;
2006-
int u1len = unicode_utf8len(u1);
2007-
int u2len;
2008-
2009-
if (wasalnum)
2010-
u2 = unicode_lowercase_simple(u1);
2011-
else
2012-
u2 = unicode_uppercase_simple(u1);
2047+
/* reset iterator */
2048+
wbstate.offset = 0;
2049+
wbstate.init = false;
20132050

2014-
u2len = unicode_utf8len(u2);
2015-
2016-
Assert(dstoff + u2len + 1 <= dstsize);
2017-
2018-
wasalnum = pg_u_isalnum(u2, true);
2019-
2020-
unicode_to_utf8(u2, dst + dstoff);
2021-
srcoff += u1len;
2022-
dstoff += u2len;
2051+
/* grow buffer if needed and retry */
2052+
dstsize = needed + 1;
2053+
dst = repalloc(dst, dstsize);
2054+
needed = unicode_strtitle(dst, dstsize, src, srclen,
2055+
initcap_wbnext, &wbstate);
2056+
Assert(needed + 1 == dstsize);
20232057
}
20242058

2025-
Assert(dstoff + 1 <= dstsize);
2026-
*(dst + dstoff) = '\0';
2027-
dstoff++;
2028-
2029-
/* allocate result buffer of the right size and free workspace */
2030-
result = palloc(dstoff);
2031-
memcpy(result, dst, dstoff);
2032-
pfree(dst);
2059+
result = dst;
20332060
}
20342061
else
20352062
{

src/common/unicode_case.c

+68-8
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@
2121
#include "mb/pg_wchar.h"
2222

2323
static const pg_case_map *find_case_map(pg_wchar ucs);
24-
static size_t convert_case(char *dst, size_t dstsize, const char *src,
25-
ssize_t srclen, CaseKind casekind);
24+
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
25+
CaseKind str_casekind, WordBoundaryNext wbnext,
26+
void *wbstate);
2627

2728
pg_wchar
2829
unicode_lowercase_simple(pg_wchar code)
@@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
6768
size_t
6869
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
6970
{
70-
return convert_case(dst, dstsize, src, srclen, CaseLower);
71+
return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
72+
}
73+
74+
/*
75+
* unicode_strtitle()
76+
*
77+
* Convert src to titlecase, and return the result length (not including
78+
* terminating NUL).
79+
*
80+
* String src must be encoded in UTF-8. If srclen < 0, src must be
81+
* NUL-terminated.
82+
*
83+
* Result string is stored in dst, truncating if larger than dstsize. If
84+
* dstsize is greater than the result length, dst will be NUL-terminated;
85+
* otherwise not.
86+
*
87+
* If dstsize is zero, dst may be NULL. This is useful for calculating the
88+
* required buffer size before allocating.
89+
*
90+
* Titlecasing requires knowledge about word boundaries, which is provided by
91+
* the callback wbnext. A word boundary is the offset of the start of a word
92+
* or the offset of the character immediately following a word.
93+
*
94+
* The caller is expected to initialize and free the callback state
95+
* wbstate. The callback should first return offset 0 for the first boundary;
96+
* then the offset of each subsequent word boundary; then the total length of
97+
* the string to indicate the final boundary.
98+
*/
99+
size_t
100+
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
101+
WordBoundaryNext wbnext, void *wbstate)
102+
{
103+
return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
104+
wbstate);
71105
}
72106

73107
/*
@@ -89,30 +123,56 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
89123
size_t
90124
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
91125
{
92-
return convert_case(dst, dstsize, src, srclen, CaseUpper);
126+
return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
93127
}
94128

95129
/*
96-
* Implement Unicode Default Case Conversion algorithm.
130+
* If str_casekind is CaseLower or CaseUpper, map each character in the string
131+
* for which a mapping is available.
97132
*
98-
* Map each character in the string for which a mapping is available.
133+
* If str_casekind is CaseTitle, map characters found on a word boundary to
134+
* uppercase and other characters to lowercase.
99135
*/
100136
static size_t
101137
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
102-
CaseKind casekind)
138+
CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
103139
{
140+
/* character CaseKind varies while titlecasing */
141+
CaseKind chr_casekind = str_casekind;
104142
size_t srcoff = 0;
105143
size_t result_len = 0;
144+
size_t boundary = 0;
145+
146+
Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
147+
(str_casekind != CaseTitle && !wbnext && !wbstate));
148+
149+
if (str_casekind == CaseTitle)
150+
{
151+
boundary = wbnext(wbstate);
152+
Assert(boundary == 0); /* start of text is always a boundary */
153+
}
106154

107155
while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
108156
{
109157
pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
110158
int u1len = unicode_utf8len(u1);
111159
const pg_case_map *casemap = find_case_map(u1);
112160

161+
if (str_casekind == CaseTitle)
162+
{
163+
if (srcoff == boundary)
164+
{
165+
chr_casekind = CaseUpper;
166+
boundary = wbnext(wbstate);
167+
}
168+
else
169+
chr_casekind = CaseLower;
170+
}
171+
172+
/* perform mapping, update result_len, and write to dst */
113173
if (casemap)
114174
{
115-
pg_wchar u2 = casemap->simplemap[casekind];
175+
pg_wchar u2 = casemap->simplemap[chr_casekind];
116176
pg_wchar u2len = unicode_utf8len(u2);
117177

118178
if (result_len + u2len <= dstsize)

src/include/common/unicode_case.h

+5
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,16 @@
1616

1717
#include "mb/pg_wchar.h"
1818

19+
typedef size_t (*WordBoundaryNext) (void *wbstate);
20+
1921
pg_wchar unicode_lowercase_simple(pg_wchar ucs);
2022
pg_wchar unicode_titlecase_simple(pg_wchar ucs);
2123
pg_wchar unicode_uppercase_simple(pg_wchar ucs);
2224
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
2325
ssize_t srclen);
26+
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
27+
ssize_t srclen, WordBoundaryNext wbnext,
28+
void *wbstate);
2429
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
2530
ssize_t srclen);
2631

0 commit comments

Comments
 (0)