Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 654dcfb

Browse files
committed
Clean up ts_locale.h/.c. Fix the broken and platform-inconsistent
behavior of wchar2char/char2wchar; this should resolve bug #3730. Avoid redundant calls to pg_mblen in t_isalpha and friends. Const-ify APIs where possible.
1 parent 83290b6 commit 654dcfb

File tree

5 files changed

+150
-112
lines changed

5 files changed

+150
-112
lines changed

src/backend/tsearch/ts_locale.c

+121-64
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
/*-------------------------------------------------------------------------
22
*
33
* ts_locale.c
4-
* locale compatiblility layer for tsearch
4+
* locale compatibility layer for tsearch
55
*
66
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.3 2007/11/09 22:37:35 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -16,113 +16,174 @@
1616
#include "tsearch/ts_locale.h"
1717
#include "tsearch/ts_public.h"
1818

19-
#ifdef TS_USE_WIDE
2019

21-
#ifdef WIN32
20+
#ifdef TS_USE_WIDE
2221

22+
/*
23+
* wchar2char --- convert wide characters to multibyte format
24+
*
25+
* This has the same API as the standard wcstombs() function; in particular,
26+
* tolen is the maximum number of bytes to store at *to, and *from should be
27+
* zero-terminated. The output will be zero-terminated iff there is room.
28+
*/
2329
size_t
24-
wchar2char(char *to, const wchar_t *from, size_t len)
30+
wchar2char(char *to, const wchar_t *from, size_t tolen)
2531
{
26-
if (len == 0)
32+
if (tolen == 0)
2733
return 0;
2834

35+
#ifdef WIN32
2936
if (GetDatabaseEncoding() == PG_UTF8)
3037
{
3138
int r;
3239

33-
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
40+
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, tolen,
3441
NULL, NULL);
3542

36-
if (r == 0)
37-
ereport(ERROR,
38-
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
39-
errmsg("UTF-16 to UTF-8 translation failed: %lu",
40-
GetLastError())));
41-
Assert(r <= len);
43+
if (r <= 0)
44+
return (size_t) -1;
45+
46+
Assert(r <= tolen);
4247

43-
return r;
48+
/* Microsoft counts the zero terminator in the result */
49+
return r-1;
4450
}
51+
#endif /* WIN32 */
4552

46-
return wcstombs(to, from, len);
53+
return wcstombs(to, from, tolen);
4754
}
48-
#endif /* WIN32 */
4955

56+
/*
57+
* char2wchar --- convert multibyte characters to wide characters
58+
*
59+
* This has almost the API of mbstowcs(), except that *from need not be
60+
* null-terminated; instead, the number of input bytes is specified as
61+
* fromlen. Also, we ereport() rather than returning -1 for invalid
62+
* input encoding. tolen is the maximum number of wchar_t's to store at *to.
63+
* The output will be zero-terminated iff there is room.
64+
*/
5065
size_t
51-
char2wchar(wchar_t *to, const char *from, size_t len)
66+
char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
5267
{
53-
if (len == 0)
68+
if (tolen == 0)
5469
return 0;
5570

5671
#ifdef WIN32
5772
if (GetDatabaseEncoding() == PG_UTF8)
5873
{
5974
int r;
6075

61-
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
76+
r = MultiByteToWideChar(CP_UTF8, 0, from, fromlen, to, tolen);
6277

63-
if (!r)
78+
if (r <= 0)
6479
{
65-
pg_verifymbstr(from, len, false);
80+
pg_verifymbstr(from, fromlen, false);
6681
ereport(ERROR,
6782
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
6883
errmsg("invalid multibyte character for locale"),
6984
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
7085
}
7186

72-
Assert(r <= len);
87+
Assert(r <= tolen);
7388

74-
return r;
89+
/* Microsoft counts the zero terminator in the result */
90+
return r-1;
7591
}
76-
else
7792
#endif /* WIN32 */
93+
7894
if (lc_ctype_is_c())
7995
{
8096
/*
8197
* pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
8298
* allocated with sufficient space
8399
*/
84-
return pg_mb2wchar_with_len(from, (pg_wchar *) to, len);
100+
return pg_mb2wchar_with_len(from, (pg_wchar *) to, fromlen);
85101
}
86102
else
87103
{
88104
/*
89-
* mbstowcs require ending '\0'
105+
* mbstowcs requires ending '\0'
90106
*/
91-
char *str = pnstrdup(from, len);
92-
size_t tolen;
107+
char *str = pnstrdup(from, fromlen);
108+
size_t result;
109+
110+
result = mbstowcs(to, str, tolen);
93111

94-
tolen = mbstowcs(to, str, len);
95112
pfree(str);
96113

97-
return tolen;
114+
if (result == (size_t) -1)
115+
{
116+
pg_verifymbstr(from, fromlen, false);
117+
ereport(ERROR,
118+
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
119+
errmsg("invalid multibyte character for locale"),
120+
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
121+
}
122+
123+
if (result < tolen)
124+
to[result] = 0;
125+
126+
return result;
98127
}
99128
}
100129

130+
101131
int
102-
_t_isalpha(const char *ptr)
132+
t_isdigit(const char *ptr)
103133
{
134+
int clen = pg_mblen(ptr);
104135
wchar_t character[2];
105136

106-
if (lc_ctype_is_c())
137+
if (clen == 1 || lc_ctype_is_c())
138+
return isdigit(TOUCHAR(ptr));
139+
140+
char2wchar(character, 2, ptr, clen);
141+
142+
return iswdigit((wint_t) character[0]);
143+
}
144+
145+
int
146+
t_isspace(const char *ptr)
147+
{
148+
int clen = pg_mblen(ptr);
149+
wchar_t character[2];
150+
151+
if (clen == 1 || lc_ctype_is_c())
152+
return isspace(TOUCHAR(ptr));
153+
154+
char2wchar(character, 2, ptr, clen);
155+
156+
return iswspace((wint_t) character[0]);
157+
}
158+
159+
int
160+
t_isalpha(const char *ptr)
161+
{
162+
int clen = pg_mblen(ptr);
163+
wchar_t character[2];
164+
165+
if (clen == 1 || lc_ctype_is_c())
107166
return isalpha(TOUCHAR(ptr));
108167

109-
char2wchar(character, ptr, 1);
168+
char2wchar(character, 2, ptr, clen);
110169

111-
return iswalpha((wint_t) *character);
170+
return iswalpha((wint_t) character[0]);
112171
}
113172

114173
int
115-
_t_isprint(const char *ptr)
174+
t_isprint(const char *ptr)
116175
{
176+
int clen = pg_mblen(ptr);
117177
wchar_t character[2];
118178

119-
if (lc_ctype_is_c())
179+
if (clen == 1 || lc_ctype_is_c())
120180
return isprint(TOUCHAR(ptr));
121181

122-
char2wchar(character, ptr, 1);
182+
char2wchar(character, 2, ptr, clen);
123183

124-
return iswprint((wint_t) *character);
184+
return iswprint((wint_t) character[0]);
125185
}
186+
126187
#endif /* TS_USE_WIDE */
127188

128189

@@ -168,19 +229,27 @@ t_readline(FILE *fp)
168229
return recoded;
169230
}
170231

232+
/*
233+
* lowerstr --- fold null-terminated string to lower case
234+
*
235+
* Returned string is palloc'd
236+
*/
171237
char *
172-
lowerstr(char *str)
238+
lowerstr(const char *str)
173239
{
174240
return lowerstr_with_len(str, strlen(str));
175241
}
176242

177243
/*
244+
* lowerstr_with_len --- fold string to lower case
245+
*
246+
* Input string need not be null-terminated.
247+
*
178248
* Returned string is palloc'd
179249
*/
180250
char *
181-
lowerstr_with_len(char *str, int len)
251+
lowerstr_with_len(const char *str, int len)
182252
{
183-
char *ptr = str;
184253
char *out;
185254

186255
if (len == 0)
@@ -202,23 +271,13 @@ lowerstr_with_len(char *str, int len)
202271

203272
/*
204273
* alloc number of wchar_t for worst case, len contains number of
205-
* bytes <= number of characters and alloc 1 wchar_t for 0, because
206-
* wchar2char(wcstombs in really) wants zero-terminated string
274+
* bytes >= number of characters and alloc 1 wchar_t for 0, because
275+
* wchar2char wants zero-terminated string
207276
*/
208277
wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
209278

210-
/*
211-
* str SHOULD be cstring, so wlen contains number of converted
212-
* character
213-
*/
214-
wlen = char2wchar(wstr, str, len);
215-
if (wlen < 0)
216-
ereport(ERROR,
217-
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
218-
errmsg("translation failed from server encoding to wchar_t")));
219-
279+
wlen = char2wchar(wstr, len+1, str, len);
220280
Assert(wlen <= len);
221-
wstr[wlen] = 0;
222281

223282
while (*wptr)
224283
{
@@ -229,31 +288,29 @@ lowerstr_with_len(char *str, int len)
229288
/*
230289
* Alloc result string for worst case + '\0'
231290
*/
232-
len = sizeof(char) * pg_database_encoding_max_length() *(wlen + 1);
291+
len = pg_database_encoding_max_length() * wlen + 1;
233292
out = (char *) palloc(len);
234293

235-
/*
236-
* wlen now is number of bytes which is always >= number of characters
237-
*/
238294
wlen = wchar2char(out, wstr, len);
295+
239296
pfree(wstr);
240297

241298
if (wlen < 0)
242299
ereport(ERROR,
243300
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
244-
errmsg("translation failed from wchar_t to server encoding %d", errno)));
245-
Assert(wlen <= len);
246-
out[wlen] = '\0';
301+
errmsg("translation from wchar_t to server encoding failed: %m")));
302+
Assert(wlen < len);
247303
}
248304
else
249-
#endif
305+
#endif /* TS_USE_WIDE */
250306
{
307+
const char *ptr = str;
251308
char *outptr;
252309

253310
outptr = out = (char *) palloc(sizeof(char) * (len + 1));
254-
while (*ptr && ptr - str < len)
311+
while ((ptr - str) < len && *ptr)
255312
{
256-
*outptr++ = tolower(*(unsigned char *) ptr);
313+
*outptr++ = tolower(TOUCHAR(ptr));
257314
ptr++;
258315
}
259316
*outptr = '\0';

src/backend/tsearch/ts_utils.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.4 2007/09/04 02:16:56 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.5 2007/11/09 22:37:35 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -75,7 +75,7 @@ comparestr(const void *a, const void *b)
7575
* or palloc a new version.
7676
*/
7777
void
78-
readstoplist(const char *fname, StopList *s, char *(*wordop) (char *))
78+
readstoplist(const char *fname, StopList *s, char *(*wordop) (const char *))
7979
{
8080
char **stop = NULL;
8181

src/backend/tsearch/wparser_def.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.7 2007/10/27 19:03:45 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.8 2007/11/09 22:37:35 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -294,12 +294,12 @@ TParserInit(char *str, int len)
294294
/*
295295
* Use wide char code only when max encoding length > 1.
296296
*/
297-
298297
if (prs->charmaxlen > 1)
299298
{
300299
prs->usewide = true;
301300
prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
302-
prs->lenwstr = char2wchar(prs->wstr, prs->str, prs->lenstr);
301+
prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
302+
prs->str, prs->lenstr);
303303
}
304304
else
305305
#endif

0 commit comments

Comments
 (0)