1
1
/*-------------------------------------------------------------------------
2
2
*
3
3
* ts_locale.c
4
- * locale compatiblility layer for tsearch
4
+ * locale compatibility layer for tsearch
5
5
*
6
6
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
7
7
*
8
8
*
9
9
* IDENTIFICATION
10
- * $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.2 2007/08/25 00:03:59 tgl Exp $
10
+ * $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.3 2007/11/09 22:37:35 tgl Exp $
11
11
*
12
12
*-------------------------------------------------------------------------
13
13
*/
16
16
#include "tsearch/ts_locale.h"
17
17
#include "tsearch/ts_public.h"
18
18
19
- #ifdef TS_USE_WIDE
20
19
21
- #ifdef WIN32
20
+ #ifdef TS_USE_WIDE
22
21
22
/*
 * wchar2char --- turn a zero-terminated wide-character string into its
 * multibyte form
 *
 * The calling convention mirrors the standard wcstombs(): at most tolen
 * bytes are stored at *to, *from has to be zero-terminated, and the output
 * receives a zero terminator iff there is room for it.  The result is the
 * number of bytes produced (terminator excluded), or (size_t) -1 when the
 * conversion fails.
 */
size_t
wchar2char(char *to, const wchar_t *from, size_t tolen)
{
	/* An empty destination cannot receive anything. */
	if (tolen == 0)
		return 0;

#ifdef WIN32
	if (GetDatabaseEncoding() == PG_UTF8)
	{
		int			nbytes = WideCharToMultiByte(CP_UTF8, 0, from, -1,
												 to, tolen, NULL, NULL);

		if (nbytes <= 0)
			return (size_t) -1;

		Assert(nbytes <= tolen);

		/* The Windows API includes the zero terminator in its count. */
		return nbytes - 1;
	}
#endif   /* WIN32 */

	return wcstombs(to, from, tolen);
}
48
- #endif /* WIN32 */
49
55
56
+ /*
57
+ * char2wchar --- convert multibyte characters to wide characters
58
+ *
59
+ * This has almost the API of mbstowcs(), except that *from need not be
60
+ * null-terminated; instead, the number of input bytes is specified as
61
+ * fromlen. Also, we ereport() rather than returning -1 for invalid
62
+ * input encoding. tolen is the maximum number of wchar_t's to store at *to.
63
+ * The output will be zero-terminated iff there is room.
64
+ */
50
65
size_t
51
- char2wchar (wchar_t * to , const char * from , size_t len )
66
+ char2wchar (wchar_t * to , size_t tolen , const char * from , size_t fromlen )
52
67
{
53
- if (len == 0 )
68
+ if (tolen == 0 )
54
69
return 0 ;
55
70
56
71
#ifdef WIN32
57
72
if (GetDatabaseEncoding () == PG_UTF8 )
58
73
{
59
74
int r ;
60
75
61
- r = MultiByteToWideChar (CP_UTF8 , 0 , from , len , to , len );
76
+ r = MultiByteToWideChar (CP_UTF8 , 0 , from , fromlen , to , tolen );
62
77
63
- if (! r )
78
+ if (r <= 0 )
64
79
{
65
- pg_verifymbstr (from , len , false);
80
+ pg_verifymbstr (from , fromlen , false);
66
81
ereport (ERROR ,
67
82
(errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
68
83
errmsg ("invalid multibyte character for locale" ),
69
84
errhint ("The server's LC_CTYPE locale is probably incompatible with the database encoding." )));
70
85
}
71
86
72
- Assert (r <= len );
87
+ Assert (r <= tolen );
73
88
74
- return r ;
89
+ /* Microsoft counts the zero terminator in the result */
90
+ return r - 1 ;
75
91
}
76
- else
77
92
#endif /* WIN32 */
93
+
78
94
if (lc_ctype_is_c ())
79
95
{
80
96
/*
81
97
* pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
82
98
* allocated with sufficient space
83
99
*/
84
- return pg_mb2wchar_with_len (from , (pg_wchar * ) to , len );
100
+ return pg_mb2wchar_with_len (from , (pg_wchar * ) to , fromlen );
85
101
}
86
102
else
87
103
{
88
104
/*
89
- * mbstowcs require ending '\0'
105
+ * mbstowcs requires ending '\0'
90
106
*/
91
- char * str = pnstrdup (from , len );
92
- size_t tolen ;
107
+ char * str = pnstrdup (from , fromlen );
108
+ size_t result ;
109
+
110
+ result = mbstowcs (to , str , tolen );
93
111
94
- tolen = mbstowcs (to , str , len );
95
112
pfree (str );
96
113
97
- return tolen ;
114
+ if (result == (size_t ) -1 )
115
+ {
116
+ pg_verifymbstr (from , fromlen , false);
117
+ ereport (ERROR ,
118
+ (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
119
+ errmsg ("invalid multibyte character for locale" ),
120
+ errhint ("The server's LC_CTYPE locale is probably incompatible with the database encoding." )));
121
+ }
122
+
123
+ if (result < tolen )
124
+ to [result ] = 0 ;
125
+
126
+ return result ;
98
127
}
99
128
}
100
129
130
+
101
131
/*
 * t_isdigit --- is the character starting at ptr a digit?
 *
 * When pg_mblen() reports a length of 1 for the character, or
 * lc_ctype_is_c() is true, the first byte is classified with isdigit().
 * Otherwise the character is converted with char2wchar() and the resulting
 * wide character is classified with iswdigit().
 */
int
t_isdigit(const char *ptr)
{
	int			nbytes = pg_mblen(ptr);
	wchar_t		wbuf[2];

	if (nbytes != 1 && !lc_ctype_is_c())
	{
		/* room for one wide character plus a terminator */
		char2wchar(wbuf, 2, ptr, nbytes);
		return iswdigit((wint_t) wbuf[0]);
	}

	return isdigit(TOUCHAR(ptr));
}
144
+
145
/*
 * t_isspace --- is the character starting at ptr whitespace?
 *
 * When pg_mblen() reports a length of 1 for the character, or
 * lc_ctype_is_c() is true, the first byte is classified with isspace().
 * Otherwise the character is converted with char2wchar() and the resulting
 * wide character is classified with iswspace().
 */
int
t_isspace(const char *ptr)
{
	int			nbytes = pg_mblen(ptr);
	wchar_t		wbuf[2];

	if (nbytes != 1 && !lc_ctype_is_c())
	{
		/* room for one wide character plus a terminator */
		char2wchar(wbuf, 2, ptr, nbytes);
		return iswspace((wint_t) wbuf[0]);
	}

	return isspace(TOUCHAR(ptr));
}
158
+
159
/*
 * t_isalpha --- is the character starting at ptr alphabetic?
 *
 * When pg_mblen() reports a length of 1 for the character, or
 * lc_ctype_is_c() is true, the first byte is classified with isalpha().
 * Otherwise the character is converted with char2wchar() and the resulting
 * wide character is classified with iswalpha().
 */
int
t_isalpha(const char *ptr)
{
	int			nbytes = pg_mblen(ptr);
	wchar_t		wbuf[2];

	if (nbytes != 1 && !lc_ctype_is_c())
	{
		/* room for one wide character plus a terminator */
		char2wchar(wbuf, 2, ptr, nbytes);
		return iswalpha((wint_t) wbuf[0]);
	}

	return isalpha(TOUCHAR(ptr));
}
113
172
114
173
/*
 * t_isprint --- is the character starting at ptr printable?
 *
 * When pg_mblen() reports a length of 1 for the character, or
 * lc_ctype_is_c() is true, the first byte is classified with isprint().
 * Otherwise the character is converted with char2wchar() and the resulting
 * wide character is classified with iswprint().
 */
int
t_isprint(const char *ptr)
{
	int			nbytes = pg_mblen(ptr);
	wchar_t		wbuf[2];

	if (nbytes != 1 && !lc_ctype_is_c())
	{
		/* room for one wide character plus a terminator */
		char2wchar(wbuf, 2, ptr, nbytes);
		return iswprint((wint_t) wbuf[0]);
	}

	return isprint(TOUCHAR(ptr));
}
186
+
126
187
#endif /* TS_USE_WIDE */
127
188
128
189
@@ -168,19 +229,27 @@ t_readline(FILE *fp)
168
229
return recoded ;
169
230
}
170
231
232
/*
 * lowerstr --- produce a lower-cased copy of a null-terminated string
 *
 * Measures the input with strlen() and hands it to lowerstr_with_len().
 * The returned string is palloc'd.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
242
177
243
/*
244
+ * lowerstr_with_len --- fold string to lower case
245
+ *
246
+ * Input string need not be null-terminated.
247
+ *
178
248
* Returned string is palloc'd
179
249
*/
180
250
char *
181
- lowerstr_with_len (char * str , int len )
251
+ lowerstr_with_len (const char * str , int len )
182
252
{
183
- char * ptr = str ;
184
253
char * out ;
185
254
186
255
if (len == 0 )
@@ -202,23 +271,13 @@ lowerstr_with_len(char *str, int len)
202
271
203
272
/*
204
273
* alloc number of wchar_t for worst case, len contains number of
205
- * bytes < = number of characters and alloc 1 wchar_t for 0, because
206
- * wchar2char(wcstombs in really) wants zero-terminated string
274
+ * bytes > = number of characters and alloc 1 wchar_t for 0, because
275
+ * wchar2char wants zero-terminated string
207
276
*/
208
277
wptr = wstr = (wchar_t * ) palloc (sizeof (wchar_t ) * (len + 1 ));
209
278
210
- /*
211
- * str SHOULD be cstring, so wlen contains number of converted
212
- * character
213
- */
214
- wlen = char2wchar (wstr , str , len );
215
- if (wlen < 0 )
216
- ereport (ERROR ,
217
- (errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
218
- errmsg ("translation failed from server encoding to wchar_t" )));
219
-
279
+ wlen = char2wchar (wstr , len + 1 , str , len );
220
280
Assert (wlen <= len );
221
- wstr [wlen ] = 0 ;
222
281
223
282
while (* wptr )
224
283
{
@@ -229,31 +288,29 @@ lowerstr_with_len(char *str, int len)
229
288
/*
230
289
* Alloc result string for worst case + '\0'
231
290
*/
232
- len = sizeof ( char ) * pg_database_encoding_max_length () * ( wlen + 1 ) ;
291
+ len = pg_database_encoding_max_length () * wlen + 1 ;
233
292
out = (char * ) palloc (len );
234
293
235
- /*
236
- * wlen now is number of bytes which is always >= number of characters
237
- */
238
294
wlen = wchar2char (out , wstr , len );
295
+
239
296
pfree (wstr );
240
297
241
298
if (wlen < 0 )
242
299
ereport (ERROR ,
243
300
(errcode (ERRCODE_CHARACTER_NOT_IN_REPERTOIRE ),
244
- errmsg ("translation failed from wchar_t to server encoding %d" , errno )));
245
- Assert (wlen <= len );
246
- out [wlen ] = '\0' ;
301
+ errmsg ("translation from wchar_t to server encoding failed: %m" )));
302
+ Assert (wlen < len );
247
303
}
248
304
else
249
- #endif
305
+ #endif /* TS_USE_WIDE */
250
306
{
307
+ const char * ptr = str ;
251
308
char * outptr ;
252
309
253
310
outptr = out = (char * ) palloc (sizeof (char ) * (len + 1 ));
254
- while (* ptr && ptr - str < len )
311
+ while (( ptr - str ) < len && * ptr )
255
312
{
256
- * outptr ++ = tolower (* ( unsigned char * ) ptr );
313
+ * outptr ++ = tolower (TOUCHAR ( ptr ) );
257
314
ptr ++ ;
258
315
}
259
316
* outptr = '\0' ;
0 commit comments