Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit bb36c51

Browse files
committed
Fix several bugs in tsvectorin, including a crash due to an uninitialized field and a miscomputation of the required palloc size. The crash could only occur if the input contained lexemes both with and without positions, which is probably not common in practice. The miscomputation would definitely result in wasted space. Also fix some inconsistent coding around the alignment of strings and positions in a tsvector value; these errors could also lead to crashes given mixed with/without-position data and a machine that's picky about alignment. And be more careful about checking for overflow of string offsets. The patch is only against HEAD --- I have not looked to see whether the same bugs are in the back-branch contrib/tsearch2 code.
1 parent f551348 commit bb36c51

File tree

3 files changed

+166
-126
lines changed

3 files changed

+166
-126
lines changed

src/backend/tsearch/to_tsany.c

+26-17
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/to_tsany.c,v 1.4 2007/09/26 10:09:57 teodor Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/to_tsany.c,v 1.5 2007/10/23 00:51:23 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -140,55 +140,64 @@ uniqueWORD(ParsedWord * a, int4 l)
140140
TSVector
141141
make_tsvector(ParsedText *prs)
142142
{
143-
int4 i,
143+
int i,
144144
j,
145145
lenstr = 0,
146146
totallen;
147147
TSVector in;
148148
WordEntry *ptr;
149-
char *str,
150-
*cur;
149+
char *str;
150+
int stroff;
151151

152152
prs->curwords = uniqueWORD(prs->words, prs->curwords);
153153
for (i = 0; i < prs->curwords; i++)
154154
{
155-
lenstr += SHORTALIGN(prs->words[i].len);
156-
155+
lenstr += prs->words[i].len;
157156
if (prs->words[i].alen)
157+
{
158+
lenstr = SHORTALIGN(lenstr);
158159
lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
160+
}
159161
}
160162

163+
if (lenstr > MAXSTRPOS)
164+
ereport(ERROR,
165+
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
166+
errmsg("string is too long for tsvector")));
167+
161168
totallen = CALCDATASIZE(prs->curwords, lenstr);
162169
in = (TSVector) palloc0(totallen);
163170
SET_VARSIZE(in, totallen);
164171
in->size = prs->curwords;
165172

166173
ptr = ARRPTR(in);
167-
cur = str = STRPTR(in);
174+
str = STRPTR(in);
175+
stroff = 0;
168176
for (i = 0; i < prs->curwords; i++)
169177
{
170178
ptr->len = prs->words[i].len;
171-
if (cur - str > MAXSTRPOS)
172-
ereport(ERROR,
173-
(errcode(ERRCODE_SYNTAX_ERROR),
174-
errmsg("string is too long for tsvector")));
175-
ptr->pos = cur - str;
176-
memcpy((void *) cur, (void *) prs->words[i].word, prs->words[i].len);
179+
ptr->pos = stroff;
180+
memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
181+
stroff += prs->words[i].len;
177182
pfree(prs->words[i].word);
178-
cur += SHORTALIGN(prs->words[i].len);
179183
if (prs->words[i].alen)
180184
{
185+
int k = prs->words[i].pos.apos[0];
181186
WordEntryPos *wptr;
182187

188+
if (k > 0xFFFF)
189+
elog(ERROR, "positions array too long");
190+
183191
ptr->haspos = 1;
184-
*(uint16 *) cur = prs->words[i].pos.apos[0];
192+
stroff = SHORTALIGN(stroff);
193+
*(uint16 *) (str + stroff) = (uint16) k;
185194
wptr = POSDATAPTR(in, ptr);
186-
for (j = 0; j < *(uint16 *) cur; j++)
195+
for (j = 0; j < k; j++)
187196
{
188197
WEP_SETWEIGHT(wptr[j], 0);
189198
WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
190199
}
191-
cur += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
200+
stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
192201
pfree(prs->words[i].pos.apos);
193202
}
194203
else

src/backend/utils/adt/tsvector.c

+74-63
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.5 2007/10/21 22:29:56 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.6 2007/10/23 00:51:23 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -22,16 +22,18 @@
2222

2323
typedef struct
2424
{
25-
WordEntry entry; /* should be first ! */
25+
WordEntry entry; /* must be first! */
2626
WordEntryPos *pos;
2727
int poslen; /* number of elements in pos */
2828
} WordEntryIN;
2929

30+
31+
/* Compare two WordEntryPos values for qsort */
3032
static int
3133
comparePos(const void *a, const void *b)
3234
{
33-
int apos = WEP_GETPOS(*(WordEntryPos *) a);
34-
int bpos = WEP_GETPOS(*(WordEntryPos *) b);
35+
int apos = WEP_GETPOS(*(const WordEntryPos *) a);
36+
int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
3537

3638
if (apos == bpos)
3739
return 0;
@@ -53,17 +55,18 @@ uniquePos(WordEntryPos * a, int l)
5355
if (l <= 1)
5456
return l;
5557

56-
res = a;
5758
qsort((void *) a, l, sizeof(WordEntryPos), comparePos);
5859

60+
res = a;
5961
ptr = a + 1;
6062
while (ptr - a < l)
6163
{
6264
if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
6365
{
6466
res++;
6567
*res = *ptr;
66-
if (res - a >= MAXNUMPOS - 1 || WEP_GETPOS(*res) == MAXENTRYPOS - 1)
68+
if (res - a >= MAXNUMPOS - 1 ||
69+
WEP_GETPOS(*res) == MAXENTRYPOS - 1)
6770
break;
6871
}
6972
else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
@@ -74,12 +77,13 @@ uniquePos(WordEntryPos * a, int l)
7477
return res + 1 - a;
7578
}
7679

80+
/* Compare two WordEntryIN values for qsort */
7781
static int
7882
compareentry(const void *va, const void *vb, void *arg)
7983
{
84+
const WordEntryIN *a = (const WordEntryIN *) va;
85+
const WordEntryIN *b = (const WordEntryIN *) vb;
8086
char *BufferStr = (char *) arg;
81-
WordEntryIN *a = (WordEntryIN *) va;
82-
WordEntryIN *b = (WordEntryIN *) vb;
8387

8488
if (a->entry.len == b->entry.len)
8589
{
@@ -91,82 +95,78 @@ compareentry(const void *va, const void *vb, void *arg)
9195
return (a->entry.len > b->entry.len) ? 1 : -1;
9296
}
9397

98+
/*
99+
* Sort an array of WordEntryIN, remove duplicates.
100+
* *outbuflen receives the amount of space needed for strings and positions.
101+
*/
94102
static int
95103
uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
96104
{
105+
int buflen;
97106
WordEntryIN *ptr,
98107
*res;
99108

100109
Assert(l >= 1);
101110

102-
if (l == 1)
103-
{
104-
if (a->entry.haspos)
105-
{
106-
a->poslen = uniquePos(a->pos, a->poslen);
107-
*outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos);
108-
}
109-
else
110-
*outbuflen = a->entry.len;
111+
if (l > 1)
112+
qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
113+
(void *) buf);
111114

112-
return l;
113-
}
115+
buflen = 0;
114116
res = a;
115-
116117
ptr = a + 1;
117-
qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf);
118-
119118
while (ptr - a < l)
120119
{
121120
if (!(ptr->entry.len == res->entry.len &&
122-
strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0))
121+
strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
122+
res->entry.len) == 0))
123123
{
124+
/* done accumulating data into *res, count space needed */
125+
buflen += res->entry.len;
124126
if (res->entry.haspos)
125127
{
126-
*outbuflen += SHORTALIGN(res->entry.len);
127128
res->poslen = uniquePos(res->pos, res->poslen);
128-
*outbuflen += res->poslen * sizeof(WordEntryPos);
129+
buflen = SHORTALIGN(buflen);
130+
buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
129131
}
130-
else
131-
*outbuflen += res->entry.len;
132132
res++;
133133
memcpy(res, ptr, sizeof(WordEntryIN));
134134
}
135135
else if (ptr->entry.haspos)
136136
{
137137
if (res->entry.haspos)
138138
{
139+
/* append ptr's positions to res's positions */
139140
int newlen = ptr->poslen + res->poslen;
140141

141-
/* Append res to pos */
142-
143-
res->pos = (WordEntryPos *) repalloc(res->pos, newlen * sizeof(WordEntryPos));
144-
memcpy(&res->pos[res->poslen],
145-
ptr->pos, ptr->poslen * sizeof(WordEntryPos));
142+
res->pos = (WordEntryPos *)
143+
repalloc(res->pos, newlen * sizeof(WordEntryPos));
144+
memcpy(&res->pos[res->poslen], ptr->pos,
145+
ptr->poslen * sizeof(WordEntryPos));
146146
res->poslen = newlen;
147147
pfree(ptr->pos);
148148
}
149149
else
150150
{
151+
/* just give ptr's positions to pos */
151152
res->entry.haspos = 1;
152153
res->pos = ptr->pos;
154+
res->poslen = ptr->poslen;
153155
}
154156
}
155157
ptr++;
156158
}
157159

158-
/* add last item */
159-
160+
/* count space needed for last item */
161+
buflen += res->entry.len;
160162
if (res->entry.haspos)
161163
{
162-
*outbuflen += SHORTALIGN(res->entry.len);
163-
164164
res->poslen = uniquePos(res->pos, res->poslen);
165-
*outbuflen += res->poslen * sizeof(WordEntryPos);
165+
buflen = SHORTALIGN(buflen);
166+
buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
166167
}
167-
else
168-
*outbuflen += res->entry.len;
169168

169+
*outbuflen = buflen;
170170
return res + 1 - a;
171171
}
172172

@@ -193,6 +193,8 @@ tsvectorin(PG_FUNCTION_ARGS)
193193
int toklen;
194194
WordEntryPos *pos;
195195
int poslen;
196+
char *strbuf;
197+
int stroff;
196198

197199
/*
198200
* Tokens are appended to tmpbuf, cur is a pointer
@@ -212,27 +214,26 @@ tsvectorin(PG_FUNCTION_ARGS)
212214

213215
while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
214216
{
215-
216217
if (toklen >= MAXSTRLEN)
217218
ereport(ERROR,
218-
(errcode(ERRCODE_SYNTAX_ERROR),
219+
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
219220
errmsg("word is too long (%ld bytes, max %ld bytes)",
220221
(long) toklen,
221-
(long) MAXSTRLEN)));
222-
222+
(long) (MAXSTRLEN-1))));
223223

224224
if (cur - tmpbuf > MAXSTRPOS)
225225
ereport(ERROR,
226-
(errcode(ERRCODE_SYNTAX_ERROR),
227-
errmsg("position value is too large")));
226+
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
227+
errmsg("string is too long for tsvector")));
228228

229229
/*
230230
* Enlarge buffers if needed
231231
*/
232232
if (len >= arrlen)
233233
{
234234
arrlen *= 2;
235-
arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
235+
arr = (WordEntryIN *)
236+
repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
236237
}
237238
while ((cur - tmpbuf) + toklen >= buflen)
238239
{
@@ -254,7 +255,11 @@ tsvectorin(PG_FUNCTION_ARGS)
254255
arr[len].poslen = poslen;
255256
}
256257
else
258+
{
257259
arr[len].entry.haspos = 0;
260+
arr[len].pos = NULL;
261+
arr[len].poslen = 0;
262+
}
258263
len++;
259264
}
260265

@@ -264,40 +269,45 @@ tsvectorin(PG_FUNCTION_ARGS)
264269
len = uniqueentry(arr, len, tmpbuf, &buflen);
265270
else
266271
buflen = 0;
272+
273+
if (buflen > MAXSTRPOS)
274+
ereport(ERROR,
275+
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
276+
errmsg("string is too long for tsvector")));
277+
267278
totallen = CALCDATASIZE(len, buflen);
268279
in = (TSVector) palloc0(totallen);
269-
270280
SET_VARSIZE(in, totallen);
271281
in->size = len;
272-
cur = STRPTR(in);
273282
inarr = ARRPTR(in);
283+
strbuf = STRPTR(in);
284+
stroff = 0;
274285
for (i = 0; i < len; i++)
275286
{
276-
memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
277-
arr[i].entry.pos = cur - STRPTR(in);
278-
cur += SHORTALIGN(arr[i].entry.len);
287+
memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
288+
arr[i].entry.pos = stroff;
289+
stroff += arr[i].entry.len;
279290
if (arr[i].entry.haspos)
280291
{
281-
uint16 tmplen;
282-
283-
if(arr[i].poslen > 0xFFFF)
292+
if (arr[i].poslen > 0xFFFF)
284293
elog(ERROR, "positions array too long");
285294

286-
tmplen = (uint16) arr[i].poslen;
287-
288-
/* Copy length to output struct */
289-
memcpy(cur, &tmplen, sizeof(uint16));
290-
cur += sizeof(uint16);
295+
/* Copy number of positions */
296+
stroff = SHORTALIGN(stroff);
297+
*(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
298+
stroff += sizeof(uint16);
291299

292300
/* Copy positions */
293-
memcpy(cur, arr[i].pos, (arr[i].poslen) * sizeof(WordEntryPos));
294-
cur += arr[i].poslen * sizeof(WordEntryPos);
301+
memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
302+
stroff += arr[i].poslen * sizeof(WordEntryPos);
295303

296304
pfree(arr[i].pos);
297305
}
298306
inarr[i] = arr[i].entry;
299307
}
300308

309+
Assert((strbuf + stroff - (char *) in) == totallen);
310+
301311
PG_RETURN_TSVECTOR(in);
302312
}
303313

@@ -495,11 +505,12 @@ tsvectorrecv(PG_FUNCTION_ARGS)
495505

496506
datalen += lex_len;
497507

498-
if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0)
508+
if (i > 0 && WordEntryCMP(&vec->entries[i],
509+
&vec->entries[i - 1],
510+
STRPTR(vec)) <= 0)
499511
elog(ERROR, "lexemes are misordered");
500512

501513
/* Receive positions */
502-
503514
if (npos > 0)
504515
{
505516
uint16 j;

0 commit comments

Comments
 (0)