Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit c50d192

Browse files
committed
Fix ts_delete(tsvector, text[]) to cope with duplicate array entries.
Such cases either failed an Assert, or produced a corrupt tsvector in non-Assert builds, as reported by Andreas Seltenreich. The reason is that tsvector_delete_by_indices() just assumed that its input array had no duplicates. Fix by explicitly de-duping.

In passing, improve some comments, and fix a number of tests for null values to use ERRCODE_NULL_VALUE_NOT_ALLOWED not ERRCODE_INVALID_PARAMETER_VALUE.

Discussion: <87invhoj6e.fsf@credativ.de>
1 parent 33fe736 commit c50d192

File tree

3 files changed

+53
-31
lines changed

3 files changed

+53
-31
lines changed

src/backend/utils/adt/tsvector_op.c

Lines changed: 46 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -317,7 +317,7 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
317317

318318
if (nulls[i])
319319
ereport(ERROR,
320-
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
320+
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
321321
errmsg("lexeme array may not contain nulls")));
322322

323323
lex = VARDATA(dlexemes[i]);
@@ -430,7 +430,7 @@ compareint(const void *va, const void *vb)
430430
/*
431431
* Internal routine to delete lexemes from TSVector by array of offsets.
432432
*
433-
* int *indices_to_delete -- array of lexeme offsets to delete
433+
* int *indices_to_delete -- array of lexeme offsets to delete (modified here!)
434434
* int indices_count -- size of that array
435435
*
436436
* Returns new TSVector without given lexemes along with their positions
@@ -445,52 +445,68 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
445445
*arrout;
446446
char *data = STRPTR(tsv),
447447
*dataout;
448-
int i,
449-
j,
450-
k,
451-
curoff;
448+
int i, /* index in arrin */
449+
j, /* index in arrout */
450+
k, /* index in indices_to_delete */
451+
curoff; /* index in dataout area */
452452

453453
/*
454-
* Here we overestimates tsout size, since we don't know exact size
455-
* occupied by positions and weights. We will set exact size later after a
456-
* pass through TSVector.
454+
* Sort the filter array to simplify membership checks below. Also, get
455+
* rid of any duplicate entries, so that we can assume that indices_count
456+
* is exactly equal to the number of lexemes that will be removed.
457457
*/
458-
tsout = (TSVector) palloc0(VARSIZE(tsv));
459-
arrout = ARRPTR(tsout);
460-
tsout->size = tsv->size - indices_count;
461-
462-
/* Sort our filter array to simplify membership check later. */
463458
if (indices_count > 1)
459+
{
460+
int kp;
461+
464462
qsort(indices_to_delete, indices_count, sizeof(int), compareint);
463+
kp = 0;
464+
for (k = 1; k < indices_count; k++)
465+
{
466+
if (indices_to_delete[k] != indices_to_delete[kp])
467+
indices_to_delete[++kp] = indices_to_delete[k];
468+
}
469+
indices_count = ++kp;
470+
}
465471

466472
/*
467-
* Copy tsv to tsout skipping lexemes that enlisted in indices_to_delete.
473+
* Here we overestimate tsout size, since we don't know how much space is
474+
* used by the deleted lexeme(s). We will set exact size below.
468475
*/
469-
curoff = 0;
476+
tsout = (TSVector) palloc0(VARSIZE(tsv));
477+
478+
/* This count must be correct because STRPTR(tsout) relies on it. */
479+
tsout->size = tsv->size - indices_count;
480+
481+
/*
482+
* Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
483+
*/
484+
arrout = ARRPTR(tsout);
470485
dataout = STRPTR(tsout);
486+
curoff = 0;
471487
for (i = j = k = 0; i < tsv->size; i++)
472488
{
473489
/*
474-
* Here we should check whether current i is present in
475-
* indices_to_delete or not. Since indices_to_delete is already sorted
476-
* we can advance it index only when we have match.
490+
* If current i is present in indices_to_delete, skip this lexeme.
491+
* Since indices_to_delete is already sorted, we only need to check
492+
* the current (k'th) entry.
477493
*/
478494
if (k < indices_count && i == indices_to_delete[k])
479495
{
480496
k++;
481497
continue;
482498
}
483499

484-
/* Copy lexeme, it's positions and weights */
500+
/* Copy lexeme and its positions and weights */
485501
memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
486502
arrout[j].haspos = arrin[i].haspos;
487503
arrout[j].len = arrin[i].len;
488504
arrout[j].pos = curoff;
489505
curoff += arrin[i].len;
490506
if (arrin[i].haspos)
491507
{
492-
int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos) +
493-
sizeof(uint16);
508+
int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
509+
+ sizeof(uint16);
494510

495511
curoff = SHORTALIGN(curoff);
496512
memcpy(dataout + curoff,
@@ -503,10 +519,9 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
503519
}
504520

505521
/*
506-
* After the pass through TSVector k should equals exactly to
507-
* indices_count. If it isn't then the caller provided us with indices
508-
* outside of [0, tsv->size) range and estimation of tsout's size is
509-
* wrong.
522+
* k should now be exactly equal to indices_count. If it isn't then the
523+
* caller provided us with indices outside of [0, tsv->size) range and
524+
* estimation of tsout's size is wrong.
510525
*/
511526
Assert(k == indices_count);
512527

@@ -560,7 +575,7 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)
560575

561576
/*
562577
* In typical use case array of lexemes to delete is relatively small. So
563-
* here we optimizing things for that scenario: iterate through lexarr
578+
* here we optimize things for that scenario: iterate through lexarr
564579
* performing binary search of each lexeme from lexarr in tsvector.
565580
*/
566581
skip_indices = palloc0(nlex * sizeof(int));
@@ -572,10 +587,10 @@ tsvector_delete_arr(PG_FUNCTION_ARGS)
572587

573588
if (nulls[i])
574589
ereport(ERROR,
575-
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
590+
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
576591
errmsg("lexeme array may not contain nulls")));
577592

578-
lex = VARDATA(dlexemes[i]);
593+
lex = VARDATA_ANY(dlexemes[i]);
579594
lex_len = VARSIZE_ANY_EXHDR(dlexemes[i]);
580595
lex_pos = tsvector_bsearch(tsin, lex, lex_len);
581596

@@ -738,7 +753,7 @@ array_to_tsvector(PG_FUNCTION_ARGS)
738753
{
739754
if (nulls[i])
740755
ereport(ERROR,
741-
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
756+
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
742757
errmsg("lexeme array may not contain nulls")));
743758

744759
datalen += VARSIZE_ANY_EXHDR(dlexemes[i]);
@@ -797,7 +812,7 @@ tsvector_filter(PG_FUNCTION_ARGS)
797812

798813
if (nulls[i])
799814
ereport(ERROR,
800-
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
815+
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
801816
errmsg("weight array may not contain nulls")));
802817

803818
char_weight = DatumGetChar(dweights[i]);

src/test/regress/expected/tstypes.out

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1087,6 +1087,12 @@ SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceshi
10871087
'base' 'hidden' 'strike'
10881088
(1 row)
10891089

1090+
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']);
1091+
ts_delete
1092+
--------------------------
1093+
'base' 'hidden' 'strike'
1094+
(1 row)
1095+
10901096
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
10911097
ERROR: lexeme array may not contain nulls
10921098
SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);

src/test/regress/sql/tstypes.sql

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -212,6 +212,7 @@ SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3':
212212
SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceshi','rebel']);
213213
SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']);
214214
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']);
215+
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']);
215216
SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]);
216217

217218
SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector);

0 commit comments

Comments
 (0)