Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 9bbf6f7

Browse files
committed
Prevent regexp back-refs from sometimes matching when they shouldn't.
The recursion in cdissect() was careless about clearing match data for capturing parentheses after rejecting a partial match. This could allow a later back-reference to succeed when by rights it should fail for lack of a defined referent. To fix, think a little more rigorously about what the contract between different levels of cdissect's recursion needs to be. With the right spec, we can fix this using fewer rather than more resets of the match data; the key decision being that a failed sub-match is now explicitly responsible for clearing any matches it may have set. There are enough other cross-checks and optimizations in the code that it's not especially easy to exhibit this problem; usually, the match will fail as-expected. Plus, regexps that are even potentially vulnerable are most likely user errors, since there's just not much point in writing a back-ref that doesn't always have a referent. These facts perhaps explain why the issue hasn't been detected, even though it's almost certainly a couple of decades old. Discussion: https://postgr.es/m/151435.1629733387@sss.pgh.pa.us
1 parent 515e3d8 commit 9bbf6f7

File tree

5 files changed

+71
-12
lines changed

5 files changed

+71
-12
lines changed

src/backend/regex/regexec.c

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -216,14 +216,14 @@ pg_regexec(regex_t *re,
216216
if (backref && nmatch <= v->g->nsub)
217217
{
218218
/* need larger work area */
219-
if (v->g->nsub + 1 <= LOCALMAT)
219+
v->nmatch = v->g->nsub + 1;
220+
if (v->nmatch <= LOCALMAT)
220221
v->pmatch = mat;
221222
else
222-
v->pmatch = (regmatch_t *) MALLOC((v->g->nsub + 1) *
223-
sizeof(regmatch_t));
223+
v->pmatch = (regmatch_t *) MALLOC(v->nmatch * sizeof(regmatch_t));
224224
if (v->pmatch == NULL)
225225
return REG_ESPACE;
226-
v->nmatch = v->g->nsub + 1;
226+
zapallsubs(v->pmatch, v->nmatch);
227227
}
228228
else
229229
{
@@ -488,7 +488,6 @@ find(struct vars *v,
488488
return REG_OKAY;
489489

490490
/* find submatches */
491-
zapallsubs(v->pmatch, v->nmatch);
492491
return cdissect(v, v->g->tree, begin, end);
493492
}
494493

@@ -599,7 +598,6 @@ cfindloop(struct vars *v,
599598
break; /* no match with this begin point, try next */
600599
MDEBUG(("tentative end %ld\n", LOFF(end)));
601600
/* Dissect the potential match to see if it really matches */
602-
zapallsubs(v->pmatch, v->nmatch);
603601
er = cdissect(v, v->g->tree, begin, end);
604602
if (er == REG_OKAY)
605603
{
@@ -647,6 +645,8 @@ cfindloop(struct vars *v,
647645

648646
/*
649647
* zapallsubs - initialize all subexpression matches to "no match"
648+
*
649+
* Note that p[0], the overall-match location, is not touched.
650650
*/
651651
static void
652652
zapallsubs(regmatch_t *p,
@@ -716,8 +716,30 @@ subset(struct vars *v,
716716
* DFA and found that the proposed substring satisfies the DFA. (We make
717717
* the caller do that because in concatenation and iteration nodes, it's
718718
* much faster to check all the substrings against the child DFAs before we
719-
* recurse.) Also, caller must have cleared subexpression match data via
720-
* zaptreesubs (or zapallsubs at the top level).
719+
* recurse.)
720+
*
721+
* A side-effect of a successful match is to save match locations for
722+
* capturing subexpressions in v->pmatch[]. This is a little bit tricky,
723+
* so we make the following rules:
724+
* 1. Before initial entry to cdissect, all match data must have been
725+
* cleared (this is seen to by zapallsubs).
726+
* 2. Before any recursive entry to cdissect, the match data for that
727+
* subexpression tree must be guaranteed clear (see zaptreesubs).
728+
* 3. When returning REG_OKAY, each level of cdissect will have saved
729+
* any relevant match locations.
730+
* 4. When returning REG_NOMATCH, each level of cdissect will guarantee
731+
* that its subexpression match locations are again clear.
732+
* 5. No guarantees are made for error cases (i.e., other result codes).
733+
* 6. When a level of cdissect abandons a successful sub-match, it will
734+
* clear that subtree's match locations with zaptreesubs before trying
735+
* any new DFA match or cdissect call for that subtree or any subtree
736+
* to its right (that is, any subtree that could have a backref into the
737+
* abandoned match).
738+
* This may seem overly complicated, but it's difficult to simplify it
739+
* because of the provision that match locations must be reset before
740+
* any fresh DFA match (a rule that is needed to make dfa_backref safe).
741+
* That means it won't work to just reset relevant match locations at the
742+
* start of each cdissect level.
721743
*/
722744
static int /* regexec return code */
723745
cdissect(struct vars *v,
@@ -841,6 +863,8 @@ ccondissect(struct vars *v,
841863
MDEBUG(("%d: successful\n", t->id));
842864
return REG_OKAY;
843865
}
866+
/* Reset left's matches (right should have done so itself) */
867+
zaptreesubs(v, left);
844868
}
845869
if (er != REG_NOMATCH)
846870
return er;
@@ -863,8 +887,6 @@ ccondissect(struct vars *v,
863887
return REG_NOMATCH;
864888
}
865889
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
866-
zaptreesubs(v, left);
867-
zaptreesubs(v, right);
868890
}
869891

870892
/* can't get here */
@@ -922,6 +944,8 @@ crevcondissect(struct vars *v,
922944
MDEBUG(("%d: successful\n", t->id));
923945
return REG_OKAY;
924946
}
947+
/* Reset left's matches (right should have done so itself) */
948+
zaptreesubs(v, left);
925949
}
926950
if (er != REG_NOMATCH)
927951
return er;
@@ -944,8 +968,6 @@ crevcondissect(struct vars *v,
944968
return REG_NOMATCH;
945969
}
946970
MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
947-
zaptreesubs(v, left);
948-
zaptreesubs(v, right);
949971
}
950972

951973
/* can't get here */
@@ -1214,6 +1236,7 @@ citerdissect(struct vars *v,
12141236

12151237
for (i = nverified + 1; i <= k; i++)
12161238
{
1239+
/* zap any match data from a non-last iteration */
12171240
zaptreesubs(v, t->child);
12181241
er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
12191242
if (er == REG_OKAY)
@@ -1426,6 +1449,7 @@ creviterdissect(struct vars *v,
14261449

14271450
for (i = nverified + 1; i <= k; i++)
14281451
{
1452+
/* zap any match data from a non-last iteration */
14291453
zaptreesubs(v, t->child);
14301454
er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
14311455
if (er == REG_OKAY)

src/test/modules/test_regex/expected/test_regex.out

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2664,6 +2664,20 @@ select * from test_regex('^(.+)( \1)+$', 'abc abc abd', 'RP');
26642664
{2,REG_UBACKREF,REG_UNONPOSIX}
26652665
(1 row)
26662666

2667+
-- expectNomatch 14.30 RP {^(.)\1|\1.} {abcdef}
2668+
select * from test_regex('^(.)\1|\1.', 'abcdef', 'RP');
2669+
test_regex
2670+
--------------------------------
2671+
{1,REG_UBACKREF,REG_UNONPOSIX}
2672+
(1 row)
2673+
2674+
-- expectNomatch 14.31 RP {^((.)\2|..)\2} {abadef}
2675+
select * from test_regex('^((.)\2|..)\2', 'abadef', 'RP');
2676+
test_regex
2677+
--------------------------------
2678+
{2,REG_UBACKREF,REG_UNONPOSIX}
2679+
(1 row)
2680+
26672681
-- back reference only matches the string, not any constraints
26682682
select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
26692683
test_regex

src/test/modules/test_regex/sql/test_regex.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,10 @@ select * from test_regex('^(.+)( \1)+$', 'abc abc abc', 'RP');
775775
select * from test_regex('^(.+)( \1)+$', 'abc abd abc', 'RP');
776776
-- expectNomatch 14.29 RP {^(.+)( \1)+$} {abc abc abd}
777777
select * from test_regex('^(.+)( \1)+$', 'abc abc abd', 'RP');
778+
-- expectNomatch 14.30 RP {^(.)\1|\1.} {abcdef}
779+
select * from test_regex('^(.)\1|\1.', 'abcdef', 'RP');
780+
-- expectNomatch 14.31 RP {^((.)\2|..)\2} {abadef}
781+
select * from test_regex('^((.)\2|..)\2', 'abadef', 'RP');
778782

779783
-- back reference only matches the string, not any constraints
780784
select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');

src/test/regress/expected/regex.out

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,19 @@ select 'a' ~ '()+\1';
567567
t
568568
(1 row)
569569

570+
-- Test ancient oversight in when to apply zaptreesubs
571+
select 'abcdef' ~ '^(.)\1|\1.' as f;
572+
f
573+
---
574+
f
575+
(1 row)
576+
577+
select 'abadef' ~ '^((.)\2|..)\2' as f;
578+
f
579+
---
580+
f
581+
(1 row)
582+
570583
-- Add coverage for some cases in checkmatchall
571584
select regexp_match('xy', '.|...');
572585
regexp_match

src/test/regress/sql/regex.sql

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,10 @@ select 'a' ~ '.. ()|\1';
135135
select 'a' ~ '()*\1';
136136
select 'a' ~ '()+\1';
137137

138+
-- Test ancient oversight in when to apply zaptreesubs
139+
select 'abcdef' ~ '^(.)\1|\1.' as f;
140+
select 'abadef' ~ '^((.)\2|..)\2' as f;
141+
138142
-- Add coverage for some cases in checkmatchall
139143
select regexp_match('xy', '.|...');
140144
select regexp_match('xyz', '.|...');

0 commit comments

Comments
 (0)