Skip to content

Commit b86b415

Browse files
committed
Add capture group range to stored loop state and properly reset capture groups before each repetition
1 parent 1d225e8 commit b86b415

File tree

2 files changed

+175
-31
lines changed

2 files changed

+175
-31
lines changed

stl/inc/regex

Lines changed: 153 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1493,9 +1493,10 @@ public:
14931493
_Node_end_rep& operator=(const _Node_end_rep&) = delete;
14941494
};
14951495

1496-
struct _Loop_vals_t { // storage for loop administration
1497-
int _Loop_idx;
1496+
struct _Loop_vals_v2_t { // storage for loop administration
    // Every member has a default member initializer so that an entry is in a well-defined state
    // no matter how it is created. In particular, _Do_rep_first() treats _Group_first == 0 as
    // "first inner capture group not determined yet", so that value must never be indeterminate.
    // Member order is unchanged.
    void* _Loop_iter          = nullptr; // type-erased pointer to a saved iterator; _Do_rep casts it to _It*
    int _Loop_idx             = 0; // repetition index stored by _Do_rep before matching the loop body
    unsigned int _Group_first = 0; // start of the _Grp_valid range that _Do_rep clears before a repetition
};
15001501

15011502
class _Node_rep : public _Node_base { // node that marks the beginning of a repetition
@@ -1681,13 +1682,15 @@ public:
16811682
private:
16821683
_Tgt_state_t<_It> _Tgt_state;
16831684
_Tgt_state_t<_It> _Res;
1684-
vector<_Loop_vals_t> _Loop_vals;
1685+
vector<_Loop_vals_v2_t> _Loop_vals;
16851686

16861687
bool _Do_assert(_Node_assert*);
16871688
bool _Do_neg_assert(_Node_assert*);
16881689
bool _Do_if(_Node_if*);
16891690
bool _Do_rep0(_Node_rep*, bool);
16901691
bool _Do_rep(_Node_rep*, bool, int);
1692+
bool _Do_rep_first(_Node_rep*);
1693+
bool _Find_first_inner_capture_group(_Node_base*, _Loop_vals_v2_t*);
16911694
bool _Do_class(_Node_base*);
16921695
bool _Match_pat(_Node_base*);
16931696
bool _Better_match();
@@ -3235,6 +3238,13 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node
32353238
_Tgt_state_t<_It> _St = _Tgt_state;
32363239

32373240
for (; _Ix < _Node->_Min; ++_Ix) { // do minimum number of reps
3241+
// GH-5365: We have to reset the capture groups from the second iteration on.
3242+
// We can avoid the reset for the first iteration
3243+
// because we know that a simple repetition was not encountered before.
3244+
if (_Ix > 0) {
3245+
_Tgt_state._Grp_valid = _St._Grp_valid;
3246+
}
3247+
32383248
_It _Cur = _Tgt_state._Cur;
32393249
if (!_Match_pat(_Node->_Next)) { // didn't match minimum number of reps, fail
32403250
_Tgt_state = _St;
@@ -3290,17 +3300,12 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node
32903300
template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
32913301
bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, bool _Greedy, int _Init_idx) {
32923302
// apply repetition
3293-
if (_Node->_Simple_loop == 1) {
3294-
return _Do_rep0(_Node, _Greedy);
3295-
}
3296-
3297-
bool _Matched0 = false;
3298-
_Tgt_state_t<_It> _St = _Tgt_state;
3299-
_Loop_vals_t* _Psav = &_Loop_vals[_Node->_Loop_number];
3300-
int _Loop_idx_sav = _Psav->_Loop_idx;
3301-
_It* _Loop_iter_sav = static_cast<_It*>(_Psav->_Loop_iter);
3302-
3303-
bool _Progress = _Init_idx == 0 || *_Loop_iter_sav != _St._Cur;
3303+
bool _Matched0 = false;
3304+
_Tgt_state_t<_It> _St = _Tgt_state;
3305+
_Loop_vals_v2_t* _Psav = &_Loop_vals[_Node->_Loop_number];
3306+
int _Loop_idx_sav = _Psav->_Loop_idx;
3307+
_It* _Loop_iter_sav = static_cast<_It*>(_Psav->_Loop_iter);
3308+
bool _Progress = _Init_idx == 0 || *_Loop_iter_sav != _St._Cur;
33043309

33053310
if (0 <= _Node->_Max && _Node->_Max <= _Init_idx) {
33063311
_Matched0 = _Match_pat(_Node->_End_rep->_Next); // reps done, try tail
@@ -3310,7 +3315,9 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node,
33103315
} else { // try another required match
33113316
_Psav->_Loop_idx = _Init_idx + 1;
33123317
_Psav->_Loop_iter = _STD addressof(_St._Cur);
3313-
_Matched0 = _Match_pat(_Node->_Next);
3318+
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
3319+
_Tgt_state._Grp_valid.end(), false);
3320+
_Matched0 = _Match_pat(_Node->_Next);
33143321
}
33153322
} else if (_Longest) { // longest, try any number of repetitions
33163323

@@ -3332,13 +3339,17 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node,
33323339
_Tgt_state = _St;
33333340
_Psav->_Loop_idx = _Init_idx + 1;
33343341
_Psav->_Loop_iter = _STD addressof(_St._Cur);
3335-
_Matched0 = _Match_pat(_Node->_Next);
3342+
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
3343+
_Tgt_state._Grp_valid.end(), false);
3344+
_Matched0 = _Match_pat(_Node->_Next);
33363345
}
33373346
} else { // greedy, favor maximum number of reps
33383347
if (_Progress) { // try another rep
33393348
_Psav->_Loop_idx = _Init_idx + 1;
33403349
_Psav->_Loop_iter = _STD addressof(_St._Cur);
3341-
_Matched0 = _Match_pat(_Node->_Next);
3350+
_STD fill(_Tgt_state._Grp_valid.begin() + static_cast<ptrdiff_t>(_Psav->_Group_first),
3351+
_Tgt_state._Grp_valid.end(), false);
3352+
_Matched0 = _Match_pat(_Node->_Next);
33423353
}
33433354

33443355
if ((_Progress || 1 >= _Init_idx) && !_Matched0) { // rep failed, try tail
@@ -3358,6 +3369,127 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node,
33583369
return _Matched0;
33593370
}
33603371

3372+
template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
3373+
bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep_first(_Node_rep* _Node) {
3374+
bool _Greedy = (_Node->_Flags & _Fl_greedy) != 0;
3375+
// apply repetition
3376+
if (_Node->_Simple_loop == 1) {
3377+
return _Do_rep0(_Node, _Greedy);
3378+
}
3379+
_Loop_vals_v2_t* _Psav = &_Loop_vals[_Node->_Loop_number];
3380+
3381+
// Determine first capture group in repetition for later capture group reset, if not done so previously.
3382+
// No capture group reset is performed for POSIX regexes,
3383+
// so we prevent any reset by setting the first capture group to the number of capture groups _Ncap.
3384+
if (_Psav->_Group_first == 0) {
3385+
if ((_Sflags
3386+
& (regex_constants::basic | regex_constants::extended | regex_constants::grep | regex_constants::egrep
3387+
| regex_constants::awk))
3388+
|| !_Find_first_inner_capture_group(_Node->_Next, _Psav)) {
3389+
_Psav->_Group_first = _Ncap;
3390+
}
3391+
}
3392+
3393+
return _Do_rep(_Node, _Greedy, 0);
3394+
}
3395+
3396+
template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
3397+
bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Find_first_inner_capture_group(
3398+
_Node_base* _Nx, _Loop_vals_v2_t* _Loop_state) {
3399+
if (0 < _Max_stack_count && --_Max_stack_count <= 0) {
3400+
_Xregex_error(regex_constants::error_stack);
3401+
}
3402+
3403+
bool _Found_group = false;
3404+
while (_Nx) {
3405+
switch (_Nx->_Kind) {
3406+
case _N_nop:
3407+
case _N_bol:
3408+
case _N_eol:
3409+
case _N_wbound:
3410+
case _N_dot:
3411+
case _N_str:
3412+
case _N_class:
3413+
case _N_group:
3414+
case _N_end_group:
3415+
case _N_end_capture:
3416+
case _N_back:
3417+
case _N_begin:
3418+
break;
3419+
3420+
case _N_assert:
3421+
case _N_neg_assert:
3422+
{
3423+
if (_Find_first_inner_capture_group(static_cast<_Node_assert*>(_Nx), _Loop_state)) {
3424+
_Found_group = true;
3425+
_Nx = nullptr;
3426+
}
3427+
break;
3428+
}
3429+
3430+
case _N_capture:
3431+
{
3432+
_Node_capture* _Node = static_cast<_Node_capture*>(_Nx);
3433+
_Loop_state->_Group_first = _Node->_Idx;
3434+
_Found_group = true;
3435+
_Nx = nullptr;
3436+
break;
3437+
}
3438+
3439+
case _N_if:
3440+
{
3441+
_Node_if* _Node = static_cast<_Node_if*>(_Nx);
3442+
for (; _Node != nullptr; _Node = _Node->_Child) {
3443+
if (_Find_first_inner_capture_group(_Node->_Next, _Loop_state)) {
3444+
_Found_group = true;
3445+
_Nx = nullptr;
3446+
break;
3447+
}
3448+
}
3449+
3450+
if (_Nx != nullptr) { // continue search after the branches of the _N_if node
3451+
_Nx = static_cast<_Node_if*>(_Nx)->_Endif;
3452+
}
3453+
break;
3454+
}
3455+
3456+
case _N_rep:
3457+
{
3458+
_Node_rep* _Inner_rep = static_cast<_Node_rep*>(_Nx);
3459+
_Loop_vals_v2_t* _Inner_loop_state = &_Loop_vals[_Inner_rep->_Loop_number];
3460+
if (_Find_first_inner_capture_group(_Inner_rep->_Next, _Inner_loop_state)) {
3461+
_Loop_state->_Group_first = _Inner_loop_state->_Group_first;
3462+
_Found_group = true;
3463+
_Nx = nullptr;
3464+
} else {
3465+
_Inner_loop_state->_Group_first = _Ncap;
3466+
_Nx = _Inner_rep->_End_rep;
3467+
}
3468+
break;
3469+
}
3470+
3471+
case _N_end_assert:
3472+
case _N_endif:
3473+
case _N_end_rep:
3474+
case _N_end:
3475+
case _N_none:
3476+
default:
3477+
_Nx = nullptr;
3478+
break;
3479+
}
3480+
3481+
if (_Nx) {
3482+
_Nx = _Nx->_Next;
3483+
}
3484+
}
3485+
3486+
if (0 < _Max_stack_count) {
3487+
++_Max_stack_count;
3488+
}
3489+
3490+
return _Found_group;
3491+
}
3492+
33613493
template <class _BidIt1, class _BidIt2, class _Pr>
33623494
_BidIt1 _Cmp_chrange(_BidIt1 _Begin1, _BidIt1 _End1, _BidIt2 _Begin2, _BidIt2 _End2, _Pr _Pred) {
33633495
// compare character ranges
@@ -3695,15 +3827,6 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
36953827
{ // record current position
36963828
_Node_capture* _Node = static_cast<_Node_capture*>(_Nx);
36973829
_Tgt_state._Grps[_Node->_Idx]._Begin = _Tgt_state._Cur;
3698-
if (!(_Sflags
3699-
& (regex_constants::basic | regex_constants::extended | regex_constants::grep
3700-
| regex_constants::egrep | regex_constants::awk))) {
3701-
// CodeQL [SM02323] Comparing unchanging unsigned int _Node->_Idx to decreasing size_t _Idx is safe.
3702-
for (size_t _Idx = _Tgt_state._Grp_valid.size(); _Node->_Idx < _Idx;) {
3703-
_Tgt_state._Grp_valid[--_Idx] = false;
3704-
}
3705-
}
3706-
37073830
break;
37083831
}
37093832

@@ -3752,7 +3875,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
37523875
break;
37533876

37543877
case _N_rep:
3755-
if (!_Do_rep(static_cast<_Node_rep*>(_Nx), (_Nx->_Flags & _Fl_greedy) != 0, 0)) {
3878+
if (!_Do_rep_first(static_cast<_Node_rep*>(_Nx))) {
37563879
_Failed = true;
37573880
}
37583881

@@ -3761,10 +3884,9 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
37613884

37623885
case _N_end_rep:
37633886
{
3764-
_Node_rep* _Nr = static_cast<_Node_end_rep*>(_Nx)->_Begin_rep;
3765-
_Loop_vals_t* _Psav = &_Loop_vals[_Nr->_Loop_number];
3766-
3767-
if (_Nr->_Simple_loop == 0 && !_Do_rep(_Nr, (_Nr->_Flags & _Fl_greedy) != 0, _Psav->_Loop_idx)) {
3887+
_Node_rep* _Nr = static_cast<_Node_end_rep*>(_Nx)->_Begin_rep;
3888+
if (_Nr->_Simple_loop == 0
3889+
&& !_Do_rep(_Nr, (_Nr->_Flags & _Fl_greedy) != 0, _Loop_vals[_Nr->_Loop_number]->_Loop_idx)) {
37683890
_Failed = true; // recurse only if loop contains if/do
37693891
}
37703892

tests/std/tests/VSO_0000000_regex_use/test.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,6 +1559,27 @@ void test_gh_5364() {
15591559
g_regexTester.should_match("c", "[^]", ECMAScript);
15601560
}
15611561

1562+
void test_gh_5365() {
1563+
// GH-5365: <regex>: Implementation divergence for capture group behavior:
1564+
// Capture groups were not correctly cleared at the beginning of repetitions in ECMAScript mode.
1565+
{
1566+
test_regex captures_in_repeated_noncapturing_group(&g_regexTester, "^(?:(a)|(b)|(c)|(d))+$");
1567+
captures_in_repeated_noncapturing_group.should_search_match_capture_groups(
1568+
"acbd", "acbd", match_default, {{-1, -1}, {-1, -1}, {-1, -1}, {3, 4}});
1569+
captures_in_repeated_noncapturing_group.should_search_match_capture_groups(
1570+
"adcba", "adcba", match_default, {{4, 5}, {-1, -1}, {-1, -1}, {-1, -1}});
1571+
}
1572+
{
1573+
test_regex captures_in_questionmark_quantifiers(&g_regexTester, "(z)((a+)?(b+)?(c))*");
1574+
captures_in_questionmark_quantifiers.should_search_match_capture_groups(
1575+
"zaacbbbcac", "zaacbbbcac", match_default, {{0, 1}, {8, 10}, {8, 9}, {-1, -1}, {9, 10}});
1576+
captures_in_questionmark_quantifiers.should_search_match_capture_groups(
1577+
"zaacbbbcbbc", "zaacbbbcbbc", match_default, {{0, 1}, {8, 11}, {-1, -1}, {8, 10}, {10, 11}});
1578+
captures_in_questionmark_quantifiers.should_search_match_capture_groups(
1579+
"zaacbbbcabbc", "zaacbbbcabbc", match_default, {{0, 1}, {8, 12}, {8, 9}, {9, 11}, {11, 12}});
1580+
}
1581+
}
1582+
15621583
void test_gh_5371() {
15631584
// GH-5371 <regex>: \b and \B are backwards on empty strings
15641585
g_regexTester.should_not_match("", R"(\b)");
@@ -1664,6 +1685,7 @@ int main() {
16641685
test_gh_5253();
16651686
test_gh_5362();
16661687
test_gh_5364();
1688+
test_gh_5365();
16671689
test_gh_5371();
16681690
test_gh_5374();
16691691
test_gh_5377();

0 commit comments

Comments
 (0)