Skip to content

Commit ccffc0c

Browse files
<regex>: Add multiline option and make non-multiline mode the default (#5535)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent 7381f2f commit ccffc0c

File tree

9 files changed

+385
-70
lines changed

9 files changed

+385
-70
lines changed

benchmarks/src/regex_search.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ void bm_lorem_search(benchmark::State& state, const char* pattern) {
3131
}
3232
}
3333

34+
BENCHMARK_CAPTURE(bm_lorem_search, "^bibe", "^bibe")->Arg(2)->Arg(3)->Arg(4);
3435
BENCHMARK_CAPTURE(bm_lorem_search, "bibe", "bibe")->Arg(2)->Arg(3)->Arg(4);
3536
BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)", "(bibe)")->Arg(2)->Arg(3)->Arg(4);
3637
BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)+", "(bibe)+")->Arg(2)->Arg(3)->Arg(4);

stl/inc/regex

Lines changed: 69 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,16 @@ _STL_DISABLE_CLANG_WARNINGS
3333
#pragma push_macro("new")
3434
#undef new
3535

36+
// Controls whether LWG-2503 "multiline option should be added to syntax_option_type" is implemented.
37+
// Defining this to 0 requests Standard behavior:
38+
// * For ECMAScript, matching is non-multiline by default, but regex_constants::multiline can be requested.
39+
// * For POSIX grammars, matching is non-multiline, and regex_constants::multiline is ignored (N5008 [tab:re.synopt]).
40+
// Defining this to 1 requests legacy behavior:
41+
// * For all grammars, matching is multiline, and regex_constants::multiline is redundant.
42+
#ifndef _REGEX_LEGACY_MULTILINE_MODE
43+
#define _REGEX_LEGACY_MULTILINE_MODE 0 // default: Standard behavior; users may predefine this to 1 for legacy behavior
44+
#endif // !defined(_REGEX_LEGACY_MULTILINE_MODE)
45+
3646
#ifndef _REGEX_MAX_COMPLEXITY_COUNT
3747
#define _REGEX_MAX_COMPLEXITY_COUNT 10000000L // set to 0 to disable
3848
#endif // !defined(_REGEX_MAX_COMPLEXITY_COUNT)
@@ -121,10 +131,11 @@ namespace regex_constants {
121131
_Gmask = 0x3F,
122132
_Any_posix = basic | extended | grep | egrep | awk,
123133

124-
icase = 0x0100,
125-
nosubs = 0x0200,
126-
optimize = 0x0400,
127-
collate = 0x0800
134+
icase = 0x0100,
135+
nosubs = 0x0200,
136+
optimize = 0x0400,
137+
collate = 0x0800,
138+
multiline = 0x1000
128139
};
129140

130141
_BITMASK_OPS(_EXPORT_STD, syntax_option_type)
@@ -1666,6 +1677,15 @@ public:
16661677
if (_Re->_Flags & _Fl_begin_needs_d) {
16671678
_Char_class_d = _Lookup_char_class(static_cast<_Elem>('D'));
16681679
}
1680+
1681+
// sanitize multiline mode setting
1682+
#if _REGEX_LEGACY_MULTILINE_MODE
1683+
_Sflags |= regex_constants::multiline; // old matcher applied multiline mode for all grammars
1684+
#else // ^^^ _REGEX_LEGACY_MULTILINE_MODE / !_REGEX_LEGACY_MULTILINE_MODE vvv
1685+
if (_Sflags & regex_constants::_Any_posix) { // multiline mode is ECMAScript-only
1686+
_Sflags &= ~regex_constants::multiline;
1687+
}
1688+
#endif // ^^^ !_REGEX_LEGACY_MULTILINE_MODE ^^^
16691689
}
16701690

16711691
void _Setf(regex_constants::match_flag_type _Mf) { // set specified flags
@@ -1920,6 +1940,7 @@ public:
19201940
static constexpr flag_type awk = regex_constants::awk;
19211941
static constexpr flag_type grep = regex_constants::grep;
19221942
static constexpr flag_type egrep = regex_constants::egrep;
1943+
static constexpr flag_type multiline = regex_constants::multiline;
19231944

19241945
basic_regex() = default; // construct empty object
19251946

@@ -3833,6 +3854,11 @@ typename _RxTraits::char_class_type _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Al
38333854
return _Traits.lookup_classname(_Ptr, _Ptr + 1, (_Sflags & regex_constants::icase) != 0);
38343855
}
38353856

3857+
// Returns true when _Ch compares equal to any of _Meta_nl, _Meta_cr, _Meta_ls, or _Meta_ps,
// i.e. the characters this header treats as ECMAScript line terminators.
// Shared by the multiline '^'/'$' anchor checks and by the wildcard check that rejects line terminators.
template <class _Elem>
3858+
bool _Is_ecmascript_line_terminator(_Elem _Ch) {
3859+
return _Ch == _Meta_nl || _Ch == _Meta_cr || _Ch == _Meta_ls || _Ch == _Meta_ps;
3860+
}
3861+
38363862
template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
38373863
bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _Nx) { // check for match
38383864
if (0 < _Max_stack_count && --_Max_stack_count <= 0) {
@@ -3852,18 +3878,19 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
38523878
case _N_bol:
38533879
if ((_Mflags & regex_constants::match_prev_avail)
38543880
|| _Tgt_state._Cur != _Begin) { // if --_Cur is valid, check for preceding newline
3855-
_Failed = *_Prev_iter(_Tgt_state._Cur) != _Meta_nl;
3881+
_Failed = !(_Sflags & regex_constants::multiline)
3882+
|| !_STD _Is_ecmascript_line_terminator(*_STD _Prev_iter(_Tgt_state._Cur));
38563883
} else {
38573884
_Failed = (_Mflags & regex_constants::match_not_bol) != 0;
38583885
}
3859-
38603886
break;
38613887

38623888
case _N_eol:
38633889
if (_Tgt_state._Cur == _End) {
38643890
_Failed = (_Mflags & regex_constants::match_not_eol) != 0;
38653891
} else {
3866-
_Failed = *_Tgt_state._Cur != _Meta_nl;
3892+
_Failed =
3893+
!(_Sflags & regex_constants::multiline) || !_STD _Is_ecmascript_line_terminator(*_Tgt_state._Cur);
38673894
}
38683895

38693896
break;
@@ -3881,7 +3908,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
38813908
if (_Ch == _Elem()) {
38823909
_Failed = true;
38833910
}
3884-
} else if (_Ch == _Meta_nl || _Ch == _Meta_cr || _Ch == _Meta_ls || _Ch == _Meta_ps) { // ECMAScript
3911+
} else if (_STD _Is_ecmascript_line_terminator(_Ch)) {
38853912
_Failed = true;
38863913
}
38873914

@@ -4054,30 +4081,55 @@ template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
40544081
_BidIt _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Skip(_BidIt _First_arg, _BidIt _Last, _Node_base* _Node_arg) {
40554082
// skip until possible match
40564083
// assumes --_First_arg is valid
4057-
_Node_base* _Nx = _Node_arg ? _Node_arg : _Rep;
4084+
static constexpr char _Line_terminators_char[] = {static_cast<char>(_Meta_cr), static_cast<char>(_Meta_nl)};
4085+
static constexpr wchar_t _Line_terminators_wchar_t[] = {static_cast<wchar_t>(_Meta_cr),
4086+
static_cast<wchar_t>(_Meta_nl), static_cast<wchar_t>(_Meta_ls), static_cast<wchar_t>(_Meta_ps)};
4087+
_Node_base* _Nx = _Node_arg ? _Node_arg : _Rep;
40584088

40594089
while (_First_arg != _Last && _Nx) { // check current node
40604090
switch (_Nx->_Kind) { // handle current node's type
40614091
case _N_nop:
40624092
break;
40634093

4064-
case _N_bol:
4065-
{ // check for embedded newline
4066-
// return iterator to character just after the newline; for input like "\nabc"
4067-
// matching "^abc", _First_arg could be pointing at 'a', so we need to check
4068-
// --_First_arg for '\n'
4069-
if (*_Prev_iter(_First_arg) != _Meta_nl) {
4070-
_First_arg = _STD find(_First_arg, _Last, _Meta_nl);
4094+
case _N_bol: // check for beginning anchor
4095+
if (_Sflags & regex_constants::multiline) {
4096+
// multiline mode: check for embedded line terminator
4097+
// return iterator to character just after the newline; for input like "\nabc"
4098+
// matching "^abc", _First_arg could be pointing at 'a', so we need to check
4099+
// --_First_arg for '\n'
4100+
if (!_STD _Is_ecmascript_line_terminator(*_STD _Prev_iter(_First_arg))) {
4101+
if constexpr (sizeof(_Elem) == 1) {
4102+
_First_arg = _STD find_first_of(
4103+
_First_arg, _Last, _Line_terminators_char, _STD end(_Line_terminators_char));
4104+
} else {
4105+
_First_arg = _STD find_first_of(
4106+
_First_arg, _Last, _Line_terminators_wchar_t, _STD end(_Line_terminators_wchar_t));
4107+
}
4108+
40714109
if (_First_arg != _Last) {
40724110
++_First_arg;
40734111
}
40744112
}
40754113

40764114
return _First_arg;
4115+
} else {
4116+
// non-multiline mode: never matches because --_First_arg is valid
4117+
return _Last;
40774118
}
40784119

40794120
case _N_eol:
4080-
return _STD find(_First_arg, _Last, _Meta_nl);
4121+
if (_Sflags & regex_constants::multiline) {
4122+
// multiline mode: matches at next line terminator or end of input
4123+
if constexpr (sizeof(_Elem) == 1) {
4124+
return _STD find_first_of(
4125+
_First_arg, _Last, _Line_terminators_char, _STD end(_Line_terminators_char));
4126+
} else {
4127+
return _STD find_first_of(
4128+
_First_arg, _Last, _Line_terminators_wchar_t, _STD end(_Line_terminators_wchar_t));
4129+
}
4130+
} else {
4131+
return _Last; // non-multiline mode: matches at end of input or not at all
4132+
}
40814133

40824134
case _N_str:
40834135
{ // check for string match

tests/libcxx/expected_results.txt

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -575,12 +575,6 @@ std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp FAIL
575575

576576

577577
# *** MISSING LWG ISSUE RESOLUTIONS ***
578-
# LWG-2503 "multiline option should be added to syntax_option_type"
579-
std/re/re.alg/re.alg.search/no_update_pos.pass.cpp FAIL
580-
std/re/re.const/re.matchflag/match_multiline.pass.cpp FAIL
581-
std/re/re.const/re.matchflag/match_not_eol.pass.cpp FAIL
582-
std/re/re.const/re.synopt/syntax_option_type.pass.cpp FAIL
583-
584578
# LWG-2532 "Satisfying a promise at thread exit" (Open)
585579
std/thread/futures/futures.promise/set_exception_at_thread_exit.pass.cpp FAIL
586580
std/thread/futures/futures.promise/set_lvalue_at_thread_exit.pass.cpp FAIL

tests/std/test.lst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ tests\Dev11_1140665_unique_ptr_array_conversions
154154
tests\Dev11_1150223_shared_mutex
155155
tests\Dev11_1158803_regex_thread_safety
156156
tests\Dev11_1180290_filesystem_error_code
157+
tests\GH_000073_regex_multiline_escape_hatch
157158
tests\GH_000140_adl_proof_comparison
158159
tests\GH_000140_adl_proof_construction
159160
tests\GH_000140_adl_proof_views
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3+
4+
RUNALL_INCLUDE ..\usual_matrix.lst
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3+
4+
#define _REGEX_LEGACY_MULTILINE_MODE 1
5+
6+
#include <cstddef>
7+
#include <cstdio>
8+
#include <regex>
9+
#include <string>
10+
11+
#include <test_regex_support.hpp>
12+
13+
using namespace std;
14+
using namespace std::regex_constants;
15+
16+
regex_fixture g_regexTester;
17+
18+
// Legacy-mode regression test for the '^' anchor.
// This translation unit defines _REGEX_LEGACY_MULTILINE_MODE to 1 before including <regex>,
// so every grammar in the loop is expected to treat the position after '\n' as a line start.
void test_VSO_225160_match_bol_flag() {
19+
// Old tests for caret anchor in default multiline mode
20+
for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) {
21+
const test_regex emptyAnchor(&g_regexTester, R"(^)", syntax);
22+
emptyAnchor.should_search_match("", "");
23+
emptyAnchor.should_search_fail("", match_not_bol);
24+
emptyAnchor.should_search_match("\n", "");
25+
// match_not_bol only rules out the very start of the input; the position after '\n' still qualifies
emptyAnchor.should_search_match("\n", "", match_not_bol);
26+
27+
const test_regex beginCd(&g_regexTester, R"(^cd)", syntax);
28+
beginCd.should_search_match("ab\ncdefg", "cd");
29+
beginCd.should_search_match("ab\ncdefg", "cd", match_not_bol);
30+
31+
beginCd.should_search_match("cdefg", "cd");
32+
beginCd.should_search_fail("cdefg", match_not_bol);
33+
beginCd.should_search_match("\ncdefg", "cd");
34+
beginCd.should_search_match("\ncdefg", "cd", match_not_bol);
35+
36+
// "cd" is present but not directly after the '\n', so the anchored pattern must not match
beginCd.should_search_fail("ab\nxcdefg");
37+
beginCd.should_search_fail("ab\nxcdefg", match_not_bol);
38+
}
39+
}
40+
41+
// Legacy-mode regression test for the '$' anchor.
// This translation unit defines _REGEX_LEGACY_MULTILINE_MODE to 1 before including <regex>,
// so every grammar in the loop is expected to treat the position before '\n' as a line end.
void test_VSO_225160_match_eol_flag() {
42+
// Old tests for dollar anchor in default multiline mode
43+
for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) {
44+
const test_regex emptyAnchor(&g_regexTester, R"($)", syntax);
45+
emptyAnchor.should_search_match("", "");
46+
emptyAnchor.should_search_fail("", match_not_eol);
47+
emptyAnchor.should_search_match("\n", "");
48+
// match_not_eol only rules out the very end of the input; the position before '\n' still qualifies
emptyAnchor.should_search_match("\n", "", match_not_eol);
49+
50+
const test_regex cdEnd(&g_regexTester, R"(cd$)", syntax);
51+
cdEnd.should_search_match("abcd\nefg", "cd");
52+
cdEnd.should_search_match("abcd\nefg", "cd", match_not_eol);
53+
54+
cdEnd.should_search_match("abcd", "cd");
55+
cdEnd.should_search_fail("abcd", match_not_eol);
56+
cdEnd.should_search_match("abcd\n", "cd");
57+
cdEnd.should_search_match("abcd\n", "cd", match_not_eol);
58+
59+
// "cd" is present but not directly before the '\n', so the anchored pattern must not match
cdEnd.should_search_fail("abcdx\nefg");
60+
cdEnd.should_search_fail("abcdx\nefg", match_not_eol);
61+
}
62+
}
63+
64+
// Exercises '^' and '$' (alone and combined) around embedded '\n' for every grammar while
// _REGEX_LEGACY_MULTILINE_MODE is 1, including their interaction with match_not_bol / match_not_eol.
void test_gh_73() {
65+
for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) {
66+
{
67+
test_regex a_anchored_on_both_sides(&g_regexTester, "^a$", syntax);
68+
a_anchored_on_both_sides.should_search_match("a", "a");
69+
a_anchored_on_both_sides.should_search_match("b\na", "a");
70+
a_anchored_on_both_sides.should_search_match("a\nb", "a");
71+
// the only 'a' sits at the start of the input, which match_not_bol excludes
a_anchored_on_both_sides.should_search_fail("a\nb", match_not_bol);
72+
// the only 'a' sits at the end of the input, which match_not_eol excludes
a_anchored_on_both_sides.should_search_fail("b\na", match_not_eol);
73+
}
74+
75+
{
76+
test_regex a_anchored_front(&g_regexTester, "^a", syntax);
77+
a_anchored_front.should_search_match("a", "a");
78+
a_anchored_front.should_search_match("a\n", "a");
79+
a_anchored_front.should_search_match("a\nb", "a");
80+
a_anchored_front.should_search_match("b\na", "a");
81+
a_anchored_front.should_search_match("\na", "a");
82+
a_anchored_front.should_search_fail("a", match_not_bol);
83+
// an 'a' right after '\n' is still at a line start, regardless of match_not_bol
a_anchored_front.should_search_match("\na", "a", match_not_bol);
84+
a_anchored_front.should_search_match("b\na", "a", match_not_bol);
85+
}
86+
87+
{
88+
test_regex a_anchored_back(&g_regexTester, "a$", syntax);
89+
a_anchored_back.should_search_match("a", "a");
90+
a_anchored_back.should_search_match("\na", "a");
91+
a_anchored_back.should_search_match("b\na", "a");
92+
a_anchored_back.should_search_match("a\nb", "a");
93+
a_anchored_back.should_search_match("a\n", "a");
94+
a_anchored_back.should_search_fail("a", match_not_eol);
95+
// an 'a' right before '\n' is still at a line end, regardless of match_not_eol
a_anchored_back.should_search_match("a\n", "a", match_not_eol);
96+
a_anchored_back.should_search_match("a\nb", "a", match_not_eol);
97+
}
98+
}
99+
}
100+
101+
// Runs every legacy-multiline test in this file and reports the fixture's accumulated result
// as the process exit code (presumably nonzero on failure — confirm against test_regex_support.hpp).
int main() {
102+
test_VSO_225160_match_bol_flag();
103+
test_VSO_225160_match_eol_flag();
104+
test_gh_73();
105+
106+
return g_regexTester.result();
107+
}

tests/std/tests/VSO_0000000_regex_interface/test.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -387,9 +387,9 @@ void test_VSO_180466_regex_search_missing_Unchecked_call() {
387387
}
388388

389389
void test_VSO_226914_match_prev_avail() {
390-
// N.B. assumes our nonstandard multiline behavior. See also: LWG-2343, LWG-2503
390+
// test exercises multiline mode
391391
const char bol_haystack[] = {'\n', 'a'};
392-
const regex bol_anchor(R"(^a)");
392+
const regex bol_anchor(R"(^a)", regex_constants::multiline);
393393
assert(regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor));
394394
assert(!regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor, match_not_bol));
395395
assert(regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor, match_prev_avail));

0 commit comments

Comments
 (0)