Skip to content

Commit c8e4499

Browse files
committed
<regex>: Add multiline option and make non-multiline mode the default
1 parent 41ec195 commit c8e4499

File tree

9 files changed

+376
-72
lines changed

9 files changed

+376
-72
lines changed

benchmarks/src/regex_search.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ void bm_lorem_search(benchmark::State& state, const char* pattern) {
3131
}
3232
}
3333

34+
// Register bm_lorem_search once per pattern; every registration runs with the arguments 2, 3 and 4.
// The "^bibe" case is the same literal as the "bibe" case with a leading caret anchor.
BENCHMARK_CAPTURE(bm_lorem_search, "^bibe", "^bibe")->Arg(2)->Arg(3)->Arg(4);
3435
BENCHMARK_CAPTURE(bm_lorem_search, "bibe", "bibe")->Arg(2)->Arg(3)->Arg(4);
3536
BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)", "(bibe)")->Arg(2)->Arg(3)->Arg(4);
3637
BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)+", "(bibe)+")->Arg(2)->Arg(3)->Arg(4);

stl/inc/regex

Lines changed: 59 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,11 @@ namespace regex_constants {
121121
_Gmask = 0x3F,
122122
_Any_posix = basic | extended | grep | egrep | awk,
123123

124-
icase = 0x0100,
125-
nosubs = 0x0200,
126-
optimize = 0x0400,
127-
collate = 0x0800
124+
icase = 0x0100,
125+
nosubs = 0x0200,
126+
optimize = 0x0400,
127+
collate = 0x0800,
128+
multiline = 0x1000
128129
};
129130

130131
_BITMASK_OPS(_EXPORT_STD, syntax_option_type)
@@ -1666,6 +1667,15 @@ public:
16661667
if (_Re->_Flags & _Fl_begin_needs_d) {
16671668
_Char_class_d = _Lookup_char_class(static_cast<_Elem>('D'));
16681669
}
1670+
1671+
// sanitize multiline mode setting
1672+
#ifdef _REGEX_MAKE_MULTILINE_MODE_DEFAULT
1673+
_Sflags |= regex_constants::multiline; // old matcher applied multiline mode for all grammars
1674+
#else // ^^^ defined(_REGEX_MAKE_MULTILINE_MODE_DEFAULT) / !defined(_REGEX_MAKE_MULTILINE_MODE_DEFAULT) vvv
1675+
if (_Sflags & regex_constants::_Any_posix) { // multiline mode is ECMAScript-only
1676+
_Sflags &= ~regex_constants::multiline;
1677+
}
1678+
#endif // ^^^ !defined(_REGEX_MAKE_MULTILINE_MODE_DEFAULT) ^^^
16691679
}
16701680

16711681
void _Setf(regex_constants::match_flag_type _Mf) { // set specified flags
@@ -1920,6 +1930,7 @@ public:
19201930
static constexpr flag_type awk = regex_constants::awk;
19211931
static constexpr flag_type grep = regex_constants::grep;
19221932
static constexpr flag_type egrep = regex_constants::egrep;
1933+
static constexpr flag_type multiline = regex_constants::multiline;
19231934

19241935
basic_regex() = default; // construct empty object
19251936

@@ -3833,6 +3844,11 @@ typename _RxTraits::char_class_type _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Al
38333844
return _Traits.lookup_classname(_Ptr, _Ptr + 1, (_Sflags & regex_constants::icase) != 0);
38343845
}
38353846

3847+
// Returns true when _Ch compares equal to one of the four line-terminator constants
// _Meta_nl, _Meta_cr, _Meta_ls or _Meta_ps, and false otherwise.
// The matcher uses this for the `^` and `$` anchors in multiline mode and for the
// wildcard check that was previously spelled out inline with the same four comparisons.
template <class _Elem>
3848+
bool _Is_ecmascript_line_terminator(_Elem _Ch) {
3849+
return _Ch == _Meta_nl || _Ch == _Meta_cr || _Ch == _Meta_ls || _Ch == _Meta_ps;
3850+
}
3851+
38363852
template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
38373853
bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _Nx) { // check for match
38383854
if (0 < _Max_stack_count && --_Max_stack_count <= 0) {
@@ -3852,18 +3868,19 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
38523868
case _N_bol:
38533869
if ((_Mflags & regex_constants::match_prev_avail)
38543870
|| _Tgt_state._Cur != _Begin) { // if --_Cur is valid, check for preceding newline
3855-
_Failed = *_Prev_iter(_Tgt_state._Cur) != _Meta_nl;
3871+
_Failed = !(_Sflags & regex_constants::multiline)
3872+
|| !_STD _Is_ecmascript_line_terminator(*_STD _Prev_iter(_Tgt_state._Cur));
38563873
} else {
38573874
_Failed = (_Mflags & regex_constants::match_not_bol) != 0;
38583875
}
3859-
38603876
break;
38613877

38623878
case _N_eol:
38633879
if (_Tgt_state._Cur == _End) {
38643880
_Failed = (_Mflags & regex_constants::match_not_eol) != 0;
38653881
} else {
3866-
_Failed = *_Tgt_state._Cur != _Meta_nl;
3882+
_Failed =
3883+
!(_Sflags & regex_constants::multiline) || !_STD _Is_ecmascript_line_terminator(*_Tgt_state._Cur);
38673884
}
38683885

38693886
break;
@@ -3881,7 +3898,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
38813898
if (_Ch == _Elem()) {
38823899
_Failed = true;
38833900
}
3884-
} else if (_Ch == _Meta_nl || _Ch == _Meta_cr || _Ch == _Meta_ls || _Ch == _Meta_ps) { // ECMAScript
3901+
} else if (_STD _Is_ecmascript_line_terminator(_Ch)) {
38853902
_Failed = true;
38863903
}
38873904

@@ -4054,30 +4071,55 @@ template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
40544071
_BidIt _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Skip(_BidIt _First_arg, _BidIt _Last, _Node_base* _Node_arg) {
40554072
// skip until possible match
40564073
// assumes --_First_arg is valid
4057-
_Node_base* _Nx = _Node_arg ? _Node_arg : _Rep;
4074+
constexpr char _Line_terminators_char[] = {static_cast<char>(_Meta_cr), static_cast<char>(_Meta_nl)};
4075+
constexpr wchar_t _Line_terminators_wchar_t[] = {static_cast<wchar_t>(_Meta_cr), static_cast<wchar_t>(_Meta_nl),
4076+
static_cast<wchar_t>(_Meta_ls), static_cast<wchar_t>(_Meta_ps)};
4077+
_Node_base* _Nx = _Node_arg ? _Node_arg : _Rep;
40584078

40594079
while (_First_arg != _Last && _Nx) { // check current node
40604080
switch (_Nx->_Kind) { // handle current node's type
40614081
case _N_nop:
40624082
break;
40634083

4064-
case _N_bol:
4065-
{ // check for embedded newline
4066-
// return iterator to character just after the newline; for input like "\nabc"
4067-
// matching "^abc", _First_arg could be pointing at 'a', so we need to check
4068-
// --_First_arg for '\n'
4069-
if (*_Prev_iter(_First_arg) != _Meta_nl) {
4070-
_First_arg = _STD find(_First_arg, _Last, _Meta_nl);
4084+
case _N_bol: // check for beginning anchor
4085+
if (_Sflags & regex_constants::multiline) {
4086+
// multiline mode: check for embedded line terminator
4087+
// return iterator to character just after the newline; for input like "\nabc"
4088+
// matching "^abc", _First_arg could be pointing at 'a', so we need to check
4089+
// --_First_arg for '\n'
4090+
if (!_STD _Is_ecmascript_line_terminator(*_STD _Prev_iter(_First_arg))) {
4091+
if constexpr (sizeof(_Elem) == 1) {
4092+
_First_arg = _STD find_first_of(
4093+
_First_arg, _Last, _Line_terminators_char, _STD end(_Line_terminators_char));
4094+
} else {
4095+
_First_arg = _STD find_first_of(
4096+
_First_arg, _Last, _Line_terminators_wchar_t, _STD end(_Line_terminators_wchar_t));
4097+
}
4098+
40714099
if (_First_arg != _Last) {
40724100
++_First_arg;
40734101
}
40744102
}
40754103

40764104
return _First_arg;
4105+
} else {
4106+
// non-multiline mode: never matches because --_First_arg is valid
4107+
return _Last;
40774108
}
40784109

40794110
case _N_eol:
4080-
return _STD find(_First_arg, _Last, _Meta_nl);
4111+
if (_Sflags & regex_constants::multiline) {
4112+
// multiline mode: matches at next line terminator or end of input
4113+
if constexpr (sizeof(_Elem) == 1) {
4114+
return _STD find_first_of(
4115+
_First_arg, _Last, _Line_terminators_char, _STD end(_Line_terminators_char));
4116+
} else {
4117+
return _STD find_first_of(
4118+
_First_arg, _Last, _Line_terminators_wchar_t, _STD end(_Line_terminators_wchar_t));
4119+
}
4120+
} else {
4121+
return _Last; // non-multiline mode: matches at end of input or not at all
4122+
}
40814123

40824124
case _N_str:
40834125
{ // check for string match

tests/libcxx/expected_results.txt

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -575,12 +575,6 @@ std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp FAIL
575575

576576

577577
# *** MISSING LWG ISSUE RESOLUTIONS ***
578-
# LWG-2503 "multiline option should be added to syntax_option_type"
579-
std/re/re.alg/re.alg.search/no_update_pos.pass.cpp FAIL
580-
std/re/re.const/re.matchflag/match_multiline.pass.cpp FAIL
581-
std/re/re.const/re.matchflag/match_not_eol.pass.cpp FAIL
582-
std/re/re.const/re.synopt/syntax_option_type.pass.cpp FAIL
583-
584578
# LWG-2532 "Satisfying a promise at thread exit" (Open)
585579
std/thread/futures/futures.promise/set_exception_at_thread_exit.pass.cpp FAIL
586580
std/thread/futures/futures.promise/set_lvalue_at_thread_exit.pass.cpp FAIL

tests/std/test.lst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ tests\Dev11_1140665_unique_ptr_array_conversions
154154
tests\Dev11_1150223_shared_mutex
155155
tests\Dev11_1158803_regex_thread_safety
156156
tests\Dev11_1180290_filesystem_error_code
157+
tests\GH_000073_regex_multiline_escape_hatch
157158
tests\GH_000140_adl_proof_comparison
158159
tests\GH_000140_adl_proof_construction
159160
tests\GH_000140_adl_proof_views
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3+
4+
RUNALL_INCLUDE ..\usual_matrix.lst
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
// Copyright (c) Microsoft Corporation.
2+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
3+
4+
#define _REGEX_MAKE_MULTILINE_MODE_DEFAULT
5+
6+
#include <cstddef>
7+
#include <cstdio>
8+
#include <regex>
9+
#include <string>
10+
11+
#include <test_regex_support.hpp>
12+
13+
using namespace std;
14+
using namespace std::regex_constants;
15+
16+
regex_fixture g_regexTester;
17+
18+
// Checks the caret anchor for every grammar (default, ECMAScript, basic, grep, extended, egrep, awk)
// in a translation unit that defines _REGEX_MAKE_MULTILINE_MODE_DEFAULT before including <regex>.
// NOTE(review): the should_search_* helpers come from test_regex_support.hpp, which is not shown here;
// the arguments are presumably (subject, expected match, optional match flags) - confirm.
void test_VSO_225160_match_bol_flag() {
19+
// Old tests for caret anchor in default multiline mode
20+
for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) {
21+
// A bare `^`: match_not_bol is expected to suppress the match on an empty subject,
// but not on "\n".
const test_regex emptyAnchor(&g_regexTester, R"(^)", syntax);
22+
emptyAnchor.should_search_match("", "");
23+
emptyAnchor.should_search_fail("", match_not_bol);
24+
emptyAnchor.should_search_match("\n", "");
25+
emptyAnchor.should_search_match("\n", "", match_not_bol);
26+
27+
// `^cd`: "cd" directly after a '\n' is expected to match with or without match_not_bol.
const test_regex beginCd(&g_regexTester, R"(^cd)", syntax);
28+
beginCd.should_search_match("ab\ncdefg", "cd");
29+
beginCd.should_search_match("ab\ncdefg", "cd", match_not_bol);
30+
31+
// "cd" at the very start of the subject is rejected only when match_not_bol is passed.
beginCd.should_search_match("cdefg", "cd");
32+
beginCd.should_search_fail("cdefg", match_not_bol);
33+
beginCd.should_search_match("\ncdefg", "cd");
34+
beginCd.should_search_match("\ncdefg", "cd", match_not_bol);
35+
36+
// "cd" that follows neither the start of the subject nor a '\n' must not match.
beginCd.should_search_fail("ab\nxcdefg");
37+
beginCd.should_search_fail("ab\nxcdefg", match_not_bol);
38+
}
39+
}
40+
41+
42+
// Checks the dollar anchor for every grammar (default, ECMAScript, basic, grep, extended, egrep, awk)
// in a translation unit that defines _REGEX_MAKE_MULTILINE_MODE_DEFAULT before including <regex>.
// NOTE(review): the should_search_* helpers come from test_regex_support.hpp, which is not shown here;
// the arguments are presumably (subject, expected match, optional match flags) - confirm.
void test_VSO_225160_match_eol_flag() {
43+
// Old tests for dollar anchor in default multiline mode
44+
for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) {
45+
// A bare `$`: match_not_eol is expected to suppress the match on an empty subject,
// but not on "\n".
const test_regex emptyAnchor(&g_regexTester, R"($)", syntax);
46+
emptyAnchor.should_search_match("", "");
47+
emptyAnchor.should_search_fail("", match_not_eol);
48+
emptyAnchor.should_search_match("\n", "");
49+
emptyAnchor.should_search_match("\n", "", match_not_eol);
50+
51+
// `cd$`: "cd" directly before a '\n' is expected to match with or without match_not_eol.
const test_regex cdEnd(&g_regexTester, R"(cd$)", syntax);
52+
cdEnd.should_search_match("abcd\nefg", "cd");
53+
cdEnd.should_search_match("abcd\nefg", "cd", match_not_eol);
54+
55+
// "cd" at the very end of the subject is rejected only when match_not_eol is passed.
cdEnd.should_search_match("abcd", "cd");
56+
cdEnd.should_search_fail("abcd", match_not_eol);
57+
cdEnd.should_search_match("abcd\n", "cd");
58+
cdEnd.should_search_match("abcd\n", "cd", match_not_eol);
59+
60+
// "cd" followed by anything other than a '\n' or the end of the subject must not match.
cdEnd.should_search_fail("abcdx\nefg");
61+
cdEnd.should_search_fail("abcdx\nefg", match_not_eol);
62+
}
63+
}
64+
65+
// Checks "^a$", "^a" and "a$" for every grammar (default, ECMAScript, basic, grep, extended, egrep, awk)
// in a translation unit that defines _REGEX_MAKE_MULTILINE_MODE_DEFAULT before including <regex>.
// The expectations treat '\n' inside the subject as a line boundary for both anchors, while
// match_not_bol / match_not_eol only disable the boundary at the very start / end of the subject.
// NOTE(review): the should_search_* helpers come from test_regex_support.hpp, which is not shown here;
// the arguments are presumably (subject, expected match, optional match flags) - confirm.
void test_gh_73() {
66+
for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) {
67+
{
68+
// Both anchors: the only 'a' in each failing subject sits at the edge that the flag disables.
test_regex a_anchored_on_both_sides(&g_regexTester, "^a$", syntax);
69+
a_anchored_on_both_sides.should_search_match("a", "a");
70+
a_anchored_on_both_sides.should_search_match("b\na", "a");
71+
a_anchored_on_both_sides.should_search_match("a\nb", "a");
72+
a_anchored_on_both_sides.should_search_fail("a\nb", match_not_bol);
73+
a_anchored_on_both_sides.should_search_fail("b\na", match_not_eol);
74+
}
75+
76+
{
77+
// Front anchor only: match_not_bol rejects 'a' at the start of the subject but not after '\n'.
test_regex a_anchored_front(&g_regexTester, "^a", syntax);
78+
a_anchored_front.should_search_match("a", "a");
79+
a_anchored_front.should_search_match("a\n", "a");
80+
a_anchored_front.should_search_match("a\nb", "a");
81+
a_anchored_front.should_search_match("b\na", "a");
82+
a_anchored_front.should_search_match("\na", "a");
83+
a_anchored_front.should_search_fail("a", match_not_bol);
84+
a_anchored_front.should_search_match("\na", "a", match_not_bol);
85+
a_anchored_front.should_search_match("b\na", "a", match_not_bol);
86+
}
87+
88+
{
89+
// Back anchor only: match_not_eol rejects 'a' at the end of the subject but not before '\n'.
test_regex a_anchored_back(&g_regexTester, "a$", syntax);
90+
a_anchored_back.should_search_match("a", "a");
91+
a_anchored_back.should_search_match("\na", "a");
92+
a_anchored_back.should_search_match("b\na", "a");
93+
a_anchored_back.should_search_match("a\nb", "a");
94+
a_anchored_back.should_search_match("a\n", "a");
95+
a_anchored_back.should_search_fail("a", match_not_eol);
96+
a_anchored_back.should_search_match("a\n", "a", match_not_eol);
97+
a_anchored_back.should_search_match("a\nb", "a", match_not_eol);
98+
}
99+
}
100+
}
101+
102+
// Test driver: runs the caret-anchor, dollar-anchor and GH-73 test functions in order,
// then returns whatever g_regexTester.result() reports as the process exit code.
// NOTE(review): result() is declared in test_regex_support.hpp (not shown); presumably it is
// nonzero when any check recorded through g_regexTester failed - confirm.
int main() {
103+
test_VSO_225160_match_bol_flag();
104+
test_VSO_225160_match_eol_flag();
105+
test_gh_73();
106+
107+
return g_regexTester.result();
108+
}

tests/std/tests/VSO_0000000_regex_interface/test.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -387,9 +387,9 @@ void test_VSO_180466_regex_search_missing_Unchecked_call() {
387387
}
388388

389389
void test_VSO_226914_match_prev_avail() {
390-
// N.B. assumes our nonstandard multiline behavior. See also: LWG-2343, LWG-2503
390+
// test assumes multiline mode
391391
const char bol_haystack[] = {'\n', 'a'};
392-
const regex bol_anchor(R"(^a)");
392+
const regex bol_anchor(R"(^a)", regex_constants::multiline);
393393
assert(regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor));
394394
assert(!regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor, match_not_bol));
395395
assert(regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor, match_prev_avail));

0 commit comments

Comments
 (0)