Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/src/regex_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ void bm_lorem_search(benchmark::State& state, const char* pattern) {
}
}

BENCHMARK_CAPTURE(bm_lorem_search, "^bibe", "^bibe")->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, "bibe", "bibe")->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)", "(bibe)")->Arg(2)->Arg(3)->Arg(4);
BENCHMARK_CAPTURE(bm_lorem_search, "(bibe)+", "(bibe)+")->Arg(2)->Arg(3)->Arg(4);
Expand Down
86 changes: 69 additions & 17 deletions stl/inc/regex
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,16 @@ _STL_DISABLE_CLANG_WARNINGS
#pragma push_macro("new")
#undef new

// Controls whether LWG-2503 "multiline option should be added to syntax_option_type" is implemented.
// Defining this to 0 requests Standard behavior:
// * For ECMAScript, matching is non-multiline by default, but regex_constants::multiline can be requested.
// * For POSIX grammars, matching is non-multiline, and regex_constants::multiline is ignored (N5008 [tab:re.synopt]).
// Defining this to 1 requests legacy behavior:
// * For all grammars, matching is multiline, and regex_constants::multiline is redundant.
#ifndef _REGEX_LEGACY_MULTILINE_MODE
#define _REGEX_LEGACY_MULTILINE_MODE 0
#endif // !defined(_REGEX_LEGACY_MULTILINE_MODE)

#ifndef _REGEX_MAX_COMPLEXITY_COUNT
#define _REGEX_MAX_COMPLEXITY_COUNT 10000000L // set to 0 to disable
#endif // !defined(_REGEX_MAX_COMPLEXITY_COUNT)
Expand Down Expand Up @@ -121,10 +131,11 @@ namespace regex_constants {
_Gmask = 0x3F,
_Any_posix = basic | extended | grep | egrep | awk,

icase = 0x0100,
nosubs = 0x0200,
optimize = 0x0400,
collate = 0x0800
icase = 0x0100,
nosubs = 0x0200,
optimize = 0x0400,
collate = 0x0800,
multiline = 0x1000
};

_BITMASK_OPS(_EXPORT_STD, syntax_option_type)
Expand Down Expand Up @@ -1666,6 +1677,15 @@ public:
if (_Re->_Flags & _Fl_begin_needs_d) {
_Char_class_d = _Lookup_char_class(static_cast<_Elem>('D'));
}

// sanitize multiline mode setting
#if _REGEX_LEGACY_MULTILINE_MODE
_Sflags |= regex_constants::multiline; // old matcher applied multiline mode for all grammars
#else // ^^^ _REGEX_LEGACY_MULTILINE_MODE / !_REGEX_LEGACY_MULTILINE_MODE vvv
if (_Sflags & regex_constants::_Any_posix) { // multiline mode is ECMAScript-only
_Sflags &= ~regex_constants::multiline;
}
#endif // ^^^ !_REGEX_LEGACY_MULTILINE_MODE ^^^
}

void _Setf(regex_constants::match_flag_type _Mf) { // set specified flags
Expand Down Expand Up @@ -1920,6 +1940,7 @@ public:
static constexpr flag_type awk = regex_constants::awk;
static constexpr flag_type grep = regex_constants::grep;
static constexpr flag_type egrep = regex_constants::egrep;
static constexpr flag_type multiline = regex_constants::multiline;

basic_regex() = default; // construct empty object

Expand Down Expand Up @@ -3833,6 +3854,11 @@ typename _RxTraits::char_class_type _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Al
return _Traits.lookup_classname(_Ptr, _Ptr + 1, (_Sflags & regex_constants::icase) != 0);
}

template <class _Elem>
bool _Is_ecmascript_line_terminator(_Elem _Ch) {
    // returns true iff _Ch compares equal to one of _Meta_nl, _Meta_cr, _Meta_ls, or _Meta_ps
    if (_Ch == _Meta_nl || _Ch == _Meta_cr) {
        return true;
    }

    return _Ch == _Meta_ls || _Ch == _Meta_ps;
}

template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _Nx) { // check for match
if (0 < _Max_stack_count && --_Max_stack_count <= 0) {
Expand All @@ -3852,18 +3878,19 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
case _N_bol:
if ((_Mflags & regex_constants::match_prev_avail)
|| _Tgt_state._Cur != _Begin) { // if --_Cur is valid, check for preceding newline
_Failed = *_Prev_iter(_Tgt_state._Cur) != _Meta_nl;
_Failed = !(_Sflags & regex_constants::multiline)
|| !_STD _Is_ecmascript_line_terminator(*_STD _Prev_iter(_Tgt_state._Cur));
} else {
_Failed = (_Mflags & regex_constants::match_not_bol) != 0;
}

break;

case _N_eol:
if (_Tgt_state._Cur == _End) {
_Failed = (_Mflags & regex_constants::match_not_eol) != 0;
} else {
_Failed = *_Tgt_state._Cur != _Meta_nl;
_Failed =
!(_Sflags & regex_constants::multiline) || !_STD _Is_ecmascript_line_terminator(*_Tgt_state._Cur);
}

break;
Expand All @@ -3881,7 +3908,7 @@ bool _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
if (_Ch == _Elem()) {
_Failed = true;
}
} else if (_Ch == _Meta_nl || _Ch == _Meta_cr || _Ch == _Meta_ls || _Ch == _Meta_ps) { // ECMAScript
} else if (_STD _Is_ecmascript_line_terminator(_Ch)) {
_Failed = true;
}

Expand Down Expand Up @@ -4054,30 +4081,55 @@ template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
_BidIt _Matcher2<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Skip(_BidIt _First_arg, _BidIt _Last, _Node_base* _Node_arg) {
// skip until possible match
// assumes --_First_arg is valid
_Node_base* _Nx = _Node_arg ? _Node_arg : _Rep;
static constexpr char _Line_terminators_char[] = {static_cast<char>(_Meta_cr), static_cast<char>(_Meta_nl)};
static constexpr wchar_t _Line_terminators_wchar_t[] = {static_cast<wchar_t>(_Meta_cr),
static_cast<wchar_t>(_Meta_nl), static_cast<wchar_t>(_Meta_ls), static_cast<wchar_t>(_Meta_ps)};
_Node_base* _Nx = _Node_arg ? _Node_arg : _Rep;

while (_First_arg != _Last && _Nx) { // check current node
switch (_Nx->_Kind) { // handle current node's type
case _N_nop:
break;

case _N_bol:
{ // check for embedded newline
// return iterator to character just after the newline; for input like "\nabc"
// matching "^abc", _First_arg could be pointing at 'a', so we need to check
// --_First_arg for '\n'
if (*_Prev_iter(_First_arg) != _Meta_nl) {
_First_arg = _STD find(_First_arg, _Last, _Meta_nl);
case _N_bol: // check for beginning anchor
if (_Sflags & regex_constants::multiline) {
// multiline mode: check for embedded line terminator
// return iterator to character just after the newline; for input like "\nabc"
// matching "^abc", _First_arg could be pointing at 'a', so we need to check
// --_First_arg for '\n'
if (!_STD _Is_ecmascript_line_terminator(*_STD _Prev_iter(_First_arg))) {
if constexpr (sizeof(_Elem) == 1) {
_First_arg = _STD find_first_of(
_First_arg, _Last, _Line_terminators_char, _STD end(_Line_terminators_char));
} else {
_First_arg = _STD find_first_of(
_First_arg, _Last, _Line_terminators_wchar_t, _STD end(_Line_terminators_wchar_t));
}

if (_First_arg != _Last) {
++_First_arg;
}
}

return _First_arg;
} else {
// non-multiline mode: never matches because --_First_arg is valid
return _Last;
}

case _N_eol:
return _STD find(_First_arg, _Last, _Meta_nl);
if (_Sflags & regex_constants::multiline) {
// multiline mode: matches at next line terminator or end of input
if constexpr (sizeof(_Elem) == 1) {
return _STD find_first_of(
_First_arg, _Last, _Line_terminators_char, _STD end(_Line_terminators_char));
} else {
return _STD find_first_of(
_First_arg, _Last, _Line_terminators_wchar_t, _STD end(_Line_terminators_wchar_t));
}
} else {
return _Last; // non-multiline mode: matches at end of input or not at all
}

case _N_str:
{ // check for string match
Expand Down
6 changes: 0 additions & 6 deletions tests/libcxx/expected_results.txt
Original file line number Diff line number Diff line change
Expand Up @@ -575,12 +575,6 @@ std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp FAIL


# *** MISSING LWG ISSUE RESOLUTIONS ***
# LWG-2503 "multiline option should be added to syntax_option_type"
std/re/re.alg/re.alg.search/no_update_pos.pass.cpp FAIL
std/re/re.const/re.matchflag/match_multiline.pass.cpp FAIL
std/re/re.const/re.matchflag/match_not_eol.pass.cpp FAIL
std/re/re.const/re.synopt/syntax_option_type.pass.cpp FAIL

# LWG-2532 "Satisfying a promise at thread exit" (Open)
std/thread/futures/futures.promise/set_exception_at_thread_exit.pass.cpp FAIL
std/thread/futures/futures.promise/set_lvalue_at_thread_exit.pass.cpp FAIL
Expand Down
1 change: 1 addition & 0 deletions tests/std/test.lst
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ tests\Dev11_1140665_unique_ptr_array_conversions
tests\Dev11_1150223_shared_mutex
tests\Dev11_1158803_regex_thread_safety
tests\Dev11_1180290_filesystem_error_code
tests\GH_000073_regex_multiline_escape_hatch
tests\GH_000140_adl_proof_comparison
tests\GH_000140_adl_proof_construction
tests\GH_000140_adl_proof_views
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

RUNALL_INCLUDE ..\usual_matrix.lst
107 changes: 107 additions & 0 deletions tests/std/tests/GH_000073_regex_multiline_escape_hatch/test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#define _REGEX_LEGACY_MULTILINE_MODE 1

#include <cstddef>
#include <cstdio>
#include <regex>
#include <string>

#include <test_regex_support.hpp>

using namespace std;
using namespace std::regex_constants;

// Shared fixture: every test_regex in this file is constructed with a pointer to it,
// and main() returns its result().
regex_fixture g_regexTester;

void test_VSO_225160_match_bol_flag() {
// Old tests for caret anchor in default multiline mode
for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) {
const test_regex emptyAnchor(&g_regexTester, R"(^)", syntax);
emptyAnchor.should_search_match("", "");
emptyAnchor.should_search_fail("", match_not_bol);
emptyAnchor.should_search_match("\n", "");
emptyAnchor.should_search_match("\n", "", match_not_bol);

const test_regex beginCd(&g_regexTester, R"(^cd)", syntax);
beginCd.should_search_match("ab\ncdefg", "cd");
beginCd.should_search_match("ab\ncdefg", "cd", match_not_bol);

beginCd.should_search_match("cdefg", "cd");
beginCd.should_search_fail("cdefg", match_not_bol);
beginCd.should_search_match("\ncdefg", "cd");
beginCd.should_search_match("\ncdefg", "cd", match_not_bol);

beginCd.should_search_fail("ab\nxcdefg");
beginCd.should_search_fail("ab\nxcdefg", match_not_bol);
}
}

void test_VSO_225160_match_eol_flag() {
// Old tests for dollar anchor in default multiline mode
for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) {
const test_regex emptyAnchor(&g_regexTester, R"($)", syntax);
emptyAnchor.should_search_match("", "");
emptyAnchor.should_search_fail("", match_not_eol);
emptyAnchor.should_search_match("\n", "");
emptyAnchor.should_search_match("\n", "", match_not_eol);

const test_regex cdEnd(&g_regexTester, R"(cd$)", syntax);
cdEnd.should_search_match("abcd\nefg", "cd");
cdEnd.should_search_match("abcd\nefg", "cd", match_not_eol);

cdEnd.should_search_match("abcd", "cd");
cdEnd.should_search_fail("abcd", match_not_eol);
cdEnd.should_search_match("abcd\n", "cd");
cdEnd.should_search_match("abcd\n", "cd", match_not_eol);

cdEnd.should_search_fail("abcdx\nefg");
cdEnd.should_search_fail("abcdx\nefg", match_not_eol);
}
}

void test_gh_73() {
for (syntax_option_type syntax : {syntax_option_type{}, ECMAScript, basic, grep, extended, egrep, awk}) {
{
test_regex a_anchored_on_both_sides(&g_regexTester, "^a$", syntax);
a_anchored_on_both_sides.should_search_match("a", "a");
a_anchored_on_both_sides.should_search_match("b\na", "a");
a_anchored_on_both_sides.should_search_match("a\nb", "a");
a_anchored_on_both_sides.should_search_fail("a\nb", match_not_bol);
a_anchored_on_both_sides.should_search_fail("b\na", match_not_eol);
}

{
test_regex a_anchored_front(&g_regexTester, "^a", syntax);
a_anchored_front.should_search_match("a", "a");
a_anchored_front.should_search_match("a\n", "a");
a_anchored_front.should_search_match("a\nb", "a");
a_anchored_front.should_search_match("b\na", "a");
a_anchored_front.should_search_match("\na", "a");
a_anchored_front.should_search_fail("a", match_not_bol);
a_anchored_front.should_search_match("\na", "a", match_not_bol);
a_anchored_front.should_search_match("b\na", "a", match_not_bol);
}

{
test_regex a_anchored_back(&g_regexTester, "a$", syntax);
a_anchored_back.should_search_match("a", "a");
a_anchored_back.should_search_match("\na", "a");
a_anchored_back.should_search_match("b\na", "a");
a_anchored_back.should_search_match("a\nb", "a");
a_anchored_back.should_search_match("a\n", "a");
a_anchored_back.should_search_fail("a", match_not_eol);
a_anchored_back.should_search_match("a\n", "a", match_not_eol);
a_anchored_back.should_search_match("a\nb", "a", match_not_eol);
}
}
}

int main() {
    // run every legacy-multiline test case, then report what the shared fixture recorded
    test_VSO_225160_match_bol_flag();
    test_VSO_225160_match_eol_flag();
    test_gh_73();

    const int status = g_regexTester.result();
    return status;
}
4 changes: 2 additions & 2 deletions tests/std/tests/VSO_0000000_regex_interface/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -387,9 +387,9 @@ void test_VSO_180466_regex_search_missing_Unchecked_call() {
}

void test_VSO_226914_match_prev_avail() {
// N.B. assumes our nonstandard multiline behavior. See also: LWG-2343, LWG-2503
// test exercises multiline mode
const char bol_haystack[] = {'\n', 'a'};
const regex bol_anchor(R"(^a)");
const regex bol_anchor(R"(^a)", regex_constants::multiline);
assert(regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor));
assert(!regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor, match_not_bol));
assert(regex_match(bol_haystack + 1, end(bol_haystack), bol_anchor, match_prev_avail));
Expand Down
Loading