Skip to content

Commit 13f1f16

Browse files
<regex>: Properly parse dollar anchors in basic and grep mode (#5362)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent 6d25346 commit 13f1f16

File tree

2 files changed

+217
-9
lines changed

2 files changed

+217
-9
lines changed

stl/inc/regex

Lines changed: 26 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1721,7 +1721,7 @@ private:
17211721
// lexing
17221722
[[noreturn]] void _Error(regex_constants::error_type);
17231723

1724-
bool _Is_esc() const;
1724+
bool _Is_esc(_FwdIt) const;
17251725
void _Trans();
17261726
void _Next();
17271727
void _Expect(_Meta_type, regex_constants::error_type);
@@ -3875,8 +3875,7 @@ template <class _FwdIt, class _Elem, class _RxTraits>
38753875
}
38763876

38773877
template <class _FwdIt, class _Elem, class _RxTraits>
3878-
bool _Parser<_FwdIt, _Elem, _RxTraits>::_Is_esc() const { // assumes _Pat != _End
3879-
_FwdIt _Ch0 = _Pat;
3878+
bool _Parser<_FwdIt, _Elem, _RxTraits>::_Is_esc(_FwdIt _Ch0) const { // assumes _Ch0 != _End
38803879
return ++_Ch0 != _End
38813880
&& ((!(_L_flags & _L_nex_grp) && (*_Ch0 == _Meta_lpar || *_Ch0 == _Meta_rpar))
38823881
|| (!(_L_flags & _L_nex_rep) && (*_Ch0 == _Meta_lbr || *_Ch0 == _Meta_rbr)));
@@ -3897,7 +3896,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans() { // map character to meta-char
38973896
}
38983897
switch (_Char) { // handle special cases
38993898
case _Meta_esc:
3900-
if (_Is_esc()) { // replace escape sequence
3899+
if (_Is_esc(_Pat)) { // replace escape sequence
39013900
_FwdIt _Ch0 = _Pat;
39023901
_Mchar = static_cast<_Meta_type>(_Char = *++_Ch0);
39033902
}
@@ -3941,9 +3940,28 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans() { // map character to meta-char
39413940

39423941
case _Meta_dlr:
39433942
{ // check if $ is special
3944-
_FwdIt _Ch0 = _Pat;
3945-
if ((_L_flags & _L_anch_rstr) && ++_Ch0 != _End && *_Ch0 != _Meta_nl) {
3946-
_Mchar = _Meta_chr;
3943+
_FwdIt _Next = _Pat;
3944+
if ((_L_flags & _L_anch_rstr) && ++_Next != _End) {
3945+
const bool _Escaped = *_Next == _Meta_esc && _Is_esc(_Next);
3946+
if (_Escaped) {
3947+
++_Next;
3948+
}
3949+
3950+
// Only the basic and grep grammars set _L_anch_rstr, so _L_alt_pipe and _L_nex_grp must be unset.
3951+
// Therefore, we don't need to handle "dollar followed by pipe '|' for alternation"
3952+
// or "dollar followed by non-escaped right parenthesis ')' closing a group" below.
3953+
_STL_INTERNAL_CHECK((_L_flags & (_L_alt_pipe | _L_nex_grp)) == 0);
3954+
3955+
const _Elem _Ch = *_Next;
3956+
const bool _Is_end_of_alternative =
3957+
((_L_flags & _L_alt_nl) && _Ch == _Meta_nl
3958+
&& _Disj_count == 0) // dollar followed by newline '\n' for alternation
3959+
|| (_Escaped && _Ch == _Meta_rpar
3960+
&& _Disj_count != 0); // dollar followed by (escaped) right parenthesis ')' closing a group
3961+
3962+
if (!_Is_end_of_alternative) {
3963+
_Mchar = _Meta_chr;
3964+
}
39473965
}
39483966

39493967
break;
@@ -3972,7 +3990,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans() { // map character to meta-char
39723990
template <class _FwdIt, class _Elem, class _RxTraits>
39733991
void _Parser<_FwdIt, _Elem, _RxTraits>::_Next() { // advance to next input character
39743992
if (_Pat != _End) { // advance
3975-
if (*_Pat == _Meta_esc && _Is_esc()) {
3993+
if (*_Pat == _Meta_esc && _Is_esc(_Pat)) {
39763994
++_Pat;
39773995
}
39783996

tests/std/tests/VSO_0000000_regex_use/test.cpp

Lines changed: 191 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1031,7 +1031,6 @@ void test_gh_5165_grep() {
10311031
middle_nl_with_caret.should_search_fail("^a");
10321032
middle_nl_with_caret.should_search_fail("ca");
10331033
middle_nl_with_caret.should_search_fail("^b");
1034-
middle_nl_with_caret.should_search_fail("ca");
10351034
middle_nl_with_caret.should_search_fail("cb");
10361035
}
10371036
{
@@ -1171,6 +1170,196 @@ void test_gh_5253() {
11711170
g_regexTester.should_not_match("a", "()*");
11721171
}
11731172

1173+
// Checks `$` handling with expectations that are the same for both grammars this helper is called with
// (test_gh_5362() passes `basic` and `grep`). `basic_or_grep` is forwarded unchanged to every regex built here.
void test_gh_5362_syntax_option(const syntax_option_type basic_or_grep) {
1174+
{
// A `$` at the very end of the pattern is an anchor: "meow" is found only at the end of the input.
1175+
const test_regex ending_anchor(&g_regexTester, "meo[wW]$", basic_or_grep);
1176+
ending_anchor.should_search_match("kitten_meow", "meow");
1177+
ending_anchor.should_search_fail("homeowner");
1178+
}
1179+
{
// A `$` in the middle of the pattern is an ordinary character: it only matches a literal '$' in the input.
1180+
const test_regex middle_anchor(&g_regexTester, "me$o[wW]", basic_or_grep);
1181+
middle_anchor.should_search_fail("kitten_meow");
1182+
middle_anchor.should_search_fail("homeowner");
1183+
middle_anchor.should_search_match("home$owner", "me$ow");
1184+
}
1185+
{
// Of two trailing dollars, the first is a literal '$' and only the last one is an anchor.
1186+
const test_regex double_dollars(&g_regexTester, "meo[wW]$$", basic_or_grep);
1187+
double_dollars.should_search_fail("kitten_meow");
1188+
double_dollars.should_search_fail("homeowner");
1189+
double_dollars.should_search_match("kitten_meow$", "meow$");
1190+
double_dollars.should_search_fail("kitten_meow$$");
1191+
double_dollars.should_search_fail("homeow$ner");
1192+
double_dollars.should_search_fail("homeow$$ner");
1193+
}
1194+
// A `$` directly before the escaped closing parenthesis of a group is an anchor, so a pattern that
// requires more characters after the group can match neither input below.
1195+
g_regexTester.should_not_match("me$ow", R"(\(me$\)o[wW])", basic_or_grep);
1196+
g_regexTester.should_not_match("meow", R"(\(me$\)o[wW])", basic_or_grep);
1197+
1198+
{
// The anchor at the end of a group behaves like an anchor at the end of the whole pattern.
1199+
const test_regex singlegroup_anchor(&g_regexTester, R"(\(meo[wW]$\))", basic_or_grep);
1200+
singlegroup_anchor.should_search_match("kitten_meow", "meow");
1201+
singlegroup_anchor.should_search_fail("kitten_meow$");
1202+
singlegroup_anchor.should_search_fail("homeowner");
1203+
singlegroup_anchor.should_search_fail("homeow$ner");
1204+
}
1205+
{
// Same anchored group, followed by `.*`, which can only match the empty string after the anchor.
1206+
const test_regex suffixedgroup_anchor(&g_regexTester, R"(\(meo[wW]$\).*)", basic_or_grep);
1207+
suffixedgroup_anchor.should_search_match("kitten_meow", "meow");
1208+
suffixedgroup_anchor.should_search_fail("kitten_meow$");
1209+
suffixedgroup_anchor.should_search_fail("homeowner");
1210+
suffixedgroup_anchor.should_search_fail("homeow$ner");
1211+
}
1212+
{
// Same anchored group, followed by a second group instead of a bare `.*`.
1213+
const test_regex firstgroup_anchor(&g_regexTester, R"(\(meo[wW]$\)\(.*\))", basic_or_grep);
1214+
firstgroup_anchor.should_search_match("kitten_meow", "meow");
1215+
firstgroup_anchor.should_search_fail("kitten_meow$");
1216+
firstgroup_anchor.should_search_fail("homeowner");
1217+
firstgroup_anchor.should_search_fail("homeow$ner");
1218+
}
1219+
{
// Nested groups: the `$` closing the inner group and the `$` closing the outer group are both anchors,
// so no literal '$' is accepted in the input.
1220+
const test_regex nested_anchor(&g_regexTester, R"(\(\(meo[wW]$\)$\).*)", basic_or_grep);
1221+
nested_anchor.should_search_match("kitten_meow", "meow");
1222+
nested_anchor.should_search_fail("kitten_meow$");
1223+
nested_anchor.should_search_fail("kitten_meow$$");
1224+
nested_anchor.should_search_fail("homeowner");
1225+
nested_anchor.should_search_fail("homeow$ner");
1226+
nested_anchor.should_search_fail("homeow$$ner");
1227+
}
1228+
{
// Two dollars before the end of a group: the first is a literal '$', the second is the anchor.
1229+
const test_regex double_dollars(&g_regexTester, R"(\(meo[wW]$$\).*)", basic_or_grep);
1230+
double_dollars.should_search_fail("kitten_meow");
1231+
double_dollars.should_search_match("kitten_meow$", "meow$");
1232+
double_dollars.should_search_fail("kitten_meow$$");
1233+
double_dollars.should_search_fail("homeowner");
1234+
double_dollars.should_search_fail("homeow$ner");
1235+
double_dollars.should_search_fail("homeow$$ner");
1236+
}
1237+
1238+
// Validate that there is no special behavior near bars,
1239+
// as they are alternation operators in regex modes other than basic or grep.
1240+
{
// '|' is a literal character here; only the final `$` anchors.
1241+
const test_regex middle_bar(&g_regexTester, "a|a$", basic_or_grep);
1242+
middle_bar.should_search_match("a|a", "a|a");
1243+
middle_bar.should_search_fail("a|a$");
1244+
middle_bar.should_search_fail("a|ab");
1245+
middle_bar.should_search_fail("a");
1246+
}
1247+
{
// Same as above with the literal "a|a" wrapped in a group and the anchor placed after the group.
1248+
const test_regex group_middle_bar(&g_regexTester, R"(\(a|a\)$)", basic_or_grep);
1249+
group_middle_bar.should_search_match("a|a", "a|a");
1250+
group_middle_bar.should_search_fail("a|a$");
1251+
group_middle_bar.should_search_fail("a|ab");
1252+
group_middle_bar.should_search_fail("a");
1253+
}
1254+
{
// A `$` followed by '|' is not at the end of an alternative in these grammars, so it is a literal '$';
// only the final `$` anchors.
1255+
const test_regex middle_bar_with_dollar(&g_regexTester, "a$|b$", basic_or_grep);
1256+
middle_bar_with_dollar.should_search_match("a$|b", "a$|b");
1257+
middle_bar_with_dollar.should_search_fail("a|b");
1258+
middle_bar_with_dollar.should_search_fail("a$|b$");
1259+
middle_bar_with_dollar.should_search_fail("a$|bc");
1260+
middle_bar_with_dollar.should_search_fail("a");
1261+
middle_bar_with_dollar.should_search_fail("b");
1262+
}
1263+
{
// Grouped variant: the `$` before '|' inside the group is literal, the `$` after the group anchors.
1264+
const test_regex group_middle_bar_with_dollar(&g_regexTester, R"(\(a$|b\)$)", basic_or_grep);
1265+
group_middle_bar_with_dollar.should_search_match("a$|b", "a$|b");
1266+
group_middle_bar_with_dollar.should_search_fail("a|b");
1267+
group_middle_bar_with_dollar.should_search_fail("a$|b$");
1268+
group_middle_bar_with_dollar.should_search_fail("a$|bc");
1269+
group_middle_bar_with_dollar.should_search_fail("a");
1270+
group_middle_bar_with_dollar.should_search_fail("b");
1271+
}
1272+
}
1273+
1274+
// Checks `$` next to a newline in patterns compiled with `basic`. Every expectation below treats the
// newline in the pattern as an ordinary character that must appear in the input.
void test_gh_5362_basic() {
1275+
// test cases specific for basic regular expressions
1276+
{
// Newline is literal; the trailing `$` is the only anchor.
1277+
const test_regex middle_nl(&g_regexTester, "a\na$", basic);
1278+
middle_nl.should_search_match("a\na", "a\na");
1279+
middle_nl.should_search_fail("a\na$");
1280+
middle_nl.should_search_fail("a\nab");
1281+
middle_nl.should_search_fail("a");
1282+
}
1283+
{
// Same pattern with the literal part wrapped in a group and the anchor placed after the group.
1284+
const test_regex group_middle_nl(&g_regexTester, "\\(a\na\\)$", basic);
1285+
group_middle_nl.should_search_match("a\na", "a\na");
1286+
group_middle_nl.should_search_fail("a\na$");
1287+
group_middle_nl.should_search_fail("a\nab");
1288+
group_middle_nl.should_search_fail("a");
1289+
}
1290+
{
// The `$` before the newline is a literal '$' (the input must contain it); only the final `$` anchors.
1291+
const test_regex middle_nl_with_dollar(&g_regexTester, "a$\nb$", basic);
1292+
middle_nl_with_dollar.should_search_match("a$\nb", "a$\nb");
1293+
middle_nl_with_dollar.should_search_fail("a\nb");
1294+
middle_nl_with_dollar.should_search_fail("a$\nb$");
1295+
middle_nl_with_dollar.should_search_fail("a$\nbc");
1296+
middle_nl_with_dollar.should_search_fail("a");
1297+
middle_nl_with_dollar.should_search_fail("b");
1298+
}
1299+
{
// Grouped variant: the `$` before the newline inside the group is literal, the `$` after the group anchors.
1300+
const test_regex group_middle_nl_with_dollar(&g_regexTester, "\\(a$\nb\\)$", basic);
1301+
group_middle_nl_with_dollar.should_search_match("a$\nb", "a$\nb");
1302+
group_middle_nl_with_dollar.should_search_fail("a\nb");
1303+
group_middle_nl_with_dollar.should_search_fail("a$\nb$");
1304+
group_middle_nl_with_dollar.should_search_fail("a$\nbc");
1305+
group_middle_nl_with_dollar.should_search_fail("a");
1306+
group_middle_nl_with_dollar.should_search_fail("b");
1307+
}
1308+
}
1309+
1310+
// Checks `$` next to a newline in patterns compiled with `grep`. Outside of groups, the expectations
// below are consistent with the newline separating two alternatives; inside a group the newline is
// expected to match literally.
void test_gh_5362_grep() {
1311+
// test cases specific for grep mode
1312+
{
// Behaves like the alternatives "a" and "a$": any input containing 'a' matches, and the match is "a".
1313+
const test_regex middle_nl(&g_regexTester, "a\na$", grep);
1314+
middle_nl.should_search_match("a\na$", "a");
1315+
middle_nl.should_search_match("a\nab", "a");
1316+
middle_nl.should_search_match("a", "a");
1317+
middle_nl.should_search_fail("b");
1318+
}
1319+
{
1320+
// This regular expression is not accepted by POSIX grep, but currently the regex parser does not reject it.
1321+
// If the parser is changed to reject it, adjust this test case.
1322+
const test_regex group_middle_nl(&g_regexTester, "\\(a\na\\)$", grep);
1323+
group_middle_nl.should_search_match("a\na", "a\na");
1324+
group_middle_nl.should_search_fail("a\na$");
1325+
group_middle_nl.should_search_fail("a\nac");
1326+
group_middle_nl.should_search_fail("a");
1327+
}
1328+
{
// Each `$` is expected to act as an anchor ending its own alternative ("a$" or "b$"):
// inputs ending in a literal '$' or in another letter fail.
1329+
const test_regex middle_nl_with_dollar(&g_regexTester, "a$\nb$", grep);
1330+
middle_nl_with_dollar.should_search_match("a$\nb", "b");
1331+
middle_nl_with_dollar.should_search_match("a\nb", "a");
1332+
middle_nl_with_dollar.should_search_match("ba", "a");
1333+
middle_nl_with_dollar.should_search_match("a", "a");
1334+
middle_nl_with_dollar.should_search_match("b", "b");
1335+
middle_nl_with_dollar.should_search_match("ab", "b");
1336+
middle_nl_with_dollar.should_search_fail("a$");
1337+
middle_nl_with_dollar.should_search_fail("ac");
1338+
middle_nl_with_dollar.should_search_fail("b$");
1339+
middle_nl_with_dollar.should_search_fail("bc");
1340+
}
1341+
{
1342+
// This regular expression is not accepted by POSIX grep, but currently the regex parser does not reject it.
1343+
// If the parser is changed to reject it, adjust this test case.
1344+
const test_regex group_middle_nl_with_dollar(&g_regexTester, "\\(a$\nb\\)$", grep);
1345+
group_middle_nl_with_dollar.should_search_match("a$\nb", "a$\nb");
1346+
group_middle_nl_with_dollar.should_search_fail("a\nb");
1347+
group_middle_nl_with_dollar.should_search_fail("a$\nb$");
1348+
group_middle_nl_with_dollar.should_search_fail("a$\nbc");
1349+
group_middle_nl_with_dollar.should_search_fail("a");
1350+
group_middle_nl_with_dollar.should_search_fail("b");
1351+
}
1352+
}
1353+
1354+
// Entry point for the GH-5362 regression tests; runs the shared cases once per grammar and then the
// grammar-specific cases.
void test_gh_5362() {
1355+
// GH-5362: `<regex>`: Properly parse dollar anchors in basic and grep mode
// Cases whose expectations are identical for both grammars.
1356+
test_gh_5362_syntax_option(basic);
1357+
test_gh_5362_syntax_option(grep);
1358+
// Cases with a newline in the pattern, where the expectations differ between basic and grep.
1359+
test_gh_5362_basic();
1360+
test_gh_5362_grep();
1361+
}
1362+
11741363
int main() {
11751364
test_dev10_449367_case_insensitivity_should_work();
11761365
test_dev11_462743_regex_collate_should_not_disable_regex_icase();
@@ -1208,6 +1397,7 @@ int main() {
12081397
test_gh_5192();
12091398
test_gh_5214();
12101399
test_gh_5253();
1400+
test_gh_5362();
12111401

12121402
return g_regexTester.result();
12131403
}

0 commit comments

Comments
 (0)