Skip to content

Commit c0f5f35

Browse files
<regex>: Make wregex correctly match negated character classes (#5403)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent 608036c commit c0f5f35

File tree

2 files changed

+197
-19
lines changed

2 files changed

+197
-19
lines changed

stl/inc/regex

Lines changed: 66 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1202,10 +1202,16 @@ _INLINE_VAR constexpr unsigned int _Bmp_size = (_Bmp_max + _Bmp_chrs - 1U) / _B
12021202
_INLINE_VAR constexpr unsigned int _ARRAY_THRESHOLD = 4U;
12031203

12041204
enum _Node_flags : int { // flags for nfa nodes with special properties
1205-
_Fl_none = 0x00,
1206-
_Fl_negate = 0x01,
1207-
_Fl_greedy = 0x02,
1208-
_Fl_longest = 0x08 // TRANSITION, ABI: 0x04 is unused; the parser previously marked some nodes with it
1205+
_Fl_none = 0x000,
1206+
_Fl_negate = 0x001,
1207+
_Fl_greedy = 0x002,
1208+
_Fl_longest = 0x008, // TRANSITION, ABI: 0x004 is unused; the parser previously marked some nodes with it
1209+
_Fl_class_negated_w = 0x100,
1210+
_Fl_class_negated_s = 0x200,
1211+
_Fl_class_negated_d = 0x400,
1212+
_Fl_begin_needs_w = 0x100,
1213+
_Fl_begin_needs_s = 0x200,
1214+
_Fl_begin_needs_d = 0x400
12091215
};
12101216

12111217
_BITMASK_OPS(_EMPTY_ARGUMENT, _Node_flags)
@@ -1507,6 +1513,13 @@ public:
15071513
_Node_rep& operator=(const _Node_rep&) = delete;
15081514
};
15091515

1516+
enum class _Rx_char_class_kind : int { // must be aligned with corresponding _Node_flags
1517+
_Positive = 0,
1518+
_Negated_w = _Fl_class_negated_w,
1519+
_Negated_s = _Fl_class_negated_s,
1520+
_Negated_d = _Fl_class_negated_d
1521+
};
1522+
15101523
template <class _FwdIt, class _Elem, class _RxTraits>
15111524
class _Builder { // provides operations used by _Parser to build the nfa
15121525
public:
@@ -1525,7 +1538,7 @@ public:
15251538
void _Add_class();
15261539
void _Add_char_to_class(_Elem _Ch);
15271540
void _Add_range2(_Elem, _Elem);
1528-
void _Add_named_class(typename _RxTraits::char_class_type, bool);
1541+
void _Add_named_class(typename _RxTraits::char_class_type, _Rx_char_class_kind);
15291542
void _Add_equiv2(const _Elem*, const _Elem*);
15301543
void _Add_coll2(const _Elem*, const _Elem*);
15311544
_Node_base* _Begin_group();
@@ -1680,6 +1693,7 @@ private:
16801693
bool _Match_pat(_Node_base*);
16811694
bool _Better_match();
16821695
bool _Is_wbound() const;
1696+
typename _RxTraits::char_class_type _Lookup_char_class(_Elem) const;
16831697

16841698
unsigned int _Get_ncap() const;
16851699

@@ -2944,12 +2958,19 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_elts(
29442958
}
29452959

29462960
template <class _FwdIt, class _Elem, class _RxTraits>
2947-
void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_named_class(typename _RxTraits::char_class_type _Cl,
2948-
bool _Negate) { // add contents of named class to bracket expression
2961+
void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_named_class(
2962+
typename _RxTraits::char_class_type _Cl, const _Rx_char_class_kind _Kind) {
2963+
// add contents of named class to bracket expression
29492964
_Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Current);
2950-
_Add_elts(_Node, _Cl, _Negate);
2951-
if (_Bmp_max <= _STD _Max_limit<typename _RxTraits::_Uelem>() && !_Negate) {
2952-
_Node->_Classes = static_cast<typename _RxTraits::char_class_type>(_Node->_Classes | _Cl);
2965+
_Add_elts(_Node, _Cl, _Kind != _Rx_char_class_kind::_Positive);
2966+
if (_Bmp_max <= _STD _Max_limit<typename _RxTraits::_Uelem>()) {
2967+
if (_Kind == _Rx_char_class_kind::_Positive) {
2968+
_Node->_Classes = static_cast<typename _RxTraits::char_class_type>(_Node->_Classes | _Cl);
2969+
} else {
2970+
auto _Node_flag = static_cast<_Node_flags>(_Kind);
2971+
_Node->_Flags |= _Node_flag;
2972+
_Root->_Flags |= _Node_flag;
2973+
}
29532974
}
29542975
}
29552976

@@ -3495,6 +3516,15 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap
34953516
_Found = true;
34963517
} else if (_Node->_Equiv && _STD _Lookup_equiv2(_Ch, _Node->_Equiv, _Traits)) {
34973518
_Found = true;
3519+
} else if ((_Node->_Flags & _Fl_class_negated_w)
3520+
&& !_Traits.isctype(_Ch, _Lookup_char_class(static_cast<_Elem>('W')))) {
3521+
_Found = true;
3522+
} else if ((_Node->_Flags & _Fl_class_negated_s)
3523+
&& !_Traits.isctype(_Ch, _Lookup_char_class(static_cast<_Elem>('S')))) {
3524+
_Found = true;
3525+
} else if ((_Node->_Flags & _Fl_class_negated_d)
3526+
&& !_Traits.isctype(_Ch, _Lookup_char_class(static_cast<_Elem>('D')))) {
3527+
_Found = true;
34983528
} else {
34993529
_Found = false;
35003530
}
@@ -3552,6 +3582,14 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Is_wbound() const {
35523582
}
35533583
}
35543584

3585+
template <class _BidIt, class _Elem, class _RxTraits, class _It>
3586+
typename _RxTraits::char_class_type _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Lookup_char_class(
3587+
const _Elem _Class_name) const {
3588+
// look up character class with single-character name
3589+
auto _Ptr = _STD addressof(_Class_name);
3590+
return _Traits.lookup_classname(_Ptr, _Ptr + 1, (_Sflags & regex_constants::icase) != 0);
3591+
}
3592+
35553593
template <class _BidIt, class _Elem, class _RxTraits, class _It>
35563594
unsigned int _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Get_ncap() const {
35573595
return static_cast<unsigned int>(_Ncap);
@@ -3854,6 +3892,15 @@ _BidIt _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Skip(_BidIt _First_arg, _BidIt
38543892
_Found = true;
38553893
} else if (_Node->_Equiv && _STD _Lookup_equiv2(_Ch, _Node->_Equiv, _Traits)) {
38563894
_Found = true;
3895+
} else if ((_Node->_Flags & _Fl_class_negated_w)
3896+
&& !_Traits.isctype(_Ch, _Lookup_char_class(static_cast<_Elem>('W')))) {
3897+
_Found = true;
3898+
} else if ((_Node->_Flags & _Fl_class_negated_s)
3899+
&& !_Traits.isctype(_Ch, _Lookup_char_class(static_cast<_Elem>('S')))) {
3900+
_Found = true;
3901+
} else if ((_Node->_Flags & _Fl_class_negated_d)
3902+
&& !_Traits.isctype(_Ch, _Lookup_char_class(static_cast<_Elem>('D')))) {
3903+
_Found = true;
38573904
} else {
38583905
_Found = false;
38593906
}
@@ -4118,7 +4165,7 @@ _Prs_ret _Parser<_FwdIt, _Elem, _RxTraits>::_Do_ex_class2(
41184165
_Error(regex_constants::error_ctype);
41194166
}
41204167

4121-
_Nfa._Add_named_class(_Cls, false);
4168+
_Nfa._Add_named_class(_Cls, _Rx_char_class_kind::_Positive);
41224169
return _Prs_set;
41234170
} else {
41244171
typename _RxTraits::string_type _Coll_elem = _Traits.lookup_collatename(_Beg, _End);
@@ -4168,18 +4215,23 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_CharacterClassEscape(bool _Addit) { //
41684215
return false;
41694216
}
41704217

4171-
const bool _Negated = _Traits.isctype(_Char, _RxTraits::_Ch_upper);
4218+
auto _Kind = (_Char == 'W' ? _Rx_char_class_kind::_Negated_w
4219+
: _Char == 'S' ? _Rx_char_class_kind::_Negated_s
4220+
: _Char == 'D' ? _Rx_char_class_kind::_Negated_d
4221+
: _Rx_char_class_kind::_Positive);
4222+
41724223
if (_Addit) {
41734224
_Nfa._Add_class();
41744225
// GH-992: Outside character class definitions, _Cls completely defines the character class
41754226
// so negating _Cls and negating the entire character class are equivalent.
41764227
// Since the former negation is defective, do the latter instead.
4177-
if (_Negated) {
4228+
if (_Kind != _Rx_char_class_kind::_Positive) {
41784229
_Nfa._Negate();
4230+
_Kind = _Rx_char_class_kind::_Positive;
41794231
}
41804232
}
41814233

4182-
_Nfa._Add_named_class(_Cls, _Negated && !_Addit);
4234+
_Nfa._Add_named_class(_Cls, _Kind);
41834235
_Next();
41844236
return true;
41854237
}

tests/std/tests/VSO_0000000_regex_use/test.cpp

Lines changed: 131 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,91 @@ void test_gh_731() {
636636
}
637637
}
638638

639+
void test_gh_992() {
640+
// GH-992 <regex> mishandles locale-based character classes outside of the char range
641+
{
642+
const test_wregex neg_w_regex(&g_regexTester, LR"(Y[\W]*Z)");
643+
neg_w_regex.should_search_match(L"xxxY Zxxx", L"Y Z");
644+
neg_w_regex.should_search_match(L"xxxY \x2009 Zxxx", L"Y \x2009 Z"); // U+2009 THIN SPACE
645+
neg_w_regex.should_search_fail(L"xxxY \x0078 Zxxx"); // U+0078 LATIN SMALL LETTER X
646+
neg_w_regex.should_search_fail(L"xxxY \x03C7 Zxxx"); // U+03C7 GREEK SMALL LETTER CHI
647+
neg_w_regex.should_search_fail(L"xxxY 3 Zxxx");
648+
neg_w_regex.should_search_fail(L"xxxY \x0662 Zxxx"); // U+0662 ARABIC-INDIC DIGIT TWO
649+
}
650+
{
651+
const test_wregex neg_s_regex(&g_regexTester, LR"(Y[\S]*Z)");
652+
neg_s_regex.should_search_match(L"xxxYxx\x0078xxxZxxx", L"Yxx\x0078xxxZ"); // U+0078 LATIN SMALL LETTER X
653+
neg_s_regex.should_search_match(L"xxxYxx\x03C7xxxZxxx", L"Yxx\x03C7xxxZ"); // U+03C7 GREEK SMALL LETTER CHI
654+
neg_s_regex.should_search_match(L"xxxYxx3xxxZxxx", L"Yxx3xxxZ");
655+
neg_s_regex.should_search_match(L"xxxYxx\x0662xxxZxxx", L"Yxx\x0662xxxZ"); // U+0662 ARABIC-INDIC DIGIT TWO
656+
neg_s_regex.should_search_fail(L"xxxYxx xxxZxxx");
657+
neg_s_regex.should_search_fail(L"xxxYxx\x2009xxxZxxx"); // U+2009 THIN SPACE
658+
}
659+
for (const wstring& pattern : {LR"(Y[\D]*Z)", LR"(Y[\W\D]*Z)"}) {
660+
const test_wregex neg_d_regex(&g_regexTester, pattern);
661+
neg_d_regex.should_search_match(L"xxxYxx\x0078xxxZxxx", L"Yxx\x0078xxxZ"); // U+0078 LATIN SMALL LETTER X
662+
neg_d_regex.should_search_match(L"xxxYxx\x03C7xxxZxxx", L"Yxx\x03C7xxxZ"); // U+03C7 GREEK SMALL LETTER CHI
663+
neg_d_regex.should_search_match(L"xxxYxx xxxZxxx", L"Yxx xxxZ");
664+
neg_d_regex.should_search_match(L"xxxYxx\x2009xxxZxxx", L"Yxx\x2009xxxZ"); // U+2009 THIN SPACE
665+
neg_d_regex.should_search_fail(L"xxxYxx3xxxZxxx");
666+
neg_d_regex.should_search_fail(L"xxxYxx\x0662xxxZxxx"); // U+0662 ARABIC-INDIC DIGIT TWO
667+
}
668+
for (const wstring& pattern : {LR"(Y[\W\S]*Z)", LR"(Y[\S\D]*Z)", LR"(Y[\W\S\D]*Z)"}) {
669+
const test_wregex class_matches_all_regex(&g_regexTester, pattern);
670+
class_matches_all_regex.should_search_match(
671+
L"xxxYxx\x0078xxxZxxx", L"Yxx\x0078xxxZ"); // U+0078 LATIN SMALL LETTER X
672+
class_matches_all_regex.should_search_match(
673+
L"xxxYxx\x03C7xxxZxxx", L"Yxx\x03C7xxxZ"); // U+03C7 GREEK SMALL LETTER CHI
674+
class_matches_all_regex.should_search_match(L"xxxYxx xxxZxxx", L"Yxx xxxZ");
675+
class_matches_all_regex.should_search_match(L"xxxYxx\x2009xxxZxxx", L"Yxx\x2009xxxZ"); // U+2009 THIN SPACE
676+
class_matches_all_regex.should_search_match(L"xxxYxx3xxxZxxx", L"Yxx3xxxZ");
677+
class_matches_all_regex.should_search_match(
678+
L"xxxYxx\x0662xxxZxxx", L"Yxx\x0662xxxZ"); // U+0662 ARABIC-INDIC DIGIT TWO
679+
}
680+
{
681+
const test_wregex neg_w_regex_skip(&g_regexTester, LR"([\W])");
682+
neg_w_regex_skip.should_search_match(L"xxxx\x2009xxxx", L"\x2009"); // U+2009 THIN SPACE
683+
neg_w_regex_skip.should_search_fail(L"xxxx\x03C7xxxx"); // U+03C7 GREEK SMALL LETTER CHI
684+
neg_w_regex_skip.should_search_fail(L"xxxx\x0662xxxx"); // U+0662 ARABIC-INDIC DIGIT TWO
685+
}
686+
{
687+
const test_wregex neg_s_regex_skip(&g_regexTester, LR"([\S])");
688+
neg_s_regex_skip.should_search_match(L" \x03C7 ", L"\x03C7"); // U+03C7 GREEK SMALL LETTER CHI
689+
neg_s_regex_skip.should_search_match(L" \x0662 ", L"\x0662"); // U+0662 ARABIC-INDIC DIGIT TWO
690+
neg_s_regex_skip.should_search_fail(L" \x2009 "); // U+2009 THIN SPACE
691+
}
692+
{
693+
const test_wregex neg_d_regex_skip(&g_regexTester, LR"([\D])");
694+
neg_d_regex_skip.should_search_match(L"1623\x03C7"s + L"253", L"\x03C7"); // U+03C7 GREEK SMALL LETTER CHI
695+
neg_d_regex_skip.should_search_match(L"1623\x2009"s + L"253", L"\x2009"); // U+2009 THIN SPACE
696+
neg_d_regex_skip.should_search_fail(L"1623\x0662"s + L"253"); // U+0662 ARABIC-INDIC DIGIT TWO
697+
}
698+
{
699+
const test_wregex double_negative_w(&g_regexTester, LR"([^\W])");
700+
double_negative_w.should_search_match(L"\x03C7", L"\x03C7"); // U+03C7 GREEK SMALL LETTER CHI
701+
double_negative_w.should_search_match(L"\x0662", L"\x0662"); // U+0662 ARABIC-INDIC DIGIT TWO
702+
double_negative_w.should_search_fail(L"\x2009"); // U+2009 THIN SPACE
703+
}
704+
{
705+
const test_wregex double_negative_s(&g_regexTester, LR"([^\S])");
706+
double_negative_s.should_search_fail(L"\x03C7"); // U+03C7 GREEK SMALL LETTER CHI
707+
double_negative_s.should_search_fail(L"\x0662"); // U+0662 ARABIC-INDIC DIGIT TWO
708+
double_negative_s.should_search_match(L"\x2009", L"\x2009"); // U+2009 THIN SPACE
709+
}
710+
{
711+
const test_wregex double_negative_d(&g_regexTester, LR"([^\D])");
712+
double_negative_d.should_search_fail(L"\x03C7"); // U+03C7 GREEK SMALL LETTER CHI
713+
double_negative_d.should_search_match(L"\x0662", L"\x0662"); // U+0662 ARABIC-INDIC DIGIT TWO
714+
double_negative_d.should_search_fail(L"\x2009"); // U+2009 THIN SPACE
715+
}
716+
for (const wstring& pattern : {LR"([\w\W])", LR"([\s\S])", LR"([\d\D])"}) {
717+
const test_wregex omni_regex(&g_regexTester, pattern);
718+
omni_regex.should_search_match(L"\x03C7", L"\x03C7"); // U+03C7 GREEK SMALL LETTER CHI
719+
omni_regex.should_search_match(L"\x0662", L"\x0662"); // U+0662 ARABIC-INDIC DIGIT TWO
720+
omni_regex.should_search_match(L"\x2009", L"\x2009"); // U+2009 THIN SPACE
721+
}
722+
}
723+
639724
void test_gh_993() {
640725
// GH-993 regex::icase is not handled correctly for some input.
641726
{
@@ -751,11 +836,51 @@ void test_gh_5058() {
751836
void test_gh_5160() {
752837
// GH-5160 fixed mishandled negated character class escapes
753838
// outside character class definitions
754-
const test_wregex neg_regex(&g_regexTester, LR"(Y\S*Z)");
755-
neg_regex.should_search_match(L"xxxYxx\x0078xxxZxxx", L"Yxx\x0078xxxZ"); // U+0078 LATIN SMALL LETTER X
756-
neg_regex.should_search_match(L"xxxYxx\x03C7xxxZxxx", L"Yxx\x03C7xxxZ"); // U+03C7 GREEK SMALL LETTER CHI
757-
neg_regex.should_search_fail(L"xxxYxx xxxZxxx");
758-
neg_regex.should_search_fail(L"xxxYxx\x2009xxxZxxx"); // U+2009 THIN SPACE
839+
{
840+
const test_wregex neg_w_regex(&g_regexTester, LR"(Y\W*Z)");
841+
neg_w_regex.should_search_match(L"xxxY Zxxx", L"Y Z");
842+
neg_w_regex.should_search_match(L"xxxY \x2009 Zxxx", L"Y \x2009 Z"); // U+2009 THIN SPACE
843+
neg_w_regex.should_search_fail(L"xxxY \x0078 Zxxx"); // U+0078 LATIN SMALL LETTER X
844+
neg_w_regex.should_search_fail(L"xxxY \x03C7 Zxxx"); // U+03C7 GREEK SMALL LETTER CHI
845+
neg_w_regex.should_search_fail(L"xxxY 3 Zxxx");
846+
neg_w_regex.should_search_fail(L"xxxY \x0662 Zxxx"); // U+0662 ARABIC-INDIC DIGIT TWO
847+
}
848+
{
849+
const test_wregex neg_s_regex(&g_regexTester, LR"(Y\S*Z)");
850+
neg_s_regex.should_search_match(L"xxxYxx\x0078xxxZxxx", L"Yxx\x0078xxxZ"); // U+0078 LATIN SMALL LETTER X
851+
neg_s_regex.should_search_match(L"xxxYxx\x03C7xxxZxxx", L"Yxx\x03C7xxxZ"); // U+03C7 GREEK SMALL LETTER CHI
852+
neg_s_regex.should_search_match(L"xxxYxx3xxxZxxx", L"Yxx3xxxZ");
853+
neg_s_regex.should_search_match(L"xxxYxx\x0662xxxZxxx", L"Yxx\x0662xxxZ"); // U+0662 ARABIC-INDIC DIGIT TWO
854+
neg_s_regex.should_search_fail(L"xxxYxx xxxZxxx");
855+
neg_s_regex.should_search_fail(L"xxxYxx\x2009xxxZxxx"); // U+2009 THIN SPACE
856+
}
857+
{
858+
const test_wregex neg_d_regex(&g_regexTester, LR"(Y\D*Z)");
859+
neg_d_regex.should_search_match(L"xxxYxx\x0078xxxZxxx", L"Yxx\x0078xxxZ"); // U+0078 LATIN SMALL LETTER X
860+
neg_d_regex.should_search_match(L"xxxYxx\x03C7xxxZxxx", L"Yxx\x03C7xxxZ"); // U+03C7 GREEK SMALL LETTER CHI
861+
neg_d_regex.should_search_match(L"xxxYxx xxxZxxx", L"Yxx xxxZ");
862+
neg_d_regex.should_search_match(L"xxxYxx\x2009xxxZxxx", L"Yxx\x2009xxxZ"); // U+2009 THIN SPACE
863+
neg_d_regex.should_search_fail(L"xxxYxx3xxxZxxx");
864+
neg_d_regex.should_search_fail(L"xxxYxx\x0662xxxZxxx"); // U+0662 ARABIC-INDIC DIGIT TWO
865+
}
866+
{
867+
const test_wregex neg_w_regex_skip(&g_regexTester, LR"(\W)");
868+
neg_w_regex_skip.should_search_match(L"xxxx\x2009xxxx", L"\x2009"); // U+2009 THIN SPACE
869+
neg_w_regex_skip.should_search_fail(L"xxxx\x03C7xxxx"); // U+03C7 GREEK SMALL LETTER CHI
870+
neg_w_regex_skip.should_search_fail(L"xxxx\x0662xxxx"); // U+0662 ARABIC-INDIC DIGIT TWO
871+
}
872+
{
873+
const test_wregex neg_s_regex_skip(&g_regexTester, LR"(\S)");
874+
neg_s_regex_skip.should_search_match(L" \x03C7 ", L"\x03C7"); // U+03C7 GREEK SMALL LETTER CHI
875+
neg_s_regex_skip.should_search_match(L" \x0662 ", L"\x0662"); // U+0662 ARABIC-INDIC DIGIT TWO
876+
neg_s_regex_skip.should_search_fail(L" \x2009 "); // U+2009 THIN SPACE
877+
}
878+
{
879+
const test_wregex neg_d_regex_skip(&g_regexTester, LR"(\D)");
880+
neg_d_regex_skip.should_search_match(L"1623\x03C7"s + L"253", L"\x03C7"); // U+03C7 GREEK SMALL LETTER CHI
881+
neg_d_regex_skip.should_search_match(L"1623\x2009"s + L"253", L"\x2009"); // U+2009 THIN SPACE
882+
neg_d_regex_skip.should_search_fail(L"1623\x0662"s + L"253"); // U+0662 ARABIC-INDIC DIGIT TWO
883+
}
759884
}
760885

761886
void test_gh_5165_syntax_option(const syntax_option_type basic_or_grep) {
@@ -1526,6 +1651,7 @@ int main() {
15261651
test_VSO_226914_word_boundaries();
15271652
test_construction_from_nullptr_and_zero();
15281653
test_gh_731();
1654+
test_gh_992();
15291655
test_gh_993();
15301656
test_gh_4995();
15311657
test_gh_5058();

0 commit comments

Comments
 (0)