Skip to content

Commit 8400f12

Browse files
committed
Support auto-atomicity on {lazy} loops
Today, loops are only converted to atomic loops when they are single-character loops (e.g. a* or [abc]*). This PR augments the logic to support arbitrary loops, enabling many more loops to become atomic.
1 parent 97d5ac8 commit 8400f12

File tree

2 files changed

+158
-34
lines changed

2 files changed

+158
-34
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 120 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ private void MakeRep(RegexNodeKind kind, int min, int max)
154154
N = max;
155155
}
156156

157-
private void MakeLoopAtomic()
157+
private void MakeLoopAtomic(bool isEnding = false)
158158
{
159159
switch (Kind)
160160
{
@@ -165,11 +165,15 @@ private void MakeLoopAtomic()
165165
break;
166166

167167
case RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy:
168-
// For lazy, we not only change the Type, we also lower the max number of iterations
169-
// to the minimum number of iterations, creating a repeater, as they should end up
170-
// matching as little as possible.
168+
// For lazy, we not only change the Type; if we're at the end of the pattern,
169+
// we also lower the max number of iterations to the minimum number of iterations,
170+
// creating a repeater, as they should end up matching as little as possible.
171171
Kind += RegexNodeKind.Oneloopatomic - RegexNodeKind.Onelazy;
172-
N = M;
172+
if (isEnding)
173+
{
174+
N = M;
175+
}
176+
173177
if (N == 0)
174178
{
175179
// If moving the max to be the same as the min dropped it to 0, there's no
@@ -189,6 +193,40 @@ private void MakeLoopAtomic()
189193
}
190194
break;
191195

196+
case RegexNodeKind.Lazyloop:
197+
// For lazy, we not only change the Type; if we're at the end of the pattern,
198+
// we also lower the max number of iterations to the minimum number of iterations,
199+
// creating a repeater, as they should end up matching as little as possible.
200+
if (isEnding)
201+
{
202+
N = M;
203+
}
204+
205+
if (N == 0)
206+
{
207+
// If moving the max to be the same as the min dropped it to 0, there's no
208+
// work to be done for this node, and we can make it Empty.
209+
Kind = RegexNodeKind.Empty;
210+
Children = null;
211+
}
212+
else
213+
{
214+
// change it to be a greedy loop
215+
goto case RegexNodeKind.Loop;
216+
}
217+
break;
218+
219+
case RegexNodeKind.Loop:
220+
if (Parent is not { Kind: RegexNodeKind.Atomic })
221+
{
222+
RegexNode loopAsChild = new(RegexNodeKind.Loop, Options, M, N);
223+
Kind = RegexNodeKind.Atomic;
224+
M = N = 0;
225+
loopAsChild.AddChild(Child(0));
226+
ReplaceChild(0, loopAsChild);
227+
}
228+
break;
229+
192230
default:
193231
Debug.Fail($"Unexpected type: {Kind}");
194232
break;
@@ -430,7 +468,7 @@ private void EliminateEndingBacktracking()
430468
// or even empty nodes.
431469
case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop:
432470
case RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy:
433-
node.MakeLoopAtomic();
471+
node.MakeLoopAtomic(isEnding: true);
434472
break;
435473

436474
// Just because a particular node is atomic doesn't mean all its descendants are.
@@ -451,7 +489,7 @@ private void EliminateEndingBacktracking()
451489
case RegexNodeKind.Concatenate:
452490
RegexNode existingChild = node.Child(node.ChildCount() - 1);
453491
if ((existingChild.Kind is RegexNodeKind.Alternate or RegexNodeKind.BackreferenceConditional or RegexNodeKind.ExpressionConditional or RegexNodeKind.Loop or RegexNodeKind.Lazyloop) &&
454-
(node.Parent is null || node.Parent.Kind != RegexNodeKind.Atomic)) // validate grandparent isn't atomic
492+
node.Parent is not { Kind: RegexNodeKind.Atomic }) // validate grandparent isn't atomic
455493
{
456494
var atomic = new RegexNode(RegexNodeKind.Atomic, existingChild.Options);
457495
atomic.AddChild(existingChild);
@@ -493,20 +531,33 @@ private void EliminateEndingBacktracking()
493531
goto case RegexNodeKind.Loop;
494532
case RegexNodeKind.Loop:
495533
{
496-
if (node.N == 1)
534+
// Make the loop atomic, if it isn't already. This entails changing node to instead be an Atomic node
535+
// that has the {Lazy}Loop as its child. If the parent of the loop is already Atomic, this will be a nop.
536+
node.MakeLoopAtomic(isEnding: true);
537+
Debug.Assert(node.Kind is RegexNodeKind.Atomic or RegexNodeKind.Empty or RegexNodeKind.Loop or RegexNodeKind.Lazyloop);
538+
539+
if (node.Kind is RegexNodeKind.Atomic)
497540
{
498-
// If the loop has a max iteration count of 1 (e.g. it's an optional node),
499-
// there's no possibility for conflict between multiple iterations, so
500-
// we can process it.
501541
node = node.Child(0);
502-
continue;
503542
}
504543

505-
RegexNode? loopDescendent = node.FindLastExpressionInLoopForAutoAtomic();
506-
if (loopDescendent != null)
544+
if (node.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop)
507545
{
508-
node = loopDescendent;
509-
continue; // loop around to process node
546+
if (node.N == 1)
547+
{
548+
// If the loop has a max iteration count of 1 (e.g. it's an optional node),
549+
// there's no possibility for conflict between multiple iterations, so
550+
// we can process it.
551+
node = node.Child(0);
552+
continue;
553+
}
554+
555+
RegexNode? loopDescendent = node.FindLastExpressionInLoopForAutoAtomic();
556+
if (loopDescendent != null)
557+
{
558+
node = loopDescendent;
559+
continue; // loop around to process node
560+
}
510561
}
511562
}
512563
break;
@@ -635,7 +686,7 @@ private RegexNode ReduceAtomic()
635686
case RegexNodeKind.Onelazy:
636687
case RegexNodeKind.Notonelazy:
637688
case RegexNodeKind.Setlazy:
638-
child.MakeLoopAtomic();
689+
child.MakeLoopAtomic(isEnding: true);
639690
return child;
640691

641692
// Alternations have a variety of possible optimizations that can be applied
@@ -1876,7 +1927,7 @@ static void ProcessNode(RegexNode node, RegexNode subsequent)
18761927
// If the node can be changed to atomic based on what comes after it, do so.
18771928
switch (node.Kind)
18781929
{
1879-
case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop when CanBeMadeAtomic(node, subsequent, iterateNullableSubsequent: true, allowLazy: false):
1930+
case RegexNodeKind.Oneloop or RegexNodeKind.Notoneloop or RegexNodeKind.Setloop or RegexNodeKind.Loop when CanBeMadeAtomic(node, subsequent, iterateNullableSubsequent: true, allowLazy: false):
18801931
// The greedy loop doesn't overlap with what comes after it, which means giving anything it matches back will not
18811932
// help the overall match to succeed, which means it can simply become atomic to match as much as possible. The call
18821933
// to CanBeMadeAtomic passes iterateNullableSubsequent=true because, in a pattern like a*b*c*, when analyzing a*, we
@@ -1885,7 +1936,7 @@ static void ProcessNode(RegexNode node, RegexNode subsequent)
18851936
node.MakeLoopAtomic();
18861937
break;
18871938

1888-
case RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy when CanBeMadeAtomic(node, subsequent, iterateNullableSubsequent: false, allowLazy: true):
1939+
case RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy or RegexNodeKind.Setlazy or RegexNodeKind.Lazyloop when CanBeMadeAtomic(node, subsequent, iterateNullableSubsequent: false, allowLazy: true):
18891940
// The lazy loop doesn't overlap with what comes after it, which means it needs to match as much as it's allowed
18901941
// to match in order for there to be a possibility that what comes next matches (if it doesn't match as much
18911942
// as it's allowed and there was still more it could match, then what comes next is guaranteed to not match,
@@ -1901,7 +1952,10 @@ static void ProcessNode(RegexNode node, RegexNode subsequent)
19011952
// allowLazy is set to true so that the implementation will analyze rather than ignore this node; generally lazy nodes
19021953
// are ignored due to making them atomic not generally being a sound change, but here we're explicitly choosing to
19031954
// given the circumstances.
1904-
node.Kind -= RegexNodeKind.Onelazy - RegexNodeKind.Oneloop; // lazy to greedy
1955+
if (node.Kind is not RegexNodeKind.Lazyloop)
1956+
{
1957+
node.Kind -= RegexNodeKind.Onelazy - RegexNodeKind.Oneloop; // lazy to greedy
1958+
}
19051959
node.MakeLoopAtomic();
19061960
break;
19071961

@@ -2193,6 +2247,52 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
21932247
}
21942248
break;
21952249

2250+
case RegexNodeKind.Loop:
2251+
case RegexNodeKind.Lazyloop when allowLazy:
2252+
// With single character loops (e.g. OneLoop, NotOneLoop, SetLoop), we only need to prove there's no overlap between
2253+
// what that single character could be and what comes next. For arbitrary loops, we have more to prove. First, we need
2254+
// to understand what the loop can possibly start with and what it can possibly end with (with a single character loop,
2255+
// those are the same things), and we need to ensure that there's no overlap between those two sets; otherwise, a second
2256+
// iteration of a loop could end up giving back characters that could be consumed by the previous iteration. Second, we need
2257+
// to ensure that neither the starting set nor the ending set overlaps with what could possibly come after it, for the same reason.
2258+
if (RegexPrefixAnalyzer.FindFirstCharClass(node) is not string loopStartingSet ||
2259+
RegexPrefixAnalyzer.FindLastCharClass(node) is not string loopEndingSet ||
2260+
RegexCharClass.MayOverlap(loopStartingSet, loopEndingSet))
2261+
{
2262+
return false;
2263+
}
2264+
2265+
bool CharInStartingOrEndingSet(char ch) =>
2266+
RegexCharClass.CharInClass(ch, loopStartingSet) || RegexCharClass.CharInClass(ch, loopEndingSet);
2267+
2268+
bool MayOverlapStartingOrEndingSet(string set) =>
2269+
RegexCharClass.MayOverlap(set, loopStartingSet) || RegexCharClass.MayOverlap(set, loopEndingSet);
2270+
2271+
switch (subsequent.Kind)
2272+
{
2273+
case RegexNodeKind.One when !CharInStartingOrEndingSet(subsequent.Ch):
2274+
case RegexNodeKind.Set when !MayOverlapStartingOrEndingSet(subsequent.Str!):
2275+
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M > 0 && !CharInStartingOrEndingSet(subsequent.Ch):
2276+
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M > 0 && !MayOverlapStartingOrEndingSet(subsequent.Str!):
2277+
case RegexNodeKind.Multi when !CharInStartingOrEndingSet(subsequent.Str![0]):
2278+
case RegexNodeKind.End:
2279+
case RegexNodeKind.EndZ or RegexNodeKind.Eol when !CharInStartingOrEndingSet('\n'):
2280+
return true;
2281+
2282+
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !CharInStartingOrEndingSet(subsequent.Ch):
2283+
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !MayOverlapStartingOrEndingSet(subsequent.Str!):
2284+
case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsKnownWordClassSubset(loopStartingSet) && RegexCharClass.IsKnownWordClassSubset(loopEndingSet):
2285+
case RegexNodeKind.NonBoundary when node.M > 0 && (loopStartingSet is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass) && (loopEndingSet is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass):
2286+
case RegexNodeKind.ECMABoundary when node.M > 0 && (loopStartingSet is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass) && (loopEndingSet is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass):
2287+
case RegexNodeKind.NonECMABoundary when node.M > 0 && (loopStartingSet is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass) && (loopEndingSet is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass):
2288+
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
2289+
break;
2290+
2291+
default:
2292+
return false;
2293+
}
2294+
break;
2295+
21962296
default:
21972297
return false;
21982298
}

0 commit comments

Comments
 (0)