-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Further reduce loops around zero-width assertions #118111
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
In addition to replacing the loop with an empty when it contains only a zero-width assertion and has a lower bound of 0, if it doesn't have a lower bound of 0, we can just remove the loop layer, replacing it with its child directly, since any additional iterations beyond one is redundant.
@MihuBot regexdiff |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pull Request Overview
This PR optimizes regular expression compilation by further reducing redundant loops around zero-width assertions. The change expands the existing logic that already handled loops with minimum bound 0 to also handle loops with minimum bound greater than 0.
Key changes:
- Separates handling of Empty nodes from zero-width assertions for clarity
- Extends zero-width assertion loop reduction to handle cases where minimum bound > 0
- Removes the minimum bound check restriction, allowing more loop optimizations
Comments suppressed due to low confidence (1)
src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs:924
- The new logic that replaces loops with minimum bound > 0 around zero-width assertions with just the child node needs test coverage to ensure it correctly handles various edge cases, such as different types of zero-width assertions and different minimum bounds.
u = u.M == 0 ? new RegexNode(RegexNodeKind.Empty, Options) : child;
Tagging subscribers to this area: @dotnet/area-system-text-regularexpressions |
3 out of 18857 patterns have generated source code changes. Examples of GeneratedRegex source diffs"(?<=\\b)+demi\\s+douzaine" (106 uses)[GeneratedRegex("(?<=\\b)+demi\\s+douzaine", RegexOptions.IgnoreCase | RegexOptions.Singleline)] /// <code>RegexOptions.IgnoreCase | RegexOptions.Singleline</code><br/>
/// Explanation:<br/>
/// <code>
- /// ○ Loop greedily at least once right-to-left.<br/>
- /// ○ Zero-width positive lookbehind.<br/>
- /// ○ Match if at a word boundary.<br/>
+ /// ○ Zero-width positive lookbehind.<br/>
+ /// ○ Match if at a word boundary.<br/>
/// ○ Match a character in the set [Dd].<br/>
/// ○ Match a character in the set [Ee].<br/>
/// ○ Match a character in the set [Mm].<br/>
// Any possible match is at least 13 characters.
if (pos <= inputSpan.Length - 13)
{
- // The pattern begins with a character in the set [Dd].
- // Find the next occurrence. If it can't be found, there's no match.
- int i = inputSpan.Slice(pos).IndexOfAny('D', 'd');
+ // The pattern has the literal "demi" ordinal case-insensitive at the beginning of the pattern. Find the next occurrence.
+ // If it can't be found, there's no match.
+ int i = inputSpan.Slice(pos).IndexOfAny(Utilities.s_indexOfString_demi_OrdinalIgnoreCase);
if (i >= 0)
{
base.runtextpos = pos + i;
{
int pos = base.runtextpos;
int matchStart = pos;
- int loop_iteration = 0, loop_starting_pos = 0;
- int stackpos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
- // Loop greedily at least once right-to-left.
- //{
- loop_starting_pos = pos;
- loop_iteration = 0;
-
- LoopBody:
- Utilities.StackPush(ref base.runstack!, ref stackpos, loop_starting_pos, pos);
-
- loop_starting_pos = pos;
- loop_iteration++;
-
- // Zero-width positive lookbehind.
- {
- slice = inputSpan.Slice(pos);
- int positivelookbehind_starting_pos = pos;
-
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- // Match if at a word boundary.
- if (!Utilities.IsBoundary(inputSpan, pos))
- {
- goto LoopIterationNoMatch;
- }
-
- pos = positivelookbehind_starting_pos;
- slice = inputSpan.Slice(pos);
- }
-
-
- // The loop has a lower bound of 1 but no upper bound. Continue iterating greedily
- // if the last iteration wasn't empty (or if it was, if the lower bound hasn't yet been reached).
- if (pos != loop_starting_pos || loop_iteration == 0)
- {
- goto LoopBody;
- }
- goto LoopEnd;
-
- // The loop iteration failed. Put state back to the way it was before the iteration.
- LoopIterationNoMatch:
- if (--loop_iteration < 0)
- {
- // Unable to match the remainder of the expression after exhausting the loop.
- return false; // The input didn't match.
- }
- Utilities.StackPop(base.runstack!, ref stackpos, out pos, out loop_starting_pos);
+ // Zero-width positive lookbehind.
+ {
slice = inputSpan.Slice(pos);
- if (loop_iteration == 0)
+ int positivelookbehind_starting_pos = pos;
+
+ if (Utilities.s_hasTimeout)
+ {
+ base.CheckTimeout();
+ }
+
+ // Match if at a word boundary.
+ if (!Utilities.IsBoundary(inputSpan, pos))
{
- // All possible iterations have matched, but it's below the required minimum of 1. Fail the loop.
return false; // The input didn't match.
}
- LoopEnd:;
- //}
+ pos = positivelookbehind_starting_pos;
+ slice = inputSpan.Slice(pos);
+ }
if ((uint)slice.Length < 4 ||
!slice.StartsWith("demi", StringComparison.OrdinalIgnoreCase)) // Match the string "demi" (ordinal case-insensitive)
{
- goto LoopIterationNoMatch;
+ return false; // The input didn't match.
}
// Match a whitespace character atomically at least once.
if (iteration == 0)
{
- goto LoopIterationNoMatch;
+ return false; // The input didn't match.
}
slice = slice.Slice(iteration);
if ((uint)slice.Length < 8 ||
!slice.StartsWith("douzaine", StringComparison.OrdinalIgnoreCase)) // Match the string "douzaine" (ordinal case-insensitive)
{
- goto LoopIterationNoMatch;
+ return false; // The input didn't match.
}
// The input matched.
(WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
}
- /// <summary>Pops 2 values from the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPop(int[] stack, ref int pos, out int arg0, out int arg1)
- {
- arg0 = stack[--pos];
- arg1 = stack[--pos];
- }
-
- /// <summary>Pushes 2 values onto the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1)
- {
- // If there's space available for all 2 values, store them.
- int[] s = stack;
- int p = pos;
- if ((uint)(p + 1) < (uint)s.Length)
- {
- s[p] = arg0;
- s[p + 1] = arg1;
- pos += 2;
- return;
- }
-
- // Otherwise, resize the stack to make room and try again.
- WithResize(ref stack, ref pos, arg0, arg1);
-
- // <summary>Resize the backtracking stack array and push 2 values onto the stack.</summary>
- [MethodImpl(MethodImplOptions.NoInlining)]
- static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1)
- {
- Array.Resize(ref stack, (pos + 1) * 2);
- StackPush(ref stack, ref pos, arg0, arg1);
- }
- }
+ /// <summary>Supports searching for the string "demi".</summary>
+ internal static readonly SearchValues<string> s_indexOfString_demi_OrdinalIgnoreCase = SearchValues.Create(["demi"], StringComparison.OrdinalIgnoreCase);
}
} "^((?!-)+)([a-zA-Z_-]+$).*((?<!-)+)" (57 uses)[GeneratedRegex("^((?!-)+)([a-zA-Z_-]+$).*((?<!-)+)", RegexOptions.Singleline)] /// <code>
/// ○ Match if at the beginning of the string.<br/>
/// ○ 1st capture group.<br/>
- /// ○ Loop greedily at least once.<br/>
- /// ○ Zero-width negative lookahead.<br/>
- /// ○ Match '-'.<br/>
+ /// ○ Zero-width negative lookahead.<br/>
+ /// ○ Match '-'.<br/>
/// ○ 2nd capture group.<br/>
/// ○ Match a character in the set [\-A-Z_a-z] atomically at least once.<br/>
/// ○ Match if at the end of the string or if before an ending newline.<br/>
/// ○ Match any character greedily any number of times.<br/>
/// ○ 3rd capture group.<br/>
- /// ○ Loop greedily and atomically at least once right-to-left.<br/>
- /// ○ Zero-width negative lookbehind.<br/>
- /// ○ Match '-' right-to-left.<br/>
+ /// ○ Zero-width negative lookbehind.<br/>
+ /// ○ Match '-' right-to-left.<br/>
/// </code>
/// </remarks>
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
int capture_starting_pos2 = 0;
int charloop_capture_pos = 0;
int charloop_starting_pos = 0, charloop_ending_pos = 0;
- int loop_iteration = 0, loop_starting_pos = 0;
- int loop_iteration1 = 0, loop_starting_pos1 = 0;
- int stackpos = 0;
- int startingStackpos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
// Match if at the beginning of the string.
}
// 1st capture group.
- //{
+ {
capture_starting_pos = pos;
- // Loop greedily at least once.
- //{
- loop_starting_pos = pos;
- loop_iteration = 0;
-
- LoopBody:
- Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), loop_starting_pos, pos);
-
- loop_starting_pos = pos;
- loop_iteration++;
-
- // Zero-width negative lookahead.
- {
- slice = inputSpan.Slice(pos);
- int negativelookahead__starting_pos = pos;
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- // Match '-'.
- if (slice.IsEmpty || slice[0] != '-')
- {
- goto NegativeLookaroundMatch;
- }
-
- goto LoopIterationNoMatch;
-
- NegativeLookaroundMatch:
- pos = negativelookahead__starting_pos;
- slice = inputSpan.Slice(pos);
- }
-
-
- // The loop has a lower bound of 1 but no upper bound. Continue iterating greedily
- // if the last iteration wasn't empty (or if it was, if the lower bound hasn't yet been reached).
- if (pos != loop_starting_pos || loop_iteration == 0)
- {
- goto LoopBody;
- }
- goto LoopEnd;
-
- // The loop iteration failed. Put state back to the way it was before the iteration.
- LoopIterationNoMatch:
- if (--loop_iteration < 0)
- {
- // Unable to match the remainder of the expression after exhausting the loop.
- UncaptureUntil(0);
- return false; // The input didn't match.
- }
- Utilities.StackPop(base.runstack!, ref stackpos, out pos, out loop_starting_pos);
- UncaptureUntil(base.runstack![--stackpos]);
+ // Zero-width negative lookahead.
+ {
slice = inputSpan.Slice(pos);
- if (loop_iteration == 0)
+ int negativelookahead__starting_pos = pos;
+ if (Utilities.s_hasTimeout)
{
- // All possible iterations have matched, but it's below the required minimum of 1. Fail the loop.
- UncaptureUntil(0);
- return false; // The input didn't match.
+ base.CheckTimeout();
}
- LoopEnd:;
- //}
+ // Match '-'.
+ if (slice.IsEmpty || slice[0] != '-')
+ {
+ goto NegativeLookaroundMatch;
+ }
+
+ UncaptureUntil(0);
+ return false; // The input didn't match.
+
+ NegativeLookaroundMatch:
+ pos = negativelookahead__starting_pos;
+ slice = inputSpan.Slice(pos);
+ }
base.Capture(1, capture_starting_pos, pos);
-
- goto CaptureSkipBacktrack;
-
- CaptureBacktrack:
- goto LoopIterationNoMatch;
-
- CaptureSkipBacktrack:;
- //}
+ }
// 2nd capture group.
{
if (iteration == 0)
{
- goto CaptureBacktrack;
+ UncaptureUntil(0);
+ return false; // The input didn't match.
}
slice = slice.Slice(iteration);
// Match if at the end of the string or if before an ending newline.
if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n'))
{
- goto CaptureBacktrack;
+ UncaptureUntil(0);
+ return false; // The input didn't match.
}
base.Capture(2, capture_starting_pos1, pos);
if (charloop_starting_pos >= charloop_ending_pos)
{
- goto CaptureBacktrack;
+ UncaptureUntil(0);
+ return false; // The input didn't match.
}
pos = --charloop_ending_pos;
slice = inputSpan.Slice(pos);
{
capture_starting_pos2 = pos;
- // Loop greedily and atomically at least once right-to-left.
+ // Zero-width negative lookbehind.
{
- startingStackpos = stackpos;
- loop_starting_pos1 = pos;
- loop_iteration1 = 0;
-
- LoopBody1:
- Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), loop_starting_pos1, pos);
-
- loop_starting_pos1 = pos;
- loop_iteration1++;
-
- // Zero-width negative lookbehind.
- {
- slice = inputSpan.Slice(pos);
- int negativelookbehind__starting_pos = pos;
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- // Match '-' right-to-left.
- if ((uint)(pos - 1) >= inputSpan.Length || inputSpan[pos - 1] != '-')
- {
- goto NegativeLookaroundMatch1;
- }
- pos--;
-
- goto LoopIterationNoMatch1;
-
- NegativeLookaroundMatch1:
- pos = negativelookbehind__starting_pos;
- slice = inputSpan.Slice(pos);
- }
-
-
- // The loop has a lower bound of 1 but no upper bound. Continue iterating greedily
- // if the last iteration wasn't empty (or if it was, if the lower bound hasn't yet been reached).
- if (pos != loop_starting_pos1 || loop_iteration1 == 0)
- {
- goto LoopBody1;
- }
- goto LoopEnd1;
-
- // The loop iteration failed. Put state back to the way it was before the iteration.
- LoopIterationNoMatch1:
- if (--loop_iteration1 < 0)
- {
- // Unable to match the remainder of the expression after exhausting the loop.
- goto CharLoopBacktrack;
- }
- Utilities.StackPop(base.runstack!, ref stackpos, out pos, out loop_starting_pos1);
- UncaptureUntil(base.runstack![--stackpos]);
slice = inputSpan.Slice(pos);
- if (loop_iteration1 == 0)
+ int negativelookbehind__starting_pos = pos;
+ if (Utilities.s_hasTimeout)
{
- // All possible iterations have matched, but it's below the required minimum of 1. Fail the loop.
- goto CharLoopBacktrack;
+ base.CheckTimeout();
}
- LoopEnd1:
- stackpos = startingStackpos; // Ensure any remaining backtracking state is removed.
+ // Match '-' right-to-left.
+ if ((uint)(pos - 1) >= inputSpan.Length || inputSpan[pos - 1] != '-')
+ {
+ goto NegativeLookaroundMatch1;
+ }
+ pos--;
+
+ goto CharLoopBacktrack;
+
+ NegativeLookaroundMatch1:
+ pos = negativelookbehind__starting_pos;
+ slice = inputSpan.Slice(pos);
}
base.Capture(3, capture_starting_pos2, pos);
/// <summary>Whether <see cref="s_defaultTimeout"/> is non-infinite.</summary>
internal static readonly bool s_hasTimeout = s_defaultTimeout != Regex.InfiniteMatchTimeout;
- /// <summary>Pops 2 values from the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPop(int[] stack, ref int pos, out int arg0, out int arg1)
- {
- arg0 = stack[--pos];
- arg1 = stack[--pos];
- }
-
- /// <summary>Pushes 3 values onto the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
- {
- // If there's space available for all 3 values, store them.
- int[] s = stack;
- int p = pos;
- if ((uint)(p + 2) < (uint)s.Length)
- {
- s[p] = arg0;
- s[p + 1] = arg1;
- s[p + 2] = arg2;
- pos += 3;
- return;
- }
-
- // Otherwise, resize the stack to make room and try again.
- WithResize(ref stack, ref pos, arg0, arg1, arg2);
-
- // <summary>Resize the backtracking stack array and push 3 values onto the stack.</summary>
- [MethodImpl(MethodImplOptions.NoInlining)]
- static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
- {
- Array.Resize(ref stack, (pos + 2) * 2);
- StackPush(ref stack, ref pos, arg0, arg1, arg2);
- }
- }
-
/// <summary>Supports searching for characters in or not in "-ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz".</summary>
internal static readonly SearchValues<char> s_ascii_200000FEFFFF87FEFFFF07 = SearchValues.Create("-ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz");
} For more diff examples, see https://gist.github.com/MihuBot/a15ee2958fd2e1688755cccbc6104e54 JIT assembly changes
For a list of JIT diff regressions, see Regressions.md Sample source code for further analysisconst string JsonPath = "RegexResults-1306.json";
if (!File.Exists(JsonPath))
{
await using var archiveStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/E2r7wB2A");
using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);
archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}
using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");
record KnownPattern(string Pattern, RegexOptions Options, int Count);
sealed class RegexEntry
{
public required KnownPattern Regex { get; set; }
public required string MainSource { get; set; }
public required string PrSource { get; set; }
public string? FullDiff { get; set; }
public string? ShortDiff { get; set; }
public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
} |
@MihuBot regexdiff |
9 out of 18857 patterns have generated source code changes. Examples of GeneratedRegex source diffs"(?<=\\b)+demi\\s+douzaine" (106 uses)[GeneratedRegex("(?<=\\b)+demi\\s+douzaine", RegexOptions.IgnoreCase | RegexOptions.Singleline)] /// <code>RegexOptions.IgnoreCase | RegexOptions.Singleline</code><br/>
/// Explanation:<br/>
/// <code>
- /// ○ Loop greedily at least once right-to-left.<br/>
- /// ○ Zero-width positive lookbehind.<br/>
- /// ○ Match if at a word boundary.<br/>
+ /// ○ Zero-width positive lookbehind.<br/>
+ /// ○ Match if at a word boundary.<br/>
/// ○ Match a character in the set [Dd].<br/>
/// ○ Match a character in the set [Ee].<br/>
/// ○ Match a character in the set [Mm].<br/>
// Any possible match is at least 13 characters.
if (pos <= inputSpan.Length - 13)
{
- // The pattern begins with a character in the set [Dd].
- // Find the next occurrence. If it can't be found, there's no match.
- int i = inputSpan.Slice(pos).IndexOfAny('D', 'd');
+ // The pattern has the literal "demi" ordinal case-insensitive at the beginning of the pattern. Find the next occurrence.
+ // If it can't be found, there's no match.
+ int i = inputSpan.Slice(pos).IndexOfAny(Utilities.s_indexOfString_demi_OrdinalIgnoreCase);
if (i >= 0)
{
base.runtextpos = pos + i;
{
int pos = base.runtextpos;
int matchStart = pos;
- int loop_iteration = 0, loop_starting_pos = 0;
- int stackpos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
- // Loop greedily at least once right-to-left.
- //{
- loop_starting_pos = pos;
- loop_iteration = 0;
-
- LoopBody:
- Utilities.StackPush(ref base.runstack!, ref stackpos, loop_starting_pos, pos);
-
- loop_starting_pos = pos;
- loop_iteration++;
-
- // Zero-width positive lookbehind.
- {
- slice = inputSpan.Slice(pos);
- int positivelookbehind_starting_pos = pos;
-
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- // Match if at a word boundary.
- if (!Utilities.IsBoundary(inputSpan, pos))
- {
- goto LoopIterationNoMatch;
- }
-
- pos = positivelookbehind_starting_pos;
- slice = inputSpan.Slice(pos);
- }
-
-
- // The loop has a lower bound of 1 but no upper bound. Continue iterating greedily
- // if the last iteration wasn't empty (or if it was, if the lower bound hasn't yet been reached).
- if (pos != loop_starting_pos || loop_iteration == 0)
- {
- goto LoopBody;
- }
- goto LoopEnd;
-
- // The loop iteration failed. Put state back to the way it was before the iteration.
- LoopIterationNoMatch:
- if (--loop_iteration < 0)
- {
- // Unable to match the remainder of the expression after exhausting the loop.
- return false; // The input didn't match.
- }
- Utilities.StackPop(base.runstack!, ref stackpos, out pos, out loop_starting_pos);
+ // Zero-width positive lookbehind.
+ {
slice = inputSpan.Slice(pos);
- if (loop_iteration == 0)
+ int positivelookbehind_starting_pos = pos;
+
+ if (Utilities.s_hasTimeout)
+ {
+ base.CheckTimeout();
+ }
+
+ // Match if at a word boundary.
+ if (!Utilities.IsBoundary(inputSpan, pos))
{
- // All possible iterations have matched, but it's below the required minimum of 1. Fail the loop.
return false; // The input didn't match.
}
- LoopEnd:;
- //}
+ pos = positivelookbehind_starting_pos;
+ slice = inputSpan.Slice(pos);
+ }
if ((uint)slice.Length < 4 ||
!slice.StartsWith("demi", StringComparison.OrdinalIgnoreCase)) // Match the string "demi" (ordinal case-insensitive)
{
- goto LoopIterationNoMatch;
+ return false; // The input didn't match.
}
// Match a whitespace character atomically at least once.
if (iteration == 0)
{
- goto LoopIterationNoMatch;
+ return false; // The input didn't match.
}
slice = slice.Slice(iteration);
if ((uint)slice.Length < 8 ||
!slice.StartsWith("douzaine", StringComparison.OrdinalIgnoreCase)) // Match the string "douzaine" (ordinal case-insensitive)
{
- goto LoopIterationNoMatch;
+ return false; // The input didn't match.
}
// The input matched.
(WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
}
- /// <summary>Pops 2 values from the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPop(int[] stack, ref int pos, out int arg0, out int arg1)
- {
- arg0 = stack[--pos];
- arg1 = stack[--pos];
- }
-
- /// <summary>Pushes 2 values onto the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1)
- {
- // If there's space available for all 2 values, store them.
- int[] s = stack;
- int p = pos;
- if ((uint)(p + 1) < (uint)s.Length)
- {
- s[p] = arg0;
- s[p + 1] = arg1;
- pos += 2;
- return;
- }
-
- // Otherwise, resize the stack to make room and try again.
- WithResize(ref stack, ref pos, arg0, arg1);
-
- // <summary>Resize the backtracking stack array and push 2 values onto the stack.</summary>
- [MethodImpl(MethodImplOptions.NoInlining)]
- static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1)
- {
- Array.Resize(ref stack, (pos + 1) * 2);
- StackPush(ref stack, ref pos, arg0, arg1);
- }
- }
+ /// <summary>Supports searching for the string "demi".</summary>
+ internal static readonly SearchValues<string> s_indexOfString_demi_OrdinalIgnoreCase = SearchValues.Create(["demi"], StringComparison.OrdinalIgnoreCase);
}
} "measles$/" (74 uses)[GeneratedRegex("measles$/", RegexOptions.IgnoreCase)] // Any possible match is at least 8 characters.
if (pos <= inputSpan.Length - 8)
{
- // The pattern has the literal "measles" ordinal case-insensitive at the beginning of the pattern. Find the next occurrence.
+ // The pattern has the literal "measles/" ordinal case-insensitive at the beginning of the pattern. Find the next occurrence.
// If it can't be found, there's no match.
- int i = inputSpan.Slice(pos).IndexOfAny(Utilities.s_indexOfString_measles_OrdinalIgnoreCase);
+ int i = inputSpan.Slice(pos).IndexOfAny(Utilities.s_indexOfString_5DE5CF41BC14B96979EE7612269C6B4F3E8639BAE77421938F36FC03C4D4DE27);
if (i >= 0)
{
base.runtextpos = pos + i;
/// <summary>Whether <see cref="s_defaultTimeout"/> is non-infinite.</summary>
internal static readonly bool s_hasTimeout = s_defaultTimeout != Regex.InfiniteMatchTimeout;
- /// <summary>Supports searching for the string "measles".</summary>
- internal static readonly SearchValues<string> s_indexOfString_measles_OrdinalIgnoreCase = SearchValues.Create(["measles"], StringComparison.OrdinalIgnoreCase);
+ /// <summary>Supports searching for the string "measles/".</summary>
+ internal static readonly SearchValues<string> s_indexOfString_5DE5CF41BC14B96979EE7612269C6B4F3E8639BAE77421938F36FC03C4D4DE27 = SearchValues.Create(["measles/"], StringComparison.OrdinalIgnoreCase);
}
} "^((?!-)+)([a-zA-Z_-]+$).*((?<!-)+)" (57 uses)[GeneratedRegex("^((?!-)+)([a-zA-Z_-]+$).*((?<!-)+)", RegexOptions.Singleline)] /// <code>
/// ○ Match if at the beginning of the string.<br/>
/// ○ 1st capture group.<br/>
- /// ○ Loop greedily at least once.<br/>
- /// ○ Zero-width negative lookahead.<br/>
- /// ○ Match '-'.<br/>
+ /// ○ Zero-width negative lookahead.<br/>
+ /// ○ Match '-'.<br/>
/// ○ 2nd capture group.<br/>
/// ○ Match a character in the set [\-A-Z_a-z] atomically at least once.<br/>
/// ○ Match if at the end of the string or if before an ending newline.<br/>
/// ○ Match any character greedily any number of times.<br/>
/// ○ 3rd capture group.<br/>
- /// ○ Loop greedily and atomically at least once right-to-left.<br/>
- /// ○ Zero-width negative lookbehind.<br/>
- /// ○ Match '-' right-to-left.<br/>
+ /// ○ Zero-width negative lookbehind.<br/>
+ /// ○ Match '-' right-to-left.<br/>
/// </code>
/// </remarks>
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "42.42.42.42")]
int capture_starting_pos2 = 0;
int charloop_capture_pos = 0;
int charloop_starting_pos = 0, charloop_ending_pos = 0;
- int loop_iteration = 0, loop_starting_pos = 0;
- int loop_iteration1 = 0, loop_starting_pos1 = 0;
- int stackpos = 0;
- int startingStackpos = 0;
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
// Match if at the beginning of the string.
}
// 1st capture group.
- //{
+ {
capture_starting_pos = pos;
- // Loop greedily at least once.
- //{
- loop_starting_pos = pos;
- loop_iteration = 0;
-
- LoopBody:
- Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), loop_starting_pos, pos);
-
- loop_starting_pos = pos;
- loop_iteration++;
-
- // Zero-width negative lookahead.
- {
- slice = inputSpan.Slice(pos);
- int negativelookahead__starting_pos = pos;
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- // Match '-'.
- if (slice.IsEmpty || slice[0] != '-')
- {
- goto NegativeLookaroundMatch;
- }
-
- goto LoopIterationNoMatch;
-
- NegativeLookaroundMatch:
- pos = negativelookahead__starting_pos;
- slice = inputSpan.Slice(pos);
- }
-
-
- // The loop has a lower bound of 1 but no upper bound. Continue iterating greedily
- // if the last iteration wasn't empty (or if it was, if the lower bound hasn't yet been reached).
- if (pos != loop_starting_pos || loop_iteration == 0)
- {
- goto LoopBody;
- }
- goto LoopEnd;
-
- // The loop iteration failed. Put state back to the way it was before the iteration.
- LoopIterationNoMatch:
- if (--loop_iteration < 0)
- {
- // Unable to match the remainder of the expression after exhausting the loop.
- UncaptureUntil(0);
- return false; // The input didn't match.
- }
- Utilities.StackPop(base.runstack!, ref stackpos, out pos, out loop_starting_pos);
- UncaptureUntil(base.runstack![--stackpos]);
+ // Zero-width negative lookahead.
+ {
slice = inputSpan.Slice(pos);
- if (loop_iteration == 0)
+ int negativelookahead__starting_pos = pos;
+ if (Utilities.s_hasTimeout)
{
- // All possible iterations have matched, but it's below the required minimum of 1. Fail the loop.
- UncaptureUntil(0);
- return false; // The input didn't match.
+ base.CheckTimeout();
}
- LoopEnd:;
- //}
+ // Match '-'.
+ if (slice.IsEmpty || slice[0] != '-')
+ {
+ goto NegativeLookaroundMatch;
+ }
+
+ UncaptureUntil(0);
+ return false; // The input didn't match.
+
+ NegativeLookaroundMatch:
+ pos = negativelookahead__starting_pos;
+ slice = inputSpan.Slice(pos);
+ }
base.Capture(1, capture_starting_pos, pos);
-
- goto CaptureSkipBacktrack;
-
- CaptureBacktrack:
- goto LoopIterationNoMatch;
-
- CaptureSkipBacktrack:;
- //}
+ }
// 2nd capture group.
{
if (iteration == 0)
{
- goto CaptureBacktrack;
+ UncaptureUntil(0);
+ return false; // The input didn't match.
}
slice = slice.Slice(iteration);
// Match if at the end of the string or if before an ending newline.
if (pos < inputSpan.Length - 1 || ((uint)pos < (uint)inputSpan.Length && inputSpan[pos] != '\n'))
{
- goto CaptureBacktrack;
+ UncaptureUntil(0);
+ return false; // The input didn't match.
}
base.Capture(2, capture_starting_pos1, pos);
if (charloop_starting_pos >= charloop_ending_pos)
{
- goto CaptureBacktrack;
+ UncaptureUntil(0);
+ return false; // The input didn't match.
}
pos = --charloop_ending_pos;
slice = inputSpan.Slice(pos);
{
capture_starting_pos2 = pos;
- // Loop greedily and atomically at least once right-to-left.
+ // Zero-width negative lookbehind.
{
- startingStackpos = stackpos;
- loop_starting_pos1 = pos;
- loop_iteration1 = 0;
-
- LoopBody1:
- Utilities.StackPush(ref base.runstack!, ref stackpos, base.Crawlpos(), loop_starting_pos1, pos);
-
- loop_starting_pos1 = pos;
- loop_iteration1++;
-
- // Zero-width negative lookbehind.
- {
- slice = inputSpan.Slice(pos);
- int negativelookbehind__starting_pos = pos;
- if (Utilities.s_hasTimeout)
- {
- base.CheckTimeout();
- }
-
- // Match '-' right-to-left.
- if ((uint)(pos - 1) >= inputSpan.Length || inputSpan[pos - 1] != '-')
- {
- goto NegativeLookaroundMatch1;
- }
- pos--;
-
- goto LoopIterationNoMatch1;
-
- NegativeLookaroundMatch1:
- pos = negativelookbehind__starting_pos;
- slice = inputSpan.Slice(pos);
- }
-
-
- // The loop has a lower bound of 1 but no upper bound. Continue iterating greedily
- // if the last iteration wasn't empty (or if it was, if the lower bound hasn't yet been reached).
- if (pos != loop_starting_pos1 || loop_iteration1 == 0)
- {
- goto LoopBody1;
- }
- goto LoopEnd1;
-
- // The loop iteration failed. Put state back to the way it was before the iteration.
- LoopIterationNoMatch1:
- if (--loop_iteration1 < 0)
- {
- // Unable to match the remainder of the expression after exhausting the loop.
- goto CharLoopBacktrack;
- }
- Utilities.StackPop(base.runstack!, ref stackpos, out pos, out loop_starting_pos1);
- UncaptureUntil(base.runstack![--stackpos]);
slice = inputSpan.Slice(pos);
- if (loop_iteration1 == 0)
+ int negativelookbehind__starting_pos = pos;
+ if (Utilities.s_hasTimeout)
{
- // All possible iterations have matched, but it's below the required minimum of 1. Fail the loop.
- goto CharLoopBacktrack;
+ base.CheckTimeout();
}
- LoopEnd1:
- stackpos = startingStackpos; // Ensure any remaining backtracking state is removed.
+ // Match '-' right-to-left.
+ if ((uint)(pos - 1) >= inputSpan.Length || inputSpan[pos - 1] != '-')
+ {
+ goto NegativeLookaroundMatch1;
+ }
+ pos--;
+
+ goto CharLoopBacktrack;
+
+ NegativeLookaroundMatch1:
+ pos = negativelookbehind__starting_pos;
+ slice = inputSpan.Slice(pos);
}
base.Capture(3, capture_starting_pos2, pos);
/// <summary>Whether <see cref="s_defaultTimeout"/> is non-infinite.</summary>
internal static readonly bool s_hasTimeout = s_defaultTimeout != Regex.InfiniteMatchTimeout;
- /// <summary>Pops 2 values from the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPop(int[] stack, ref int pos, out int arg0, out int arg1)
- {
- arg0 = stack[--pos];
- arg1 = stack[--pos];
- }
-
- /// <summary>Pushes 3 values onto the backtracking stack.</summary>
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- internal static void StackPush(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
- {
- // If there's space available for all 3 values, store them.
- int[] s = stack;
- int p = pos;
- if ((uint)(p + 2) < (uint)s.Length)
- {
- s[p] = arg0;
- s[p + 1] = arg1;
- s[p + 2] = arg2;
- pos += 3;
- return;
- }
-
- // Otherwise, resize the stack to make room and try again.
- WithResize(ref stack, ref pos, arg0, arg1, arg2);
-
- // <summary>Resize the backtracking stack array and push 3 values onto the stack.</summary>
- [MethodImpl(MethodImplOptions.NoInlining)]
- static void WithResize(ref int[] stack, ref int pos, int arg0, int arg1, int arg2)
- {
- Array.Resize(ref stack, (pos + 2) * 2);
- StackPush(ref stack, ref pos, arg0, arg1, arg2);
- }
- }
-
/// <summary>Supports searching for characters in or not in "-ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz".</summary>
internal static readonly SearchValues<char> s_ascii_200000FEFFFF87FEFFFF07 = SearchValues.Create("-ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz");
} For more diff examples, see https://gist.github.com/MihuBot/567aa328619b6cde02f2cd78c49018bd JIT assembly changes
For a list of JIT diff regressions, see Regressions.md Sample source code for further analysisconst string JsonPath = "RegexResults-1309.json";
if (!File.Exists(JsonPath))
{
await using var archiveStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/E2ukaNOA");
using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);
archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}
using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");
record KnownPattern(string Pattern, RegexOptions Options, int Count);
sealed class RegexEntry
{
public required KnownPattern Regex { get; set; }
public required string MainSource { get; set; }
public required string PrSource { get; set; }
public string? FullDiff { get; set; }
public string? ShortDiff { get; set; }
public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
} |
In addition to replacing the loop with an empty when it contains only a zero-width assertion and has a lower bound of 0, if it doesn't have a lower bound of 0, we can just remove the loop layer, replacing it with its child directly, since any additional iterations beyond one is redundant.