Skip to content

Commit 44eaa2e

Browse files
Remove the abstraction for token counting from the main evaluation API (#6320)
This change is being made because there is still some uncertainty around what a general-purpose token counting abstraction (one that supports all kinds of future models and all kinds of input modalities) should look like. We do not want to bake in an API that only supports text-based inputs for the models and use cases that are prevalent today, since changing this API after we release a stable version of the evaluation APIs would potentially be a breaking change. We can always reintroduce token counting support in a non-breaking fashion in the future if and when there is more clarity on what a general-purpose token counting abstraction should look like, or if and when such an abstraction is introduced in a lower layer (Microsoft.Extensions.AI). In the meantime, callers can still use the `Microsoft.ML.Tokenizers` library directly to count tokens in text-based content and trim down the conversation history before calling `EvaluateAsync()` if needed. Fixes #6234
1 parent e5b6b02 commit 44eaa2e

26 files changed

+71
-336
lines changed

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/ChatConversationEvaluator.cs

Lines changed: 16 additions & 183 deletions
Original file line numberDiff line numberDiff line change
@@ -55,100 +55,8 @@ public virtual async ValueTask<EvaluationResult> EvaluateAsync(
5555
return result;
5656
}
5757

58-
(ChatMessage? userRequest, List<ChatMessage> history) = GetUserRequestAndHistory(messages);
59-
60-
int inputTokenLimit = 0;
61-
int ignoredMessagesCount = 0;
62-
63-
if (chatConfiguration.TokenCounter is not null)
64-
{
65-
IEvaluationTokenCounter tokenCounter = chatConfiguration.TokenCounter;
66-
inputTokenLimit = tokenCounter.InputTokenLimit;
67-
int tokenBudget = inputTokenLimit;
68-
69-
void OnTokenBudgetExceeded()
70-
{
71-
EvaluationDiagnostic tokenBudgetExceeded =
72-
EvaluationDiagnostic.Error(
73-
$"Evaluation failed because the specified limit of {inputTokenLimit} input tokens was exceeded.");
74-
75-
result.AddDiagnosticsToAllMetrics(tokenBudgetExceeded);
76-
}
77-
78-
if (!string.IsNullOrWhiteSpace(SystemPrompt))
79-
{
80-
tokenBudget -= tokenCounter.CountTokens(SystemPrompt!);
81-
if (tokenBudget < 0)
82-
{
83-
OnTokenBudgetExceeded();
84-
return result;
85-
}
86-
}
87-
88-
string baseEvaluationPrompt =
89-
await RenderEvaluationPromptAsync(
90-
userRequest,
91-
modelResponse,
92-
includedHistory: [],
93-
additionalContext,
94-
cancellationToken).ConfigureAwait(false);
95-
96-
tokenBudget -= tokenCounter.CountTokens(baseEvaluationPrompt);
97-
if (tokenBudget < 0)
98-
{
99-
OnTokenBudgetExceeded();
100-
return result;
101-
}
102-
103-
if (history.Count > 0 && !IgnoresHistory)
104-
{
105-
if (history.Count == 1)
106-
{
107-
(bool canRender, tokenBudget) =
108-
await CanRenderAsync(
109-
history[0],
110-
tokenBudget,
111-
chatConfiguration,
112-
cancellationToken).ConfigureAwait(false);
113-
114-
if (!canRender)
115-
{
116-
ignoredMessagesCount = 1;
117-
history = [];
118-
}
119-
}
120-
else
121-
{
122-
int totalMessagesCount = history.Count;
123-
int includedMessagesCount = 0;
124-
125-
history.Reverse();
126-
127-
foreach (ChatMessage message in history)
128-
{
129-
cancellationToken.ThrowIfCancellationRequested();
130-
131-
(bool canRender, tokenBudget) =
132-
await CanRenderAsync(
133-
message,
134-
tokenBudget,
135-
chatConfiguration,
136-
cancellationToken).ConfigureAwait(false);
137-
138-
if (!canRender)
139-
{
140-
ignoredMessagesCount = totalMessagesCount - includedMessagesCount;
141-
history.RemoveRange(index: includedMessagesCount, count: ignoredMessagesCount);
142-
break;
143-
}
144-
145-
includedMessagesCount++;
146-
}
147-
148-
history.Reverse();
149-
}
150-
}
151-
}
58+
(ChatMessage? userRequest, List<ChatMessage> conversationHistory) =
59+
GetUserRequestAndConversationHistory(messages);
15260

15361
var evaluationMessages = new List<ChatMessage>();
15462
if (!string.IsNullOrWhiteSpace(SystemPrompt))
@@ -160,7 +68,7 @@ await CanRenderAsync(
16068
await RenderEvaluationPromptAsync(
16169
userRequest,
16270
modelResponse,
163-
includedHistory: history,
71+
conversationHistory,
16472
additionalContext,
16573
cancellationToken).ConfigureAwait(false);
16674

@@ -172,84 +80,9 @@ await PerformEvaluationAsync(
17280
result,
17381
cancellationToken).ConfigureAwait(false);
17482

175-
if (inputTokenLimit > 0 && ignoredMessagesCount > 0)
176-
{
177-
#pragma warning disable S103 // Lines should not be too long
178-
result.AddDiagnosticsToAllMetrics(
179-
EvaluationDiagnostic.Warning(
180-
$"The evaluation may be inconclusive because the oldest {ignoredMessagesCount} messages in the supplied conversation history were ignored in order to stay under the specified limit of {inputTokenLimit} input tokens."));
181-
#pragma warning restore S103
182-
}
183-
18483
return result;
18584
}
18685

187-
/// <summary>
188-
/// Determines if there is sufficient <paramref name="tokenBudget"/> remaining to render the
189-
/// supplied <paramref name="message"/> as part of the evaluation prompt that this <see cref="IEvaluator"/> uses.
190-
/// </summary>
191-
/// <param name="message">
192-
/// A message that is part of the conversation history for the response being evaluated and that is to be rendered
193-
/// as part of the evaluation prompt.
194-
/// </param>
195-
/// <param name="tokenBudget">
196-
/// The number of tokens available for the rendering additional content as part of the evaluation prompt.
197-
/// </param>
198-
/// <param name="chatConfiguration">
199-
/// A <see cref="ChatConfiguration"/> that specifies the <see cref="IChatClient"/> and the
200-
/// <see cref="IEvaluationTokenCounter"/> that this <see cref="IEvaluator"/> uses to perform the evaluation.
201-
/// </param>
202-
/// <param name="cancellationToken">A <see cref="CancellationToken"/> that can cancel the operation.</param>
203-
/// <returns>
204-
/// A tuple containing a <see langword="bool"/> indicating whether there is sufficient
205-
/// <paramref name="tokenBudget"/> remaining to render the supplied <paramref name="message"/> as part of the
206-
/// evaluation prompt, and an <see langword="int"/> containing the remaining token budget that would be available
207-
/// once this <paramref name="message"/> is rendered.
208-
/// </returns>
209-
protected virtual ValueTask<(bool canRender, int remainingTokenBudget)> CanRenderAsync(
210-
ChatMessage message,
211-
int tokenBudget,
212-
ChatConfiguration chatConfiguration,
213-
CancellationToken cancellationToken)
214-
{
215-
_ = Throw.IfNull(message);
216-
_ = Throw.IfNull(chatConfiguration);
217-
218-
IEvaluationTokenCounter? tokenCounter = chatConfiguration.TokenCounter;
219-
if (tokenCounter is null)
220-
{
221-
return new ValueTask<(bool, int)>((true, tokenBudget));
222-
}
223-
224-
string? author = message.AuthorName;
225-
string role = message.Role.Value;
226-
string content = message.Text ?? string.Empty;
227-
228-
int tokenCount =
229-
string.IsNullOrWhiteSpace(author)
230-
? tokenCounter.CountTokens("[") +
231-
tokenCounter.CountTokens(role) +
232-
tokenCounter.CountTokens("] ") +
233-
tokenCounter.CountTokens(content) +
234-
tokenCounter.CountTokens("\n")
235-
: tokenCounter.CountTokens("[") +
236-
tokenCounter.CountTokens(author!) +
237-
tokenCounter.CountTokens(" (") +
238-
tokenCounter.CountTokens(role) +
239-
tokenCounter.CountTokens(")] ") +
240-
tokenCounter.CountTokens(content) +
241-
tokenCounter.CountTokens("\n");
242-
243-
if (tokenCount > tokenBudget)
244-
{
245-
return new ValueTask<(bool, int)>((false, tokenBudget));
246-
}
247-
else
248-
{
249-
return new ValueTask<(bool, int)>((true, tokenBudget - tokenCount));
250-
}
251-
}
252-
25386
/// <summary>
25487
/// Renders the supplied <paramref name="response"/> to a string that can be included as part of the evaluation
25588
/// prompt that this <see cref="IEvaluator"/> uses.
@@ -313,21 +146,21 @@ protected virtual ValueTask<string> RenderAsync(ChatMessage message, Cancellatio
313146
/// The request that produced the <paramref name="modelResponse"/> that is to be evaluated.
314147
/// </param>
315148
/// <param name="modelResponse">The response that is to be evaluated.</param>
316-
/// <param name="includedHistory">
149+
/// <param name="conversationHistory">
317150
/// The conversation history (excluding the <paramref name="userRequest"/> and <paramref name="modelResponse"/>)
318151
/// that is to be included as part of the evaluation prompt.
319152
/// </param>
320153
/// <param name="additionalContext">
321154
/// Additional contextual information (beyond that which is available in the <paramref name="userRequest"/> and
322-
/// <paramref name="includedHistory"/>) that this <see cref="IEvaluator"/> may need to accurately evaluate the
155+
/// <paramref name="conversationHistory"/>) that this <see cref="IEvaluator"/> may need to accurately evaluate the
323156
/// supplied <paramref name="modelResponse"/>.
324157
/// </param>
325158
/// <param name="cancellationToken">A <see cref="CancellationToken"/> that can cancel the operation.</param>
326159
/// <returns>The evaluation prompt.</returns>
327160
protected abstract ValueTask<string> RenderEvaluationPromptAsync(
328161
ChatMessage? userRequest,
329162
ChatResponse modelResponse,
330-
IEnumerable<ChatMessage>? includedHistory,
163+
IEnumerable<ChatMessage>? conversationHistory,
331164
IEnumerable<EvaluationContext>? additionalContext,
332165
CancellationToken cancellationToken);
333166

@@ -351,8 +184,8 @@ protected abstract ValueTask<string> RenderEvaluationPromptAsync(
351184
/// <see cref="EvaluationMetric"/>s in the supplied <paramref name="result"/>.
352185
/// </summary>
353186
/// <param name="chatConfiguration">
354-
/// A <see cref="ChatConfiguration"/> that specifies the <see cref="IChatClient"/> and the
355-
/// <see cref="IEvaluationTokenCounter"/> that this <see cref="IEvaluator"/> uses to perform the evaluation.
187+
/// A <see cref="ChatConfiguration"/> that specifies the <see cref="IChatClient"/> that should be used if one or
188+
/// more composed <see cref="IEvaluator"/>s use an AI model to perform evaluation.
356189
/// </param>
357190
/// <param name="evaluationMessages">
358191
/// The set of messages that are to be sent to the supplied <see cref="ChatConfiguration.ChatClient"/> to perform
@@ -370,11 +203,11 @@ protected abstract ValueTask PerformEvaluationAsync(
370203
EvaluationResult result,
371204
CancellationToken cancellationToken);
372205

373-
private (ChatMessage? userRequest, List<ChatMessage> history) GetUserRequestAndHistory(
206+
private (ChatMessage? userRequest, List<ChatMessage> conversationHistory) GetUserRequestAndConversationHistory(
374207
IEnumerable<ChatMessage> messages)
375208
{
376209
ChatMessage? userRequest = null;
377-
List<ChatMessage> history;
210+
List<ChatMessage> conversationHistory;
378211

379212
if (IgnoresHistory)
380213
{
@@ -383,22 +216,22 @@ protected abstract ValueTask PerformEvaluationAsync(
383216
? lastMessage
384217
: null;
385218

386-
history = [];
219+
conversationHistory = [];
387220
}
388221
else
389222
{
390-
history = [.. messages];
391-
int lastMessageIndex = history.Count - 1;
223+
conversationHistory = [.. messages];
224+
int lastMessageIndex = conversationHistory.Count - 1;
392225

393226
if (lastMessageIndex >= 0 &&
394-
history[lastMessageIndex] is ChatMessage lastMessage &&
227+
conversationHistory[lastMessageIndex] is ChatMessage lastMessage &&
395228
lastMessage.Role == ChatRole.User)
396229
{
397230
userRequest = lastMessage;
398-
history.RemoveAt(lastMessageIndex);
231+
conversationHistory.RemoveAt(lastMessageIndex);
399232
}
400233
}
401234

402-
return (userRequest, history);
235+
return (userRequest, conversationHistory);
403236
}
404237
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/CoherenceEvaluator.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ public sealed class CoherenceEvaluator : SingleNumericMetricEvaluator
5050
protected override async ValueTask<string> RenderEvaluationPromptAsync(
5151
ChatMessage? userRequest,
5252
ChatResponse modelResponse,
53-
IEnumerable<ChatMessage>? includedHistory,
53+
IEnumerable<ChatMessage>? conversationHistory,
5454
IEnumerable<EvaluationContext>? additionalContext,
5555
CancellationToken cancellationToken)
5656
{

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/EquivalenceEvaluator.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ await base.EvaluateAsync(
7575
protected override async ValueTask<string> RenderEvaluationPromptAsync(
7676
ChatMessage? userRequest,
7777
ChatResponse modelResponse,
78-
IEnumerable<ChatMessage>? includedHistory,
78+
IEnumerable<ChatMessage>? conversationHistory,
7979
IEnumerable<EvaluationContext>? additionalContext,
8080
CancellationToken cancellationToken)
8181
{

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/FluencyEvaluator.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ public sealed class FluencyEvaluator : SingleNumericMetricEvaluator
4949
protected override async ValueTask<string> RenderEvaluationPromptAsync(
5050
ChatMessage? userRequest,
5151
ChatResponse modelResponse,
52-
IEnumerable<ChatMessage>? includedHistory,
52+
IEnumerable<ChatMessage>? conversationHistory,
5353
IEnumerable<EvaluationContext>? additionalContext,
5454
CancellationToken cancellationToken)
5555
{

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/GroundednessEvaluator.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ await base.EvaluateAsync(
7777
protected override async ValueTask<string> RenderEvaluationPromptAsync(
7878
ChatMessage? userRequest,
7979
ChatResponse modelResponse,
80-
IEnumerable<ChatMessage>? includedHistory,
80+
IEnumerable<ChatMessage>? conversationHistory,
8181
IEnumerable<EvaluationContext>? additionalContext,
8282
CancellationToken cancellationToken)
8383
{
@@ -99,9 +99,9 @@ userRequest is not null
9999
_ = builder.AppendLine();
100100
}
101101

102-
if (includedHistory is not null)
102+
if (conversationHistory is not null)
103103
{
104-
foreach (ChatMessage message in includedHistory)
104+
foreach (ChatMessage message in conversationHistory)
105105
{
106106
_ = builder.Append(await RenderAsync(message, cancellationToken).ConfigureAwait(false));
107107
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/RelevanceTruthAndCompletenessEvaluator.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ protected override EvaluationResult InitializeResult()
9393
protected override async ValueTask<string> RenderEvaluationPromptAsync(
9494
ChatMessage? userRequest,
9595
ChatResponse modelResponse,
96-
IEnumerable<ChatMessage>? includedHistory,
96+
IEnumerable<ChatMessage>? conversationHistory,
9797
IEnumerable<EvaluationContext>? additionalContext,
9898
CancellationToken cancellationToken)
9999
{
@@ -107,9 +107,9 @@ userRequest is not null
107107
: string.Empty;
108108

109109
var builder = new StringBuilder();
110-
if (includedHistory is not null)
110+
if (conversationHistory is not null)
111111
{
112-
foreach (ChatMessage message in includedHistory)
112+
foreach (ChatMessage message in conversationHistory)
113113
{
114114
_ = builder.Append(await RenderAsync(message, cancellationToken).ConfigureAwait(false));
115115
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Quality/Utilities/JsonOutputFixer.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ internal static ReadOnlySpan<char> TrimMarkdownDelimiters(string json)
2424
// Trim 'json' marker from markdown if it exists.
2525
const string JsonMarker = "json";
2626
int markerLength = JsonMarker.Length;
27-
if (trimmed.Length > markerLength && trimmed[0..markerLength].SequenceEqual(JsonMarker.AsSpan()))
27+
if (trimmed.Length > markerLength && trimmed.Slice(0, markerLength).SequenceEqual(JsonMarker.AsSpan()))
2828
{
2929
trimmed = trimmed.Slice(markerLength);
3030
}

src/Libraries/Microsoft.Extensions.AI.Evaluation.Reporting.Azure/Storage/AzureStorageReportingConfiguration.cs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,9 @@ public static class AzureStorageReportingConfiguration
2929
/// survive in the cache before they are considered expired and evicted.
3030
/// </param>
3131
/// <param name="chatConfiguration">
32-
/// A <see cref="ChatConfiguration"/> that specifies the <see cref="IChatClient"/> and the
33-
/// <see cref="IEvaluationTokenCounter"/> that are used by AI-based <paramref name="evaluators"/> included in the
34-
/// returned <see cref="ReportingConfiguration"/>. Can be omitted if none of the included
35-
/// <paramref name="evaluators"/> are AI-based.
32+
/// A <see cref="ChatConfiguration"/> that specifies the <see cref="IChatClient"/> that is used by AI-based
33+
/// <paramref name="evaluators"/> included in the returned <see cref="ReportingConfiguration"/>. Can be omitted if
34+
/// none of the included <paramref name="evaluators"/> are AI-based.
3635
/// </param>
3736
/// <param name="enableResponseCaching">
3837
/// <see langword="true"/> to enable caching of AI responses; <see langword="false"/> otherwise.

0 commit comments

Comments
 (0)