
Commit 248a365

jsondai authored and copybara-github committed
feat: GenAI SDK client(evals) - Implement predefined metrics support in Vertex GenAI Eval SDK

chore: GenAI SDK client(evals) - The eval generate_rubrics and run_inference methods now return the EvaluationDataset type. You can reference the underlying DataFrame with the eval_dataset_df property.

PiperOrigin-RevId: 799457792
1 parent eff6c58 commit 248a365
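For context, a minimal usage sketch of the behavior described above, based on the calls exercised in the tests in this commit. The client construction, DataFrame contents, and rubric-generation prompt are placeholder assumptions, not part of the diff.

# Sketch only: client setup and data below are illustrative placeholders.
import pandas as pd
import vertexai
from vertexai._genai import types

client = vertexai.Client(project="my-project", location="us-central1")  # assumed setup

prompts_df = pd.DataFrame(
    {
        "prompt": ["Explain machine learning in simple terms."],
        "response": ["Machine learning lets computers learn patterns from data."],
    }
)

# Predefined metrics can now be passed directly to evaluate().
eval_dataset = types.EvaluationDataset(
    eval_dataset_df=prompts_df,
    candidate_name="gemini-2.5-flash",
)
result = client.evals.evaluate(
    dataset=eval_dataset,
    metrics=[types.PrebuiltMetric.GENERAL_QUALITY],
)

# generate_rubrics (and run_inference) now return an EvaluationDataset;
# the underlying DataFrame is exposed via the eval_dataset_df property.
rubric_dataset = client.evals.generate_rubrics(
    src=prompts_df,
    prompt_template="<rubric generation prompt>",  # placeholder
    rubric_group_name="text_quality_rubrics",
)
rubrics_df = rubric_dataset.eval_dataset_df  # pandas DataFrame with a rubric_groups column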


11 files changed (+1072, -395 lines)

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# pylint: disable=protected-access,bad-continuation,missing-function-docstring

from tests.unit.vertexai.genai.replays import pytest_helper
from vertexai._genai import types
import pandas as pd


def test_evaluation_result(client):
    """Tests that evaluate() produces a correctly structured EvaluationResult."""
    prompts_df = pd.DataFrame(
        {
            "prompt": ["Explain the concept of machine learning in simple terms."],
            "response": [
                "Machine learning is a type of artificial intelligence that allows"
                " computers to learn from data without being explicitly programmed."
            ],
        }
    )

    eval_dataset = types.EvaluationDataset(
        eval_dataset_df=prompts_df,
        candidate_name="gemini-2.5-flash",
    )

    predefined_metrics = [
        types.PrebuiltMetric.GENERAL_QUALITY,
    ]

    evaluation_result = client.evals.evaluate(
        dataset=eval_dataset,
        metrics=predefined_metrics,
    )

    assert isinstance(evaluation_result, types.EvaluationResult)

    assert evaluation_result.summary_metrics is not None
    for summary in evaluation_result.summary_metrics:
        assert isinstance(summary, types.AggregatedMetricResult)
        assert summary.metric_name is not None
        assert summary.mean_score is not None

    assert evaluation_result.eval_case_results is not None
    for case_result in evaluation_result.eval_case_results:
        assert isinstance(case_result, types.EvalCaseResult)
        assert case_result.eval_case_index is not None
        assert case_result.response_candidate_results is not None


pytestmark = pytest_helper.setup(
    file=__file__,
    globals_for_file=globals(),
    test_method="evals.evaluate",
)

tests/unit/vertexai/genai/replays/test_public_generate_rubrics.py

Lines changed: 7 additions & 5 deletions
@@ -154,19 +154,21 @@ def test_public_method_generate_rubrics(client):
             ]
         }
     )
-    data_with_rubrics = client.evals.generate_rubrics(
+    eval_dataset = client.evals.generate_rubrics(
         src=prompts_df,
         prompt_template=_TEST_RUBRIC_GENERATION_PROMPT,
         rubric_group_name="text_quality_rubrics",
     )
+    eval_dataset_df = eval_dataset.eval_dataset_df

     # Assertions focus on the returned DataFrame
-    assert isinstance(data_with_rubrics, pd.DataFrame)
-    assert "rubric_groups" in data_with_rubrics.columns
-    assert len(data_with_rubrics) == 2
+    assert isinstance(eval_dataset, types.EvaluationDataset)
+    assert isinstance(eval_dataset_df, pd.DataFrame)
+    assert "rubric_groups" in eval_dataset_df.columns
+    assert len(eval_dataset_df) == 2

     # Check the structure of the first row's rubric_groups
-    first_rubric_group = data_with_rubrics["rubric_groups"][0]
+    first_rubric_group = eval_dataset_df["rubric_groups"][0]
     assert isinstance(first_rubric_group, dict)
     assert "text_quality_rubrics" in first_rubric_group
     assert isinstance(first_rubric_group["text_quality_rubrics"], list)

tests/unit/vertexai/genai/test_evals.py

Lines changed: 18 additions & 16 deletions
@@ -114,22 +114,24 @@ def mock_evaluate_instances_side_effect(*args, **kwargs):
     mock_upload_to_gcs.return_value = (
         "gs://mock-bucket/mock_path/evaluation_result_timestamp.json"
     )
-    mock_prebuilt_safety_metric = vertexai_genai_types.LLMMetric(
-        name="safety", prompt_template="Is this safe? {response}"
+    mock_prebuilt_fluency_metric = vertexai_genai_types.LLMMetric(
+        name="fluency", prompt_template="Is this fluent? {response}"
     )
-    mock_prebuilt_safety_metric._is_predefined = True
-    mock_prebuilt_safety_metric._config_source = "gs://mock-metrics/safety/v1.yaml"
-    mock_prebuilt_safety_metric._version = "v1"
+    mock_prebuilt_fluency_metric._is_predefined = True
+    mock_prebuilt_fluency_metric._config_source = (
+        "gs://mock-metrics/fluency/v1.yaml"
+    )
+    mock_prebuilt_fluency_metric._version = "v1"

-    mock_fetch_prebuilt_metric.return_value = mock_prebuilt_safety_metric
+    mock_fetch_prebuilt_metric.return_value = mock_prebuilt_fluency_metric

     yield {
         "mock_storage_client": mock_storage_client,
         "mock_bq_client": mock_bq_client,
         "mock_evaluate_instances": mock_evaluate_instances,
         "mock_upload_to_gcs": mock_upload_to_gcs,
         "mock_fetch_prebuilt_metric": mock_fetch_prebuilt_metric,
-        "mock_prebuilt_safety_metric": mock_prebuilt_safety_metric,
+        "mock_prebuilt_fluency_metric": mock_prebuilt_fluency_metric,
     }
@@ -3156,7 +3158,7 @@ def test_execute_evaluation_with_openai_schema(
     mock_loader_instance.load.return_value = mock_openai_raw_data

     with mock.patch.object(
-        _evals_metric_handlers.LLMMetricHandler, "process"
+        _evals_metric_handlers.LLMMetricHandler, "get_metric_result"
     ) as mock_llm_process:
         mock_llm_process.return_value = (
             vertexai_genai_types.EvalCaseMetricResult(
@@ -3233,7 +3235,7 @@ def test_llm_metric_default_aggregation_mixed_results(
     )

     with mock.patch(
-        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.process"
+        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.get_metric_result"
     ) as mock_llm_process:
         mock_llm_process.side_effect = [
             vertexai_genai_types.EvalCaseMetricResult(
@@ -3288,7 +3290,7 @@ def custom_agg_fn(results: list[vertexai_genai_types.EvalCaseMetricResult]):
     )

     with mock.patch(
-        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.process"
+        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.get_metric_result"
     ) as mock_llm_process:
         mock_llm_process.side_effect = [
             vertexai_genai_types.EvalCaseMetricResult(
@@ -3335,7 +3337,7 @@ def custom_agg_fn_error(
         aggregate_summary_fn=custom_agg_fn_error,
     )
     with mock.patch(
-        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.process"
+        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.get_metric_result"
     ) as mock_llm_process:
         mock_llm_process.side_effect = [
             vertexai_genai_types.EvalCaseMetricResult(
@@ -3379,7 +3381,7 @@ def custom_agg_fn_invalid_type(
         aggregate_summary_fn=custom_agg_fn_invalid_type,
     )
     with mock.patch(
-        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.process"
+        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.get_metric_result"
     ) as mock_llm_process:
         mock_llm_process.return_value = vertexai_genai_types.EvalCaseMetricResult(
             metric_name="invalid_type_fallback", score=0.8
@@ -3405,7 +3407,7 @@ def test_execute_evaluation_lazy_loaded_prebuilt_metric_instance(
     )

     lazy_metric_instance = _evals_utils.LazyLoadedPrebuiltMetric(
-        name="safety", version="v1"
+        name="fluency", version="v1"
     )

     result = _evals_common._execute_evaluation(
@@ -3421,7 +3423,7 @@ def test_execute_evaluation_lazy_loaded_prebuilt_metric_instance(
     assert result.evaluation_dataset == [input_dataset]
     assert len(result.summary_metrics) == 1
     summary_metric = result.summary_metrics[0]
-    assert summary_metric.metric_name == "safety"
+    assert summary_metric.metric_name == "fluency"
     assert summary_metric.mean_score == 0.9

 def test_execute_evaluation_prebuilt_metric_via_loader(
@@ -3434,7 +3436,7 @@ def test_execute_evaluation_prebuilt_metric_via_loader(
         eval_dataset_df=dataset_df
     )

-    prebuilt_metric = vertexai_genai_types.PrebuiltMetric.SAFETY
+    prebuilt_metric = vertexai_genai_types.PrebuiltMetric.FLUENCY

     result = _evals_common._execute_evaluation(
         api_client=mock_api_client_fixture,
@@ -3449,7 +3451,7 @@ def test_execute_evaluation_prebuilt_metric_via_loader(
     assert result.evaluation_dataset == [input_dataset]
     assert len(result.summary_metrics) == 1
     summary_metric = result.summary_metrics[0]
-    assert summary_metric.metric_name == "safety"
+    assert summary_metric.metric_name == "fluency"
     assert summary_metric.mean_score == 0.9

 def test_execute_evaluation_with_gcs_destination(

vertexai/_genai/_evals_common.py

Lines changed: 1 addition & 1 deletion
@@ -672,7 +672,7 @@ def _get_dataset_source(
 def _resolve_dataset_inputs(
     dataset: list[types.EvaluationDataset],
     dataset_schema: Optional[Literal["GEMINI", "FLATTEN", "OPENAI"]],
-    loader: _evals_utils.EvalDatasetLoader,
+    loader: "_evals_utils.EvalDatasetLoader",
 ) -> tuple[types.EvaluationDataset, int]:
     """Loads and processes single or multiple datasets for evaluation.

vertexai/_genai/_evals_constant.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Constants for evals module."""

SUPPORTED_PREDEFINED_METRICS = frozenset(
    {
        "general_quality_v1",
        "text_quality_v1",
        "instruction_following_v1",
        "grounding_v1",
        "safety_v1",
        "multi_turn_general_quality_v1",
        "multi_turn_text_quality_v1",
        "final_response_match_v2",
        "final_response_reference_free_v1",
        "partially_customizable_general_quality_v1",
        "fully_customizable_general_quality_v1",
    }
)
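The diff does not show where SUPPORTED_PREDEFINED_METRICS is consumed. As a purely hypothetical illustration of the kind of validation such a constant enables (the helper name and the "<name>_<version>" key format are assumptions, not part of this commit):

# Hypothetical sketch, not from this commit.
def _check_predefined_metric_supported(metric_name: str, version: str) -> None:
    """Raise if the requested predefined metric is not in the supported set."""
    key = f"{metric_name}_{version}"  # assumed "<name>_<version>" naming, e.g. "safety_v1"
    if key not in SUPPORTED_PREDEFINED_METRICS:
        raise ValueError(
            f"Unsupported predefined metric: {key!r}. "
            f"Supported: {sorted(SUPPORTED_PREDEFINED_METRICS)}"
        )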

vertexai/_genai/_evals_data_converters.py

Lines changed: 21 additions & 0 deletions
@@ -207,6 +207,7 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
         response_data = item.pop("response", None)
         reference_data = item.pop("reference", None)
         system_instruction_data = item.pop("instruction", None)
+        rubric_groups_data = item.pop("rubric_groups", None)

         if not response_data:
             raise ValueError(
@@ -287,13 +288,33 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
         elif isinstance(system_instruction_data, genai_types.Content):
             system_instruction = system_instruction_data

+        rubric_groups: Optional[dict[str, types.RubricGroup]] = None
+        if rubric_groups_data:
+            if isinstance(rubric_groups_data, dict):
+                rubric_groups = {}
+                for key, value in rubric_groups_data.items():
+                    if isinstance(value, dict):
+                        rubric_groups[key] = types.RubricGroup.model_validate(value)
+                    elif isinstance(value, types.RubricGroup):
+                        rubric_groups[key] = value
+                    else:
+                        logger.warning(
+                            f"Invalid type for rubric group '{key}' in case {i}."
+                            " Expected dict or RubricGroup."
+                        )
+            else:
+                logger.warning(
+                    f"Invalid type for rubric_groups in case {i}. Expected dict."
+                )
+
         eval_case = types.EvalCase(
             eval_case_id=eval_case_id,
             prompt=prompt,
             responses=responses,
             reference=reference,
             conversation_history=conversation_history,
             system_instruction=system_instruction,
+            rubric_groups=rubric_groups,
             **item,  # Pass remaining columns as extra fields to EvalCase.
             # They can be used for custom metric prompt templates.
         )
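The added branch accepts each rubric group either as an existing types.RubricGroup instance or as a plain dict validated via model_validate, and logs a warning otherwise. A self-contained sketch of that dict-or-model pattern, using a stand-in pydantic model because the real RubricGroup fields are not shown in this diff:

# Sketch of the dict-or-model validation pattern; StubRubricGroup is a stand-in.
from typing import Optional
import logging

from pydantic import BaseModel

logger = logging.getLogger(__name__)


class StubRubricGroup(BaseModel):
    """Stand-in for types.RubricGroup; the field name here is illustrative only."""
    rubrics: list[str] = []


def coerce_rubric_groups(raw: object, case_index: int) -> Optional[dict[str, StubRubricGroup]]:
    """Mirrors the converter's logic: accept dicts or model instances, warn otherwise."""
    if not raw:
        return None
    if not isinstance(raw, dict):
        logger.warning("Invalid type for rubric_groups in case %d. Expected dict.", case_index)
        return None
    groups: dict[str, StubRubricGroup] = {}
    for key, value in raw.items():
        if isinstance(value, dict):
            # Plain dicts are validated into the model.
            groups[key] = StubRubricGroup.model_validate(value)
        elif isinstance(value, StubRubricGroup):
            # Already-constructed models pass through unchanged.
            groups[key] = value
        else:
            logger.warning(
                "Invalid type for rubric group %r in case %d. Expected dict or RubricGroup.",
                key, case_index,
            )
    return groups


# Example: a dict value is validated into the model.
print(coerce_rubric_groups({"text_quality_rubrics": {"rubrics": ["is concise"]}}, 0))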
