
Commit 248a365

jsondai authored and copybara-github committed
feat: GenAI SDK client(evals) - Implement predefined metrics support in Vertex GenAI Eval SDK

chore: GenAI SDK client(evals) - The eval generate_rubrics and run_inference methods now return the EvaluationDataset type. You can reference the underlying DataFrame with the eval_dataset_df property.

PiperOrigin-RevId: 799457792
1 parent eff6c58 commit 248a365
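For context, a minimal usage sketch of the behavior described above, based on the calls exercised in the tests in this commit. The client construction, DataFrame contents, and rubric-generation prompt are placeholder assumptions, not part of the diff.

# Sketch only: client setup and data below are illustrative placeholders.
import pandas as pd
import vertexai
from vertexai._genai import types

client = vertexai.Client(project="my-project", location="us-central1")  # assumed setup

prompts_df = pd.DataFrame(
    {
        "prompt": ["Explain machine learning in simple terms."],
        "response": ["Machine learning lets computers learn patterns from data."],
    }
)

# Predefined metrics can now be passed directly to evaluate().
eval_dataset = types.EvaluationDataset(
    eval_dataset_df=prompts_df,
    candidate_name="gemini-2.5-flash",
)
result = client.evals.evaluate(
    dataset=eval_dataset,
    metrics=[types.PrebuiltMetric.GENERAL_QUALITY],
)

# generate_rubrics (and run_inference) now return an EvaluationDataset;
# the underlying DataFrame is exposed via the eval_dataset_df property.
rubric_dataset = client.evals.generate_rubrics(
    src=prompts_df,
    prompt_template="<rubric generation prompt>",  # placeholder
    rubric_group_name="text_quality_rubrics",
)
rubrics_df = rubric_dataset.eval_dataset_df  # pandas DataFrame with a rubric_groups column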


11 files changed (+1072, -395 lines)

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# pylint: disable=protected-access,bad-continuation,missing-function-docstring

from tests.unit.vertexai.genai.replays import pytest_helper
from vertexai._genai import types
import pandas as pd


def test_evaluation_result(client):
    """Tests that evaluate() produces a correctly structured EvaluationResult."""
    prompts_df = pd.DataFrame(
        {
            "prompt": ["Explain the concept of machine learning in simple terms."],
            "response": [
                "Machine learning is a type of artificial intelligence that allows"
                " computers to learn from data without being explicitly programmed."
            ],
        }
    )

    eval_dataset = types.EvaluationDataset(
        eval_dataset_df=prompts_df,
        candidate_name="gemini-2.5-flash",
    )

    predefined_metrics = [
        types.PrebuiltMetric.GENERAL_QUALITY,
    ]

    evaluation_result = client.evals.evaluate(
        dataset=eval_dataset,
        metrics=predefined_metrics,
    )

    assert isinstance(evaluation_result, types.EvaluationResult)

    assert evaluation_result.summary_metrics is not None
    for summary in evaluation_result.summary_metrics:
        assert isinstance(summary, types.AggregatedMetricResult)
        assert summary.metric_name is not None
        assert summary.mean_score is not None

    assert evaluation_result.eval_case_results is not None
    for case_result in evaluation_result.eval_case_results:
        assert isinstance(case_result, types.EvalCaseResult)
        assert case_result.eval_case_index is not None
        assert case_result.response_candidate_results is not None


pytestmark = pytest_helper.setup(
    file=__file__,
    globals_for_file=globals(),
    test_method="evals.evaluate",
)

tests/unit/vertexai/genai/replays/test_public_generate_rubrics.py

Lines changed: 7 additions & 5 deletions
@@ -154,19 +154,21 @@ def test_public_method_generate_rubrics(client):
             ]
         }
     )
-    data_with_rubrics = client.evals.generate_rubrics(
+    eval_dataset = client.evals.generate_rubrics(
         src=prompts_df,
         prompt_template=_TEST_RUBRIC_GENERATION_PROMPT,
         rubric_group_name="text_quality_rubrics",
     )
+    eval_dataset_df = eval_dataset.eval_dataset_df

     # Assertions focus on the returned DataFrame
-    assert isinstance(data_with_rubrics, pd.DataFrame)
-    assert "rubric_groups" in data_with_rubrics.columns
-    assert len(data_with_rubrics) == 2
+    assert isinstance(eval_dataset, types.EvaluationDataset)
+    assert isinstance(eval_dataset_df, pd.DataFrame)
+    assert "rubric_groups" in eval_dataset_df.columns
+    assert len(eval_dataset_df) == 2

     # Check the structure of the first row's rubric_groups
-    first_rubric_group = data_with_rubrics["rubric_groups"][0]
+    first_rubric_group = eval_dataset_df["rubric_groups"][0]
     assert isinstance(first_rubric_group, dict)
     assert "text_quality_rubrics" in first_rubric_group
     assert isinstance(first_rubric_group["text_quality_rubrics"], list)

tests/unit/vertexai/genai/test_evals.py

Lines changed: 18 additions & 16 deletions
@@ -114,22 +114,24 @@ def mock_evaluate_instances_side_effect(*args, **kwargs):
     mock_upload_to_gcs.return_value = (
         "gs://mock-bucket/mock_path/evaluation_result_timestamp.json"
     )
-    mock_prebuilt_safety_metric = vertexai_genai_types.LLMMetric(
-        name="safety", prompt_template="Is this safe? {response}"
+    mock_prebuilt_fluency_metric = vertexai_genai_types.LLMMetric(
+        name="fluency", prompt_template="Is this fluent? {response}"
     )
-    mock_prebuilt_safety_metric._is_predefined = True
-    mock_prebuilt_safety_metric._config_source = "gs://mock-metrics/safety/v1.yaml"
-    mock_prebuilt_safety_metric._version = "v1"
+    mock_prebuilt_fluency_metric._is_predefined = True
+    mock_prebuilt_fluency_metric._config_source = (
+        "gs://mock-metrics/fluency/v1.yaml"
+    )
+    mock_prebuilt_fluency_metric._version = "v1"

-    mock_fetch_prebuilt_metric.return_value = mock_prebuilt_safety_metric
+    mock_fetch_prebuilt_metric.return_value = mock_prebuilt_fluency_metric

     yield {
         "mock_storage_client": mock_storage_client,
         "mock_bq_client": mock_bq_client,
         "mock_evaluate_instances": mock_evaluate_instances,
         "mock_upload_to_gcs": mock_upload_to_gcs,
         "mock_fetch_prebuilt_metric": mock_fetch_prebuilt_metric,
-        "mock_prebuilt_safety_metric": mock_prebuilt_safety_metric,
+        "mock_prebuilt_fluency_metric": mock_prebuilt_fluency_metric,
     }
@@ -3156,7 +3158,7 @@ def test_execute_evaluation_with_openai_schema(
     mock_loader_instance.load.return_value = mock_openai_raw_data

     with mock.patch.object(
-        _evals_metric_handlers.LLMMetricHandler, "process"
+        _evals_metric_handlers.LLMMetricHandler, "get_metric_result"
     ) as mock_llm_process:
         mock_llm_process.return_value = (
             vertexai_genai_types.EvalCaseMetricResult(
@@ -3233,7 +3235,7 @@ def test_llm_metric_default_aggregation_mixed_results(
     )

     with mock.patch(
-        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.process"
+        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.get_metric_result"
     ) as mock_llm_process:
         mock_llm_process.side_effect = [
             vertexai_genai_types.EvalCaseMetricResult(
@@ -3288,7 +3290,7 @@ def custom_agg_fn(results: list[vertexai_genai_types.EvalCaseMetricResult]):
     )

     with mock.patch(
-        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.process"
+        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.get_metric_result"
     ) as mock_llm_process:
         mock_llm_process.side_effect = [
             vertexai_genai_types.EvalCaseMetricResult(
@@ -3335,7 +3337,7 @@ def custom_agg_fn_error(
         aggregate_summary_fn=custom_agg_fn_error,
     )
     with mock.patch(
-        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.process"
+        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.get_metric_result"
     ) as mock_llm_process:
         mock_llm_process.side_effect = [
             vertexai_genai_types.EvalCaseMetricResult(
@@ -3379,7 +3381,7 @@ def custom_agg_fn_invalid_type(
         aggregate_summary_fn=custom_agg_fn_invalid_type,
     )
     with mock.patch(
-        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.process"
+        "vertexai._genai._evals_metric_handlers.LLMMetricHandler.get_metric_result"
     ) as mock_llm_process:
         mock_llm_process.return_value = vertexai_genai_types.EvalCaseMetricResult(
             metric_name="invalid_type_fallback", score=0.8
@@ -3405,7 +3407,7 @@ def test_execute_evaluation_lazy_loaded_prebuilt_metric_instance(
     )

     lazy_metric_instance = _evals_utils.LazyLoadedPrebuiltMetric(
-        name="safety", version="v1"
+        name="fluency", version="v1"
     )

     result = _evals_common._execute_evaluation(
@@ -3421,7 +3423,7 @@ def test_execute_evaluation_lazy_loaded_prebuilt_metric_instance(
     assert result.evaluation_dataset == [input_dataset]
     assert len(result.summary_metrics) == 1
     summary_metric = result.summary_metrics[0]
-    assert summary_metric.metric_name == "safety"
+    assert summary_metric.metric_name == "fluency"
     assert summary_metric.mean_score == 0.9

 def test_execute_evaluation_prebuilt_metric_via_loader(
@@ -3434,7 +3436,7 @@ def test_execute_evaluation_prebuilt_metric_via_loader(
         eval_dataset_df=dataset_df
     )

-    prebuilt_metric = vertexai_genai_types.PrebuiltMetric.SAFETY
+    prebuilt_metric = vertexai_genai_types.PrebuiltMetric.FLUENCY

     result = _evals_common._execute_evaluation(
         api_client=mock_api_client_fixture,
@@ -3449,7 +3451,7 @@ def test_execute_evaluation_prebuilt_metric_via_loader(
     assert result.evaluation_dataset == [input_dataset]
     assert len(result.summary_metrics) == 1
     summary_metric = result.summary_metrics[0]
-    assert summary_metric.metric_name == "safety"
+    assert summary_metric.metric_name == "fluency"
     assert summary_metric.mean_score == 0.9

 def test_execute_evaluation_with_gcs_destination(

vertexai/_genai/_evals_common.py

Lines changed: 1 addition & 1 deletion
@@ -672,7 +672,7 @@ def _get_dataset_source(
 def _resolve_dataset_inputs(
     dataset: list[types.EvaluationDataset],
     dataset_schema: Optional[Literal["GEMINI", "FLATTEN", "OPENAI"]],
-    loader: _evals_utils.EvalDatasetLoader,
+    loader: "_evals_utils.EvalDatasetLoader",
 ) -> tuple[types.EvaluationDataset, int]:
     """Loads and processes single or multiple datasets for evaluation.

vertexai/_genai/_evals_constant.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Constants for evals module."""

SUPPORTED_PREDEFINED_METRICS = frozenset(
    {
        "general_quality_v1",
        "text_quality_v1",
        "instruction_following_v1",
        "grounding_v1",
        "safety_v1",
        "multi_turn_general_quality_v1",
        "multi_turn_text_quality_v1",
        "final_response_match_v2",
        "final_response_reference_free_v1",
        "partially_customizable_general_quality_v1",
        "fully_customizable_general_quality_v1",
    }
)
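The diff does not show where SUPPORTED_PREDEFINED_METRICS is consumed. As a purely hypothetical illustration of the kind of validation such a constant enables (the helper name and the "<name>_<version>" key format are assumptions, not part of this commit):

# Hypothetical sketch, not from this commit.
def _check_predefined_metric_supported(metric_name: str, version: str) -> None:
    """Raise if the requested predefined metric is not in the supported set."""
    key = f"{metric_name}_{version}"  # assumed "<name>_<version>" naming, e.g. "safety_v1"
    if key not in SUPPORTED_PREDEFINED_METRICS:
        raise ValueError(
            f"Unsupported predefined metric: {key!r}. "
            f"Supported: {sorted(SUPPORTED_PREDEFINED_METRICS)}"
        )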

vertexai/_genai/_evals_data_converters.py

Lines changed: 21 additions & 0 deletions
@@ -207,6 +207,7 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
         response_data = item.pop("response", None)
         reference_data = item.pop("reference", None)
         system_instruction_data = item.pop("instruction", None)
+        rubric_groups_data = item.pop("rubric_groups", None)

         if not response_data:
             raise ValueError(
@@ -287,13 +288,33 @@ def convert(self, raw_data: list[dict[str, Any]]) -> types.EvaluationDataset:
         elif isinstance(system_instruction_data, genai_types.Content):
             system_instruction = system_instruction_data

+        rubric_groups: Optional[dict[str, types.RubricGroup]] = None
+        if rubric_groups_data:
+            if isinstance(rubric_groups_data, dict):
+                rubric_groups = {}
+                for key, value in rubric_groups_data.items():
+                    if isinstance(value, dict):
+                        rubric_groups[key] = types.RubricGroup.model_validate(value)
+                    elif isinstance(value, types.RubricGroup):
+                        rubric_groups[key] = value
+                    else:
+                        logger.warning(
+                            f"Invalid type for rubric group '{key}' in case {i}."
+                            " Expected dict or RubricGroup."
+                        )
+            else:
+                logger.warning(
+                    f"Invalid type for rubric_groups in case {i}. Expected dict."
+                )
+
         eval_case = types.EvalCase(
             eval_case_id=eval_case_id,
             prompt=prompt,
             responses=responses,
             reference=reference,
             conversation_history=conversation_history,
             system_instruction=system_instruction,
+            rubric_groups=rubric_groups,
             **item,  # Pass remaining columns as extra fields to EvalCase.
             # They can be used for custom metric prompt templates.
         )
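The added branch accepts each rubric group either as an existing types.RubricGroup instance or as a plain dict validated via model_validate, and logs a warning otherwise. A self-contained sketch of that dict-or-model pattern, using a stand-in pydantic model because the real RubricGroup fields are not shown in this diff:

# Sketch of the dict-or-model validation pattern; StubRubricGroup is a stand-in.
from typing import Optional
import logging

from pydantic import BaseModel

logger = logging.getLogger(__name__)


class StubRubricGroup(BaseModel):
    """Stand-in for types.RubricGroup; the field name here is illustrative only."""
    rubrics: list[str] = []


def coerce_rubric_groups(raw: object, case_index: int) -> Optional[dict[str, StubRubricGroup]]:
    """Mirrors the converter's logic: accept dicts or model instances, warn otherwise."""
    if not raw:
        return None
    if not isinstance(raw, dict):
        logger.warning("Invalid type for rubric_groups in case %d. Expected dict.", case_index)
        return None
    groups: dict[str, StubRubricGroup] = {}
    for key, value in raw.items():
        if isinstance(value, dict):
            # Plain dicts are validated into the model.
            groups[key] = StubRubricGroup.model_validate(value)
        elif isinstance(value, StubRubricGroup):
            # Already-constructed models pass through unchanged.
            groups[key] = value
        else:
            logger.warning(
                "Invalid type for rubric group %r in case %d. Expected dict or RubricGroup.",
                key, case_index,
            )
    return groups


# Example: a dict value is validated into the model.
print(coerce_rubric_groups({"text_quality_rubrics": {"rubrics": ["is concise"]}}, 0))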
