Commit 7ebbddb

vertex-sdk-bot authored and copybara-github committed
feat: adding gpu_partition_size parameter to Endpoint.deploy() method.
PiperOrigin-RevId: 797403888
1 parent a885b5d

2 files changed: +143 -13
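
Note: a minimal usage sketch of the new parameter (not part of the commit). The project, location, and resource IDs below are placeholders, and gpu_partition_size assumes an accelerator that supports NVIDIA MIG, such as an A100:

    from google.cloud import aiplatform

    aiplatform.init(project="my-project", location="us-central1")

    # Placeholder resource names; substitute real endpoint and model IDs.
    endpoint = aiplatform.Endpoint(
        "projects/my-project/locations/us-central1/endpoints/123"
    )
    model = aiplatform.Model("projects/my-project/locations/us-central1/models/456")

    endpoint.deploy(
        model=model,
        machine_type="a2-highgpu-1g",
        accelerator_type="NVIDIA_TESLA_A100",
        accelerator_count=1,
        gpu_partition_size="1g.10gb",  # MIG profile; routes the call to the v1beta1 API
    )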

google/cloud/aiplatform/models.py
Lines changed: 55 additions & 13 deletions
@@ -62,9 +62,11 @@
     deployment_resource_pool as gca_deployment_resource_pool_compat,
     deployed_model_ref as gca_deployed_model_ref_compat,
     encryption_spec as gca_encryption_spec,
+    endpoint_v1beta1 as gca_endpoint_v1beta1_compat,
     endpoint as gca_endpoint_compat,
     explanation as gca_explanation_compat,
     io as gca_io_compat,
+    machine_resources_v1beta1 as gca_machine_resources_v1beta1_compat,
     machine_resources as gca_machine_resources_compat,
     model as gca_model_compat,
     model_service as gca_model_service_compat,
@@ -1352,6 +1354,7 @@ def deploy(
         max_replica_count: int = 1,
         accelerator_type: Optional[str] = None,
         accelerator_count: Optional[int] = None,
+        gpu_partition_size: Optional[str] = None,
         tpu_topology: Optional[str] = None,
         service_account: Optional[str] = None,
         explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
@@ -1425,6 +1428,8 @@ def deploy(
                 NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
             accelerator_count (int):
                 Optional. The number of accelerators to attach to a worker replica.
+            gpu_partition_size (str):
+                Optional. The GPU partition size for NVIDIA MIG.
             tpu_topology (str):
                 Optional. The TPU topology to use for the DeployedModel.
                 Required for CloudTPU multihost deployments.
@@ -1537,6 +1542,7 @@
             max_replica_count=max_replica_count,
             accelerator_type=accelerator_type,
             accelerator_count=accelerator_count,
+            gpu_partition_size=gpu_partition_size,
             tpu_topology=tpu_topology,
             reservation_affinity_type=reservation_affinity_type,
             reservation_affinity_key=reservation_affinity_key,
@@ -1572,6 +1578,7 @@ def _deploy(
         max_replica_count: int = 1,
         accelerator_type: Optional[str] = None,
         accelerator_count: Optional[int] = None,
+        gpu_partition_size: Optional[str] = None,
         tpu_topology: Optional[str] = None,
         reservation_affinity_type: Optional[str] = None,
         reservation_affinity_key: Optional[str] = None,
@@ -1642,6 +1649,8 @@ def _deploy(
                 NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
             accelerator_count (int):
                 Optional. The number of accelerators to attach to a worker replica.
+            gpu_partition_size (str):
+                Optional. The GPU partition size for NVIDIA MIG.
             tpu_topology (str):
                 Optional. The TPU topology to use for the DeployedModel.
                 Required for CloudTPU multihost deployments.
@@ -1738,6 +1747,7 @@ def _deploy(
             max_replica_count=max_replica_count,
             accelerator_type=accelerator_type,
             accelerator_count=accelerator_count,
+            gpu_partition_size=gpu_partition_size,
             tpu_topology=tpu_topology,
             reservation_affinity_type=reservation_affinity_type,
             reservation_affinity_key=reservation_affinity_key,
@@ -1780,6 +1790,7 @@ def _deploy_call(
         max_replica_count: int = 1,
         accelerator_type: Optional[str] = None,
         accelerator_count: Optional[int] = None,
+        gpu_partition_size: Optional[str] = None,
         tpu_topology: Optional[str] = None,
         reservation_affinity_type: Optional[str] = None,
         reservation_affinity_key: Optional[str] = None,
@@ -1859,6 +1870,8 @@ def _deploy_call(
                 NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
             accelerator_count (int):
                 Optional. The number of accelerators to attach to a worker replica.
+            gpu_partition_size (str):
+                Optional. The GPU partition size for NVIDIA MIG.
             tpu_topology (str):
                 Optional. The TPU topology to use for the DeployedModel.
                 Required for CloudTPU multihost deployments.
@@ -1942,15 +1955,35 @@ def _deploy_call(
             ValueError: If both `explanation_spec` and `deployment_resource_pool`
                 are present.
         """
+        # The two features are incompatible due to an API versioning issue.
+        # TODO(b/436626409): after adding disable_container_logging to the v1
+        # proto, remove the incompatibility check.
+        if gpu_partition_size and disable_container_logging:
+            _LOGGER.warning(
+                "Cannot set both gpu_partition_size and disable_container_logging. disable_container_logging will be ignored."
+            )
+
+        gca_endpoint = gca_endpoint_compat
+        gca_machine_resources = gca_machine_resources_compat
+        if gpu_partition_size:
+            gca_machine_resources = gca_machine_resources_v1beta1_compat
+            gca_endpoint = gca_endpoint_v1beta1_compat
+            api_client = api_client.select_version("v1beta1")
         service_account = service_account or initializer.global_config.service_account
 
         if deployment_resource_pool:
-            deployed_model = gca_endpoint_compat.DeployedModel(
+            deployed_model = gca_endpoint.DeployedModel(
                 model=model.versioned_resource_name,
                 display_name=deployed_model_display_name,
                 service_account=service_account,
-                disable_container_logging=disable_container_logging,
             )
+            if not gpu_partition_size:
+                deployed_model = gca_endpoint.DeployedModel(
+                    model=model.versioned_resource_name,
+                    display_name=deployed_model_display_name,
+                    service_account=service_account,
+                    disable_container_logging=disable_container_logging,
+                )
 
             if system_labels:
                 deployed_model.system_labels = system_labels
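
Note: the hunk above defaults to the GA (v1) surface and swaps in v1beta1 only when gpu_partition_size is set. A standalone sketch of that selection pattern, using simplified stand-in objects rather than the SDK's real modules:

    from types import SimpleNamespace
    from typing import Optional

    # Stand-ins for the v1 and v1beta1 API surfaces.
    ga_surface = SimpleNamespace(version="v1")
    beta_surface = SimpleNamespace(version="v1beta1")

    def pick_surface(gpu_partition_size: Optional[str]) -> SimpleNamespace:
        # Preview-only fields force the v1beta1 surface; everything else stays on GA.
        return beta_surface if gpu_partition_size else ga_surface

    assert pick_surface(None).version == "v1"
    assert pick_surface("1g.10gb").version == "v1beta1"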
@@ -2012,14 +2045,20 @@ def _deploy_call(
                 "Both `accelerator_type` and `accelerator_count` should be set "
                 "when specifying autoscaling_target_accelerator_duty_cycle`"
             )
-
-        deployed_model = gca_endpoint_compat.DeployedModel(
+        deployed_model = gca_endpoint.DeployedModel(
             model=model.versioned_resource_name,
             display_name=deployed_model_display_name,
             service_account=service_account,
             enable_access_logging=enable_access_logging,
-            disable_container_logging=disable_container_logging,
         )
+        if not gpu_partition_size:
+            deployed_model = gca_endpoint.DeployedModel(
+                model=model.versioned_resource_name,
+                display_name=deployed_model_display_name,
+                service_account=service_account,
+                enable_access_logging=enable_access_logging,
+                disable_container_logging=disable_container_logging,
+            )
 
         if system_labels:
             deployed_model.system_labels = system_labels
@@ -2066,19 +2105,19 @@
             _LOGGER.info(f"Using default machine_type: {machine_type}")
 
         if use_dedicated_resources:
-            dedicated_resources = gca_machine_resources_compat.DedicatedResources(
+            dedicated_resources = gca_machine_resources.DedicatedResources(
                 min_replica_count=min_replica_count,
                 max_replica_count=max_replica_count,
                 spot=spot,
                 required_replica_count=required_replica_count,
             )
 
-            machine_spec = gca_machine_resources_compat.MachineSpec(
+            machine_spec = gca_machine_resources.MachineSpec(
                 machine_type=machine_type
             )
 
             if autoscaling_target_cpu_utilization:
-                autoscaling_metric_spec = gca_machine_resources_compat.AutoscalingMetricSpec(
+                autoscaling_metric_spec = gca_machine_resources.AutoscalingMetricSpec(
                     metric_name="aiplatform.googleapis.com/prediction/online/cpu/utilization",
                     target=autoscaling_target_cpu_utilization,
                 )
@@ -2092,17 +2131,20 @@
                 machine_spec.accelerator_count = accelerator_count
 
             if autoscaling_target_accelerator_duty_cycle:
-                autoscaling_metric_spec = gca_machine_resources_compat.AutoscalingMetricSpec(
+                autoscaling_metric_spec = gca_machine_resources.AutoscalingMetricSpec(
                     metric_name="aiplatform.googleapis.com/prediction/online/accelerator/duty_cycle",
                     target=autoscaling_target_accelerator_duty_cycle,
                 )
                 dedicated_resources.autoscaling_metric_specs.extend(
                     [autoscaling_metric_spec]
                 )
 
+            if gpu_partition_size:
+                machine_spec.gpu_partition_size = gpu_partition_size
+
             if autoscaling_target_request_count_per_minute:
                 autoscaling_metric_spec = (
-                    gca_machine_resources_compat.AutoscalingMetricSpec(
+                    gca_machine_resources.AutoscalingMetricSpec(
                         metric_name=(
                             "aiplatform.googleapis.com/prediction/online/"
                             "request_count"
@@ -2115,7 +2157,7 @@
                 )
 
             if autoscaling_target_pubsub_num_undelivered_messages:
-                autoscaling_metric_spec = gca_machine_resources_compat.AutoscalingMetricSpec(
+                autoscaling_metric_spec = gca_machine_resources.AutoscalingMetricSpec(
                     metric_name=(
                         "pubsub.googleapis.com/subscription/"
                         "num_undelivered_messages"
@@ -2141,14 +2183,14 @@
             deployed_model.dedicated_resources = dedicated_resources
             if fast_tryout_enabled:
                 deployed_model.faster_deployment_config = (
-                    gca_endpoint_compat.FasterDeploymentConfig(
+                    gca_endpoint.FasterDeploymentConfig(
                         fast_tryout_enabled=fast_tryout_enabled
                     )
                 )
 
         elif supports_automatic_resources:
             deployed_model.automatic_resources = (
-                gca_machine_resources_compat.AutomaticResources(
+                gca_machine_resources.AutomaticResources(
                     min_replica_count=min_replica_count,
                     max_replica_count=max_replica_count,
                 )

tests/unit/aiplatform/test_endpoints.py
Lines changed: 88 additions & 0 deletions
@@ -47,6 +47,7 @@
     deployment_resource_pool_v1 as gca_deployment_resource_pool_v1,
     deployment_resource_pool_v1beta1 as gca_deployment_resource_pool_v1beta1,
     encryption_spec as gca_encryption_spec,
+    encryption_spec_v1beta1 as gca_encryption_spec_v1beta1,
     endpoint_service_v1beta1 as gca_endpoint_service_v1beta1,
     endpoint_service as gca_endpoint_service,
     endpoint_v1beta1 as gca_endpoint_v1beta1,
@@ -134,6 +135,7 @@
 _TEST_MACHINE_TYPE = "n1-standard-32"
 _TEST_ACCELERATOR_TYPE = "NVIDIA_TESLA_P100"
 _TEST_ACCELERATOR_COUNT = 2
+_TEST_GPU_PARTITION_SIZE = "1g.10gb"
 
 _TEST_METRIC_NAME_CPU_UTILIZATION = (
     "aiplatform.googleapis.com/prediction/online/cpu/utilization"
@@ -216,6 +218,9 @@
 _TEST_ENCRYPTION_SPEC = gca_encryption_spec.EncryptionSpec(
     kms_key_name=_TEST_ENCRYPTION_KEY_NAME
 )
+_TEST_ENCRYPTION_SPEC_V1BETA1 = gca_encryption_spec_v1beta1.EncryptionSpec(
+    kms_key_name=_TEST_ENCRYPTION_KEY_NAME
+)
 
 _TEST_ENDPOINT_GAPIC = gca_endpoint.Endpoint(
     display_name=_TEST_DISPLAY_NAME, name=_TEST_ENDPOINT_NAME
@@ -304,6 +309,19 @@ def get_endpoint_mock():
         yield get_endpoint_mock
 
 
+@pytest.fixture
+def get_endpoint_v1beta1_mock():
+    with mock.patch.object(
+        endpoint_service_client_v1beta1.EndpointServiceClient, "get_endpoint"
+    ) as get_endpoint_mock:
+        get_endpoint_mock.return_value = gca_endpoint_v1beta1.Endpoint(
+            display_name=_TEST_DISPLAY_NAME,
+            name=_TEST_ENDPOINT_NAME,
+            encryption_spec=_TEST_ENCRYPTION_SPEC_V1BETA1,
+        )
+        yield get_endpoint_mock
+
+
 @pytest.fixture
 def get_empty_endpoint_mock():
     with mock.patch.object(
@@ -458,6 +476,25 @@ def deploy_model_mock():
         yield deploy_model_mock
 
 
+@pytest.fixture
+def deploy_model_mock_v1beta1():
+    with mock.patch.object(
+        endpoint_service_client_v1beta1.EndpointServiceClient, "deploy_model"
+    ) as deploy_model_mock:
+        deployed_model = gca_endpoint_v1beta1.DeployedModel(
+            model=_TEST_MODEL_NAME,
+            display_name=_TEST_DISPLAY_NAME,
+        )
+        deploy_model_lro_mock = mock.Mock(ga_operation.Operation)
+        deploy_model_lro_mock.result.return_value = (
+            gca_endpoint_service_v1beta1.DeployModelResponse(
+                deployed_model=deployed_model,
+            )
+        )
+        deploy_model_mock.return_value = deploy_model_lro_mock
+        yield deploy_model_mock
+
+
 @pytest.fixture
 def preview_deploy_model_mock():
     with mock.patch.object(
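
Note: the fixture above patches the v1beta1 client's deploy_model and hands back a mock long-running operation. A self-contained sketch of that LRO-mocking pattern, with canned values rather than the real GAPIC types:

    from unittest import mock

    # The mock operation's result() returns a canned response object.
    lro = mock.Mock()
    lro.result.return_value = {"deployed_model": "mock-deployed-model"}

    def fake_deploy_model(**kwargs):
        return lro  # stands in for the patched client method

    operation = fake_deploy_model(endpoint="projects/p/locations/l/endpoints/e")
    assert operation.result() == {"deployed_model": "mock-deployed-model"}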
@@ -1925,6 +1962,57 @@ def test_deploy_with_dedicated_resources(self, deploy_model_mock, sync):
             timeout=None,
         )
 
+    @pytest.mark.usefixtures(
+        "get_endpoint_mock", "get_endpoint_v1beta1_mock", "get_model_mock"
+    )
+    @pytest.mark.parametrize("sync", [True])
+    def test_deploy_with_dedicated_resources_and_gpu_partition_size(
+        self, deploy_model_mock_v1beta1, sync
+    ):
+        test_endpoint = models.Endpoint(_TEST_ENDPOINT_NAME)
+        test_model = models.Model(_TEST_ID)
+        test_model._gca_resource.supported_deployment_resources_types.append(
+            aiplatform.gapic.Model.DeploymentResourcesType.DEDICATED_RESOURCES
+        )
+        test_endpoint.deploy(
+            model=test_model,
+            machine_type=_TEST_MACHINE_TYPE,
+            accelerator_type=_TEST_ACCELERATOR_TYPE,
+            accelerator_count=_TEST_ACCELERATOR_COUNT,
+            gpu_partition_size=_TEST_GPU_PARTITION_SIZE,
+            service_account=_TEST_SERVICE_ACCOUNT,
+            sync=sync,
+            deploy_request_timeout=None,
+        )
+
+        if not sync:
+            test_endpoint.wait()
+
+        expected_machine_spec = gca_machine_resources_v1beta1.MachineSpec(
+            machine_type=_TEST_MACHINE_TYPE,
+            accelerator_type=_TEST_ACCELERATOR_TYPE,
+            accelerator_count=_TEST_ACCELERATOR_COUNT,
+            gpu_partition_size=_TEST_GPU_PARTITION_SIZE,
+        )
+        expected_dedicated_resources = gca_machine_resources_v1beta1.DedicatedResources(
+            machine_spec=expected_machine_spec,
+            min_replica_count=1,
+            max_replica_count=1,
+        )
+        expected_deployed_model = gca_endpoint_v1beta1.DeployedModel(
+            dedicated_resources=expected_dedicated_resources,
+            model=test_model.resource_name,
+            display_name=None,
+            service_account=_TEST_SERVICE_ACCOUNT,
+        )
+        deploy_model_mock_v1beta1.assert_called_once_with(
+            endpoint=test_endpoint.resource_name,
+            deployed_model=expected_deployed_model,
+            traffic_split={"0": 100},
+            metadata=(),
+            timeout=None,
+        )
+
     @pytest.mark.usefixtures("get_endpoint_mock", "get_model_mock")
     @pytest.mark.parametrize("sync", [True, False])
     def test_deploy_with_autoscaling_target_cpu_utilization(
