Commit 7ebbddb

vertex-sdk-bot authored and copybara-github committed
feat: adding gpu_partition_size parameter to Endpoint.deploy() method.
PiperOrigin-RevId: 797403888
1 parent a885b5d

2 files changed: +143 -13
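
Note: a minimal usage sketch of the new parameter (not part of the commit). The project, location, and resource IDs below are placeholders, and gpu_partition_size assumes an accelerator that supports NVIDIA MIG, such as an A100:

    from google.cloud import aiplatform

    aiplatform.init(project="my-project", location="us-central1")

    # Placeholder resource names; substitute real endpoint and model IDs.
    endpoint = aiplatform.Endpoint(
        "projects/my-project/locations/us-central1/endpoints/123"
    )
    model = aiplatform.Model("projects/my-project/locations/us-central1/models/456")

    endpoint.deploy(
        model=model,
        machine_type="a2-highgpu-1g",
        accelerator_type="NVIDIA_TESLA_A100",
        accelerator_count=1,
        gpu_partition_size="1g.10gb",  # MIG profile; routes the call to the v1beta1 API
    )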

google/cloud/aiplatform/models.py
Lines changed: 55 additions & 13 deletions
@@ -62,9 +62,11 @@
     deployment_resource_pool as gca_deployment_resource_pool_compat,
     deployed_model_ref as gca_deployed_model_ref_compat,
     encryption_spec as gca_encryption_spec,
+    endpoint_v1beta1 as gca_endpoint_v1beta1_compat,
     endpoint as gca_endpoint_compat,
     explanation as gca_explanation_compat,
     io as gca_io_compat,
+    machine_resources_v1beta1 as gca_machine_resources_v1beta1_compat,
     machine_resources as gca_machine_resources_compat,
     model as gca_model_compat,
     model_service as gca_model_service_compat,
@@ -1352,6 +1354,7 @@ def deploy(
         max_replica_count: int = 1,
         accelerator_type: Optional[str] = None,
         accelerator_count: Optional[int] = None,
+        gpu_partition_size: Optional[str] = None,
         tpu_topology: Optional[str] = None,
         service_account: Optional[str] = None,
         explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
@@ -1425,6 +1428,8 @@ def deploy(
                 NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
             accelerator_count (int):
                 Optional. The number of accelerators to attach to a worker replica.
+            gpu_partition_size (str):
+                Optional. The GPU partition size for NVIDIA MIG.
             tpu_topology (str):
                 Optional. The TPU topology to use for the DeployedModel.
                 Required for CloudTPU multihost deployments.
@@ -1537,6 +1542,7 @@
             max_replica_count=max_replica_count,
             accelerator_type=accelerator_type,
             accelerator_count=accelerator_count,
+            gpu_partition_size=gpu_partition_size,
             tpu_topology=tpu_topology,
             reservation_affinity_type=reservation_affinity_type,
             reservation_affinity_key=reservation_affinity_key,
@@ -1572,6 +1578,7 @@ def _deploy(
         max_replica_count: int = 1,
         accelerator_type: Optional[str] = None,
         accelerator_count: Optional[int] = None,
+        gpu_partition_size: Optional[str] = None,
         tpu_topology: Optional[str] = None,
         reservation_affinity_type: Optional[str] = None,
         reservation_affinity_key: Optional[str] = None,
@@ -1642,6 +1649,8 @@ def _deploy(
                 NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
             accelerator_count (int):
                 Optional. The number of accelerators to attach to a worker replica.
+            gpu_partition_size (str):
+                Optional. The GPU partition size for NVIDIA MIG.
             tpu_topology (str):
                 Optional. The TPU topology to use for the DeployedModel.
                 Required for CloudTPU multihost deployments.
@@ -1738,6 +1747,7 @@ def _deploy(
             max_replica_count=max_replica_count,
             accelerator_type=accelerator_type,
             accelerator_count=accelerator_count,
+            gpu_partition_size=gpu_partition_size,
             tpu_topology=tpu_topology,
             reservation_affinity_type=reservation_affinity_type,
             reservation_affinity_key=reservation_affinity_key,
@@ -1780,6 +1790,7 @@ def _deploy_call(
         max_replica_count: int = 1,
         accelerator_type: Optional[str] = None,
         accelerator_count: Optional[int] = None,
+        gpu_partition_size: Optional[str] = None,
         tpu_topology: Optional[str] = None,
         reservation_affinity_type: Optional[str] = None,
         reservation_affinity_key: Optional[str] = None,
@@ -1859,6 +1870,8 @@ def _deploy_call(
                 NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
             accelerator_count (int):
                 Optional. The number of accelerators to attach to a worker replica.
+            gpu_partition_size (str):
+                Optional. The GPU partition size for NVIDIA MIG.
             tpu_topology (str):
                 Optional. The TPU topology to use for the DeployedModel.
                 Required for CloudTPU multihost deployments.
@@ -1942,15 +1955,35 @@ def _deploy_call(
             ValueError: If both `explanation_spec` and `deployment_resource_pool`
                 are present.
         """
+        # The two features are incompatible due to an API versioning issue.
+        # TODO(b/436626409): after adding disable_container_logging to the v1
+        # proto, remove the incompatibility check.
+        if gpu_partition_size and disable_container_logging:
+            _LOGGER.warning(
+                "Cannot set both gpu_partition_size and disable_container_logging. disable_container_logging will be ignored."
+            )
+
+        gca_endpoint = gca_endpoint_compat
+        gca_machine_resources = gca_machine_resources_compat
+        if gpu_partition_size:
+            gca_machine_resources = gca_machine_resources_v1beta1_compat
+            gca_endpoint = gca_endpoint_v1beta1_compat
+            api_client = api_client.select_version("v1beta1")
         service_account = service_account or initializer.global_config.service_account
 
         if deployment_resource_pool:
-            deployed_model = gca_endpoint_compat.DeployedModel(
+            deployed_model = gca_endpoint.DeployedModel(
                 model=model.versioned_resource_name,
                 display_name=deployed_model_display_name,
                 service_account=service_account,
-                disable_container_logging=disable_container_logging,
             )
+            if not gpu_partition_size:
+                deployed_model = gca_endpoint.DeployedModel(
+                    model=model.versioned_resource_name,
+                    display_name=deployed_model_display_name,
+                    service_account=service_account,
+                    disable_container_logging=disable_container_logging,
+                )
 
             if system_labels:
                 deployed_model.system_labels = system_labels
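
Note: the hunk above defaults to the GA (v1) surface and swaps in v1beta1 only when gpu_partition_size is set. A standalone sketch of that selection pattern, using simplified stand-in objects rather than the SDK's real modules:

    from types import SimpleNamespace
    from typing import Optional

    # Stand-ins for the v1 and v1beta1 API surfaces.
    ga_surface = SimpleNamespace(version="v1")
    beta_surface = SimpleNamespace(version="v1beta1")

    def pick_surface(gpu_partition_size: Optional[str]) -> SimpleNamespace:
        # Preview-only fields force the v1beta1 surface; everything else stays on GA.
        return beta_surface if gpu_partition_size else ga_surface

    assert pick_surface(None).version == "v1"
    assert pick_surface("1g.10gb").version == "v1beta1"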
@@ -2012,14 +2045,20 @@ def _deploy_call(
                 "Both `accelerator_type` and `accelerator_count` should be set "
                 "when specifying autoscaling_target_accelerator_duty_cycle`"
             )
-
-        deployed_model = gca_endpoint_compat.DeployedModel(
+        deployed_model = gca_endpoint.DeployedModel(
             model=model.versioned_resource_name,
             display_name=deployed_model_display_name,
             service_account=service_account,
             enable_access_logging=enable_access_logging,
-            disable_container_logging=disable_container_logging,
         )
+        if not gpu_partition_size:
+            deployed_model = gca_endpoint.DeployedModel(
+                model=model.versioned_resource_name,
+                display_name=deployed_model_display_name,
+                service_account=service_account,
+                enable_access_logging=enable_access_logging,
+                disable_container_logging=disable_container_logging,
+            )
 
         if system_labels:
             deployed_model.system_labels = system_labels
@@ -2066,19 +2105,19 @@
             _LOGGER.info(f"Using default machine_type: {machine_type}")
 
         if use_dedicated_resources:
-            dedicated_resources = gca_machine_resources_compat.DedicatedResources(
+            dedicated_resources = gca_machine_resources.DedicatedResources(
                 min_replica_count=min_replica_count,
                 max_replica_count=max_replica_count,
                 spot=spot,
                 required_replica_count=required_replica_count,
             )
 
-            machine_spec = gca_machine_resources_compat.MachineSpec(
+            machine_spec = gca_machine_resources.MachineSpec(
                 machine_type=machine_type
             )
 
             if autoscaling_target_cpu_utilization:
-                autoscaling_metric_spec = gca_machine_resources_compat.AutoscalingMetricSpec(
+                autoscaling_metric_spec = gca_machine_resources.AutoscalingMetricSpec(
                     metric_name="aiplatform.googleapis.com/prediction/online/cpu/utilization",
                     target=autoscaling_target_cpu_utilization,
                 )
@@ -2092,17 +2131,20 @@
                 machine_spec.accelerator_count = accelerator_count
 
             if autoscaling_target_accelerator_duty_cycle:
-                autoscaling_metric_spec = gca_machine_resources_compat.AutoscalingMetricSpec(
+                autoscaling_metric_spec = gca_machine_resources.AutoscalingMetricSpec(
                     metric_name="aiplatform.googleapis.com/prediction/online/accelerator/duty_cycle",
                     target=autoscaling_target_accelerator_duty_cycle,
                 )
                 dedicated_resources.autoscaling_metric_specs.extend(
                     [autoscaling_metric_spec]
                 )
 
+            if gpu_partition_size:
+                machine_spec.gpu_partition_size = gpu_partition_size
+
             if autoscaling_target_request_count_per_minute:
                 autoscaling_metric_spec = (
-                    gca_machine_resources_compat.AutoscalingMetricSpec(
+                    gca_machine_resources.AutoscalingMetricSpec(
                         metric_name=(
                             "aiplatform.googleapis.com/prediction/online/"
                             "request_count"
@@ -2115,7 +2157,7 @@
                 )
 
             if autoscaling_target_pubsub_num_undelivered_messages:
-                autoscaling_metric_spec = gca_machine_resources_compat.AutoscalingMetricSpec(
+                autoscaling_metric_spec = gca_machine_resources.AutoscalingMetricSpec(
                     metric_name=(
                         "pubsub.googleapis.com/subscription/"
                         "num_undelivered_messages"
@@ -2141,14 +2183,14 @@
             deployed_model.dedicated_resources = dedicated_resources
             if fast_tryout_enabled:
                 deployed_model.faster_deployment_config = (
-                    gca_endpoint_compat.FasterDeploymentConfig(
+                    gca_endpoint.FasterDeploymentConfig(
                         fast_tryout_enabled=fast_tryout_enabled
                     )
                 )
 
         elif supports_automatic_resources:
             deployed_model.automatic_resources = (
-                gca_machine_resources_compat.AutomaticResources(
+                gca_machine_resources.AutomaticResources(
                     min_replica_count=min_replica_count,
                     max_replica_count=max_replica_count,
                 )

tests/unit/aiplatform/test_endpoints.py
Lines changed: 88 additions & 0 deletions
@@ -47,6 +47,7 @@
     deployment_resource_pool_v1 as gca_deployment_resource_pool_v1,
     deployment_resource_pool_v1beta1 as gca_deployment_resource_pool_v1beta1,
     encryption_spec as gca_encryption_spec,
+    encryption_spec_v1beta1 as gca_encryption_spec_v1beta1,
     endpoint_service_v1beta1 as gca_endpoint_service_v1beta1,
     endpoint_service as gca_endpoint_service,
     endpoint_v1beta1 as gca_endpoint_v1beta1,
@@ -134,6 +135,7 @@
 _TEST_MACHINE_TYPE = "n1-standard-32"
 _TEST_ACCELERATOR_TYPE = "NVIDIA_TESLA_P100"
 _TEST_ACCELERATOR_COUNT = 2
+_TEST_GPU_PARTITION_SIZE = "1g.10gb"
 
 _TEST_METRIC_NAME_CPU_UTILIZATION = (
     "aiplatform.googleapis.com/prediction/online/cpu/utilization"
@@ -216,6 +218,9 @@
 _TEST_ENCRYPTION_SPEC = gca_encryption_spec.EncryptionSpec(
     kms_key_name=_TEST_ENCRYPTION_KEY_NAME
 )
+_TEST_ENCRYPTION_SPEC_V1BETA1 = gca_encryption_spec_v1beta1.EncryptionSpec(
+    kms_key_name=_TEST_ENCRYPTION_KEY_NAME
+)
 
 _TEST_ENDPOINT_GAPIC = gca_endpoint.Endpoint(
     display_name=_TEST_DISPLAY_NAME, name=_TEST_ENDPOINT_NAME
@@ -304,6 +309,19 @@ def get_endpoint_mock():
         yield get_endpoint_mock
 
 
+@pytest.fixture
+def get_endpoint_v1beta1_mock():
+    with mock.patch.object(
+        endpoint_service_client_v1beta1.EndpointServiceClient, "get_endpoint"
+    ) as get_endpoint_mock:
+        get_endpoint_mock.return_value = gca_endpoint_v1beta1.Endpoint(
+            display_name=_TEST_DISPLAY_NAME,
+            name=_TEST_ENDPOINT_NAME,
+            encryption_spec=_TEST_ENCRYPTION_SPEC_V1BETA1,
+        )
+        yield get_endpoint_mock
+
+
 @pytest.fixture
 def get_empty_endpoint_mock():
     with mock.patch.object(
@@ -458,6 +476,25 @@ def deploy_model_mock():
         yield deploy_model_mock
 
 
+@pytest.fixture
+def deploy_model_mock_v1beta1():
+    with mock.patch.object(
+        endpoint_service_client_v1beta1.EndpointServiceClient, "deploy_model"
+    ) as deploy_model_mock:
+        deployed_model = gca_endpoint_v1beta1.DeployedModel(
+            model=_TEST_MODEL_NAME,
+            display_name=_TEST_DISPLAY_NAME,
+        )
+        deploy_model_lro_mock = mock.Mock(ga_operation.Operation)
+        deploy_model_lro_mock.result.return_value = (
+            gca_endpoint_service_v1beta1.DeployModelResponse(
+                deployed_model=deployed_model,
+            )
+        )
+        deploy_model_mock.return_value = deploy_model_lro_mock
+        yield deploy_model_mock
+
+
 @pytest.fixture
 def preview_deploy_model_mock():
     with mock.patch.object(
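
Note: the fixture above patches the v1beta1 client's deploy_model and hands back a mock long-running operation. A self-contained sketch of that LRO-mocking pattern, with canned values rather than the real GAPIC types:

    from unittest import mock

    # The mock operation's result() returns a canned response object.
    lro = mock.Mock()
    lro.result.return_value = {"deployed_model": "mock-deployed-model"}

    def fake_deploy_model(**kwargs):
        return lro  # stands in for the patched client method

    operation = fake_deploy_model(endpoint="projects/p/locations/l/endpoints/e")
    assert operation.result() == {"deployed_model": "mock-deployed-model"}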
@@ -1925,6 +1962,57 @@ def test_deploy_with_dedicated_resources(self, deploy_model_mock, sync):
             timeout=None,
         )
 
+    @pytest.mark.usefixtures(
+        "get_endpoint_mock", "get_endpoint_v1beta1_mock", "get_model_mock"
+    )
+    @pytest.mark.parametrize("sync", [True])
+    def test_deploy_with_dedicated_resources_and_gpu_partition_size(
+        self, deploy_model_mock_v1beta1, sync
+    ):
+        test_endpoint = models.Endpoint(_TEST_ENDPOINT_NAME)
+        test_model = models.Model(_TEST_ID)
+        test_model._gca_resource.supported_deployment_resources_types.append(
+            aiplatform.gapic.Model.DeploymentResourcesType.DEDICATED_RESOURCES
+        )
+        test_endpoint.deploy(
+            model=test_model,
+            machine_type=_TEST_MACHINE_TYPE,
+            accelerator_type=_TEST_ACCELERATOR_TYPE,
+            accelerator_count=_TEST_ACCELERATOR_COUNT,
+            gpu_partition_size=_TEST_GPU_PARTITION_SIZE,
+            service_account=_TEST_SERVICE_ACCOUNT,
+            sync=sync,
+            deploy_request_timeout=None,
+        )
+
+        if not sync:
+            test_endpoint.wait()
+
+        expected_machine_spec = gca_machine_resources_v1beta1.MachineSpec(
+            machine_type=_TEST_MACHINE_TYPE,
+            accelerator_type=_TEST_ACCELERATOR_TYPE,
+            accelerator_count=_TEST_ACCELERATOR_COUNT,
+            gpu_partition_size=_TEST_GPU_PARTITION_SIZE,
+        )
+        expected_dedicated_resources = gca_machine_resources_v1beta1.DedicatedResources(
+            machine_spec=expected_machine_spec,
+            min_replica_count=1,
+            max_replica_count=1,
+        )
+        expected_deployed_model = gca_endpoint_v1beta1.DeployedModel(
+            dedicated_resources=expected_dedicated_resources,
+            model=test_model.resource_name,
+            display_name=None,
+            service_account=_TEST_SERVICE_ACCOUNT,
+        )
+        deploy_model_mock_v1beta1.assert_called_once_with(
+            endpoint=test_endpoint.resource_name,
+            deployed_model=expected_deployed_model,
+            traffic_split={"0": 100},
+            metadata=(),
+            timeout=None,
+        )
+
     @pytest.mark.usefixtures("get_endpoint_mock", "get_model_mock")
     @pytest.mark.parametrize("sync", [True, False])
     def test_deploy_with_autoscaling_target_cpu_utilization(
