62
62
deployment_resource_pool as gca_deployment_resource_pool_compat ,
63
63
deployed_model_ref as gca_deployed_model_ref_compat ,
64
64
encryption_spec as gca_encryption_spec ,
65
+ endpoint_v1beta1 as gca_endpoint_v1beta1_compat ,
65
66
endpoint as gca_endpoint_compat ,
66
67
explanation as gca_explanation_compat ,
67
68
io as gca_io_compat ,
69
+ machine_resources_v1beta1 as gca_machine_resources_v1beta1_compat ,
68
70
machine_resources as gca_machine_resources_compat ,
69
71
model as gca_model_compat ,
70
72
model_service as gca_model_service_compat ,
@@ -1352,6 +1354,7 @@ def deploy(
1352
1354
max_replica_count : int = 1 ,
1353
1355
accelerator_type : Optional [str ] = None ,
1354
1356
accelerator_count : Optional [int ] = None ,
1357
+ gpu_partition_size : Optional [str ] = None ,
1355
1358
tpu_topology : Optional [str ] = None ,
1356
1359
service_account : Optional [str ] = None ,
1357
1360
explanation_metadata : Optional [aiplatform .explain .ExplanationMetadata ] = None ,
@@ -1425,6 +1428,8 @@ def deploy(
1425
1428
NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
1426
1429
accelerator_count (int):
1427
1430
Optional. The number of accelerators to attach to a worker replica.
1431
+ gpu_partition_size (str):
1432
+ Optional. The GPU partition size for NVIDIA MIG.
1428
1433
tpu_topology (str):
1429
1434
Optional. The TPU topology to use for the DeployedModel.
1430
1435
Required for CloudTPU multihost deployments.
@@ -1537,6 +1542,7 @@ def deploy(
1537
1542
max_replica_count = max_replica_count ,
1538
1543
accelerator_type = accelerator_type ,
1539
1544
accelerator_count = accelerator_count ,
1545
+ gpu_partition_size = gpu_partition_size ,
1540
1546
tpu_topology = tpu_topology ,
1541
1547
reservation_affinity_type = reservation_affinity_type ,
1542
1548
reservation_affinity_key = reservation_affinity_key ,
@@ -1572,6 +1578,7 @@ def _deploy(
1572
1578
max_replica_count : int = 1 ,
1573
1579
accelerator_type : Optional [str ] = None ,
1574
1580
accelerator_count : Optional [int ] = None ,
1581
+ gpu_partition_size : Optional [str ] = None ,
1575
1582
tpu_topology : Optional [str ] = None ,
1576
1583
reservation_affinity_type : Optional [str ] = None ,
1577
1584
reservation_affinity_key : Optional [str ] = None ,
@@ -1642,6 +1649,8 @@ def _deploy(
1642
1649
NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
1643
1650
accelerator_count (int):
1644
1651
Optional. The number of accelerators to attach to a worker replica.
1652
+ gpu_partition_size (str):
1653
+ Optional. The GPU partition size for NVIDIA MIG.
1645
1654
tpu_topology (str):
1646
1655
Optional. The TPU topology to use for the DeployedModel.
1647
1656
Required for CloudTPU multihost deployments.
@@ -1738,6 +1747,7 @@ def _deploy(
1738
1747
max_replica_count = max_replica_count ,
1739
1748
accelerator_type = accelerator_type ,
1740
1749
accelerator_count = accelerator_count ,
1750
+ gpu_partition_size = gpu_partition_size ,
1741
1751
tpu_topology = tpu_topology ,
1742
1752
reservation_affinity_type = reservation_affinity_type ,
1743
1753
reservation_affinity_key = reservation_affinity_key ,
@@ -1780,6 +1790,7 @@ def _deploy_call(
1780
1790
max_replica_count : int = 1 ,
1781
1791
accelerator_type : Optional [str ] = None ,
1782
1792
accelerator_count : Optional [int ] = None ,
1793
+ gpu_partition_size : Optional [str ] = None ,
1783
1794
tpu_topology : Optional [str ] = None ,
1784
1795
reservation_affinity_type : Optional [str ] = None ,
1785
1796
reservation_affinity_key : Optional [str ] = None ,
@@ -1859,6 +1870,8 @@ def _deploy_call(
1859
1870
NVIDIA_TESLA_V100, NVIDIA_TESLA_P4, NVIDIA_TESLA_T4
1860
1871
accelerator_count (int):
1861
1872
Optional. The number of accelerators to attach to a worker replica.
1873
+ gpu_partition_size (str):
1874
+ Optional. The GPU partition size for NVIDIA MIG.
1862
1875
tpu_topology (str):
1863
1876
Optional. The TPU topology to use for the DeployedModel.
1864
1877
Required for CloudTPU multihost deployments.
@@ -1942,15 +1955,35 @@ def _deploy_call(
1942
1955
ValueError: If both `explanation_spec` and `deployment_resource_pool`
1943
1956
are present.
1944
1957
"""
1958
+ # The two features are incompatible due to an API versioning issue.
1959
+ # TODO(b/436626409) after adding the disable_container_logging to v1 proto
1960
+ # remove the incompatibility check.
1961
+ if gpu_partition_size and disable_container_logging :
1962
+ _LOGGER .warning (
1963
+ "Cannot set both gpu_partition_size and disable_container_logging. disable_container_logging will be ignored."
1964
+ )
1965
+
1966
+ gca_endpoint = gca_endpoint_compat
1967
+ gca_machine_resources = gca_machine_resources_compat
1968
+ if gpu_partition_size :
1969
+ gca_machine_resources = gca_machine_resources_v1beta1_compat
1970
+ gca_endpoint = gca_endpoint_v1beta1_compat
1971
+ api_client = api_client .select_version ("v1beta1" )
1945
1972
service_account = service_account or initializer .global_config .service_account
1946
1973
1947
1974
if deployment_resource_pool :
1948
- deployed_model = gca_endpoint_compat .DeployedModel (
1975
+ deployed_model = gca_endpoint .DeployedModel (
1949
1976
model = model .versioned_resource_name ,
1950
1977
display_name = deployed_model_display_name ,
1951
1978
service_account = service_account ,
1952
- disable_container_logging = disable_container_logging ,
1953
1979
)
1980
+ if not gpu_partition_size :
1981
+ deployed_model = gca_endpoint .DeployedModel (
1982
+ model = model .versioned_resource_name ,
1983
+ display_name = deployed_model_display_name ,
1984
+ service_account = service_account ,
1985
+ disable_container_logging = disable_container_logging ,
1986
+ )
1954
1987
1955
1988
if system_labels :
1956
1989
deployed_model .system_labels = system_labels
@@ -2012,14 +2045,20 @@ def _deploy_call(
2012
2045
"Both `accelerator_type` and `accelerator_count` should be set "
2013
2046
"when specifying autoscaling_target_accelerator_duty_cycle`"
2014
2047
)
2015
-
2016
- deployed_model = gca_endpoint_compat .DeployedModel (
2048
+ deployed_model = gca_endpoint .DeployedModel (
2017
2049
model = model .versioned_resource_name ,
2018
2050
display_name = deployed_model_display_name ,
2019
2051
service_account = service_account ,
2020
2052
enable_access_logging = enable_access_logging ,
2021
- disable_container_logging = disable_container_logging ,
2022
2053
)
2054
+ if not gpu_partition_size :
2055
+ deployed_model = gca_endpoint .DeployedModel (
2056
+ model = model .versioned_resource_name ,
2057
+ display_name = deployed_model_display_name ,
2058
+ service_account = service_account ,
2059
+ enable_access_logging = enable_access_logging ,
2060
+ disable_container_logging = disable_container_logging ,
2061
+ )
2023
2062
2024
2063
if system_labels :
2025
2064
deployed_model .system_labels = system_labels
@@ -2066,19 +2105,19 @@ def _deploy_call(
2066
2105
_LOGGER .info (f"Using default machine_type: { machine_type } " )
2067
2106
2068
2107
if use_dedicated_resources :
2069
- dedicated_resources = gca_machine_resources_compat .DedicatedResources (
2108
+ dedicated_resources = gca_machine_resources .DedicatedResources (
2070
2109
min_replica_count = min_replica_count ,
2071
2110
max_replica_count = max_replica_count ,
2072
2111
spot = spot ,
2073
2112
required_replica_count = required_replica_count ,
2074
2113
)
2075
2114
2076
- machine_spec = gca_machine_resources_compat .MachineSpec (
2115
+ machine_spec = gca_machine_resources .MachineSpec (
2077
2116
machine_type = machine_type
2078
2117
)
2079
2118
2080
2119
if autoscaling_target_cpu_utilization :
2081
- autoscaling_metric_spec = gca_machine_resources_compat .AutoscalingMetricSpec (
2120
+ autoscaling_metric_spec = gca_machine_resources .AutoscalingMetricSpec (
2082
2121
metric_name = "aiplatform.googleapis.com/prediction/online/cpu/utilization" ,
2083
2122
target = autoscaling_target_cpu_utilization ,
2084
2123
)
@@ -2092,17 +2131,20 @@ def _deploy_call(
2092
2131
machine_spec .accelerator_count = accelerator_count
2093
2132
2094
2133
if autoscaling_target_accelerator_duty_cycle :
2095
- autoscaling_metric_spec = gca_machine_resources_compat .AutoscalingMetricSpec (
2134
+ autoscaling_metric_spec = gca_machine_resources .AutoscalingMetricSpec (
2096
2135
metric_name = "aiplatform.googleapis.com/prediction/online/accelerator/duty_cycle" ,
2097
2136
target = autoscaling_target_accelerator_duty_cycle ,
2098
2137
)
2099
2138
dedicated_resources .autoscaling_metric_specs .extend (
2100
2139
[autoscaling_metric_spec ]
2101
2140
)
2102
2141
2142
+ if gpu_partition_size :
2143
+ machine_spec .gpu_partition_size = gpu_partition_size
2144
+
2103
2145
if autoscaling_target_request_count_per_minute :
2104
2146
autoscaling_metric_spec = (
2105
- gca_machine_resources_compat .AutoscalingMetricSpec (
2147
+ gca_machine_resources .AutoscalingMetricSpec (
2106
2148
metric_name = (
2107
2149
"aiplatform.googleapis.com/prediction/online/"
2108
2150
"request_count"
@@ -2115,7 +2157,7 @@ def _deploy_call(
2115
2157
)
2116
2158
2117
2159
if autoscaling_target_pubsub_num_undelivered_messages :
2118
- autoscaling_metric_spec = gca_machine_resources_compat .AutoscalingMetricSpec (
2160
+ autoscaling_metric_spec = gca_machine_resources .AutoscalingMetricSpec (
2119
2161
metric_name = (
2120
2162
"pubsub.googleapis.com/subscription/"
2121
2163
"num_undelivered_messages"
@@ -2141,14 +2183,14 @@ def _deploy_call(
2141
2183
deployed_model .dedicated_resources = dedicated_resources
2142
2184
if fast_tryout_enabled :
2143
2185
deployed_model .faster_deployment_config = (
2144
- gca_endpoint_compat .FasterDeploymentConfig (
2186
+ gca_endpoint .FasterDeploymentConfig (
2145
2187
fast_tryout_enabled = fast_tryout_enabled
2146
2188
)
2147
2189
)
2148
2190
2149
2191
elif supports_automatic_resources :
2150
2192
deployed_model .automatic_resources = (
2151
- gca_machine_resources_compat .AutomaticResources (
2193
+ gca_machine_resources .AutomaticResources (
2152
2194
min_replica_count = min_replica_count ,
2153
2195
max_replica_count = max_replica_count ,
2154
2196
)
0 commit comments