
Commit 55ae3da

[Feat] Add support for scaling down to zero in KEDA (#679)
* feat: keda scale down to zero
* formatting
* Change gauge to counter
* Add note

Signed-off-by: Xiangfeng Zhu <[email protected]>
Co-authored-by: Yuhan Liu <[email protected]>
1 parent f2d3d71 commit 55ae3da

File tree: 6 files changed, +172 -15 lines changed


observability/prom-adapter.yaml

Lines changed: 11 additions & 0 deletions

@@ -18,3 +18,14 @@ rules:
     matches: ""
     as: "vllm_num_requests_waiting"
   metricsQuery: sum by(namespace) (vllm:num_requests_waiting)
+
+# Export num_incoming_requests_total by model name
+- seriesQuery: '{__name__=~"^vllm:num_incoming_requests_total$"}'
+  resources:
+    overrides:
+      namespace:
+        resource: "namespace"
+  name:
+    matches: ""
+    as: "vllm_num_incoming_requests_total"
+  metricsQuery: sum by(namespace, model) (vllm:num_incoming_requests_total)
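
Once prometheus-adapter reloads this rule, the renamed series should be queryable through the Kubernetes custom metrics API. A minimal sketch of how to check, assuming the standard prometheus-adapter setup and the `default` namespace (the namespace-scoped path shown is the adapter's usual convention; adjust names for your cluster):

```bash
# List the exported metric via the custom metrics API.
# An empty "items" list usually means the adapter has not matched
# the vllm:num_incoming_requests_total series yet.
kubectl get --raw \
  "/apis/custom.metrics.k8s.io/v1beta1/namespaces/default/metrics/vllm_num_incoming_requests_total" \
  | jq .
```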

src/vllm_router/service_discovery.py

Lines changed: 17 additions & 1 deletion

@@ -20,7 +20,7 @@
 import time
 import uuid
 from dataclasses import dataclass
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Set
 
 import aiohttp
 import requests
@@ -372,6 +372,8 @@ def __init__(
         self.port = port
         self.available_engines: Dict[str, EndpointInfo] = {}
         self.available_engines_lock = threading.Lock()
+        self.known_models: Set[str] = set()
+        self.known_models_lock = threading.Lock()
         self.label_selector = label_selector
         self.watcher_timeout_seconds = watcher_timeout_seconds
         self.health_check_timeout_seconds = health_check_timeout_seconds
@@ -662,6 +664,10 @@ def _add_engine(
         # Store model information in the endpoint info
         self.available_engines[engine_name].model_info = model_info
 
+        # Track all models we've ever seen
+        with self.known_models_lock:
+            self.known_models.update(model_names)
+
     def _delete_engine(self, engine_name: str):
         logger.info(f"Serving engine {engine_name} is deleted")
         with self.available_engines_lock:
@@ -758,6 +764,16 @@ async def initialize_client_sessions(self) -> None:
                 timeout=aiohttp.ClientTimeout(total=None),
             )
 
+    def has_ever_seen_model(self, model_name: str) -> bool:
+        """Check if we've ever seen this model, even if currently scaled to zero."""
+        with self.known_models_lock:
+            return model_name in self.known_models
+
+    def get_known_models(self) -> Set[str]:
+        """Get all models that have ever been discovered."""
+        with self.known_models_lock:
+            return self.known_models.copy()
+
 
 class K8sServiceNameServiceDiscovery(ServiceDiscovery):
     def __init__(
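
Note that `_delete_engine` removes the endpoint from `available_engines` but never touches `known_models`: the set is deliberately append-only, so the router can later distinguish a model that never existed from one that is merely scaled to zero. A standalone sketch of the same pattern (class and names here are illustrative, not the router's API):

```python
import threading
from typing import Set


class ModelRegistry:
    """Append-only, thread-safe record of every model ever discovered."""

    def __init__(self) -> None:
        self._known: Set[str] = set()
        self._lock = threading.Lock()

    def observe(self, model_name: str) -> None:
        # Called whenever an engine advertises a model; never undone,
        # so a model scaled down to zero replicas still counts as "known".
        with self._lock:
            self._known.add(model_name)

    def has_ever_seen(self, model_name: str) -> bool:
        with self._lock:
            return model_name in self._known


registry = ModelRegistry()
registry.observe("meta-llama/Llama-3.1-8B-Instruct")
assert registry.has_ever_seen("meta-llama/Llama-3.1-8B-Instruct")
assert not registry.has_ever_seen("unknown-model")
```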

src/vllm_router/services/metrics_service/__init__.py

Lines changed: 6 additions & 1 deletion

@@ -1,4 +1,4 @@
-from prometheus_client import Gauge
+from prometheus_client import Counter, Gauge
 
 # --- Prometheus Gauges ---
 # Existing metrics
@@ -33,6 +33,11 @@
 num_decoding_requests = Gauge(
     "vllm:num_decoding_requests", "Number of Decoding Requests", ["server"]
 )
+num_incoming_requests_total = Counter(
+    "vllm:num_incoming_requests",
+    "Total valid incoming requests to router (including when no backends available).",
+    ["model"],
+)
 
 # New metrics per dashboard update
 healthy_pods_total = Gauge(
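
Note the apparent mismatch between the Python name (`vllm:num_incoming_requests`) and the PromQL queries elsewhere in this commit (`vllm:num_incoming_requests_total`): `prometheus_client` appends the `_total` suffix to Counter samples at exposition time. A quick standalone sketch to confirm (not router code):

```python
from prometheus_client import CollectorRegistry, Counter, generate_latest

registry = CollectorRegistry()
requests_total = Counter(
    "vllm:num_incoming_requests",
    "Total valid incoming requests.",
    ["model"],
    registry=registry,
)
requests_total.labels(model="demo-model").inc()

# The scrape output carries the _total suffix that the PromQL queries use.
print(generate_latest(registry).decode())
# ... vllm:num_incoming_requests_total{model="demo-model"} 1.0
```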

src/vllm_router/services/request_service/request.py

Lines changed: 26 additions & 6 deletions

@@ -47,6 +47,7 @@
 except ImportError:
     semantic_cache_available = False
 
+from vllm_router.services.metrics_service import num_incoming_requests_total
 
 logger = init_logger(__name__)
 
@@ -213,6 +214,11 @@ async def route_general_request(
     request_body = replace_model_in_request_body(request_json, requested_model)
     update_content_length(request, request_body)
 
+    # Check if model has ever been seen (even if currently scaled to zero)
+    model_ever_existed = False
+    if hasattr(service_discovery, "has_ever_seen_model"):
+        model_ever_existed = service_discovery.has_ever_seen_model(requested_model)
+
     if not request_endpoint:
         endpoints = list(
             filter(
@@ -234,13 +240,27 @@ async def route_general_request(
             )
         )
 
+    # Track all valid incoming requests
+    num_incoming_requests_total.labels(model=requested_model).inc()
+
     if not endpoints:
-        return JSONResponse(
-            status_code=400,
-            content={
-                "error": f"Model {requested_model} not found or vLLM engine is sleeping."
-            },
-        )
+        if not model_ever_existed:
+            return JSONResponse(
+                status_code=404,
+                content={
+                    "error": f"Model '{requested_model}' not found. Available models can be listed at /v1/models."
+                },
+                headers={"X-Request-Id": request_id},
+            )
+        else:
+            # Model existed before but is now scaled to zero
+            return JSONResponse(
+                status_code=503,
+                content={
+                    "error": f"Model '{requested_model}' is temporarily unavailable. Please try again later."
+                },
+                headers={"X-Request-Id": request_id},
+            )
 
     logger.debug(f"Routing request {request_id} for model: {requested_model}")
     if request_endpoint:
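
The practical effect of this change is that clients can tell a typo apart from a cold start: an unknown model now returns 404, while a known model with no live backends returns a retryable 503. A hedged illustration, assuming the router is reachable on localhost:30080 as in the tutorial below:

```bash
# A model the router has never discovered -> 404
curl -s -o /dev/null -w "%{http_code}\n" \
  -X POST http://localhost:30080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "no-such-model", "prompt": "hi", "max_tokens": 1}'
# 404

# A known model whose backends are scaled to zero -> 503 (retry later)
curl -s -o /dev/null -w "%{http_code}\n" \
  -X POST http://localhost:30080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Llama-3.1-8B-Instruct", "prompt": "hi", "max_tokens": 1}'
# 503
```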

tutorials/20-keda-autoscaling.md

Lines changed: 111 additions & 6 deletions

@@ -16,11 +16,14 @@ This tutorial shows you how to automatically scale a vLLM deployment using [KEDA
 * [4. Verify Metric Export](#4-verify-metric-export)
 * [5. Configure the ScaledObject](#5-configure-the-scaledobject)
 * [6. Test Autoscaling](#6-test-autoscaling)
-* [7. Cleanup](#7-cleanup)
+* [7. Scale down to zero](#7-scale-down-to-zero)
+* [8. Cleanup](#8-cleanup)
 * [Additional Resources](#additional-resources)
 
 ---
 
+> **Note**: This tutorial only supports non-disaggregated prefill request autoscaling.
+
 ## Prerequisites
 
 * A working vLLM deployment on Kubernetes (see [01-minimal-helm-installation](01-minimal-helm-installation.md))
@@ -99,7 +102,7 @@ This means that at the given timestamp, there were 0 pending requests in the queue
 
 ### 5. Configure the ScaledObject
 
-The following `ScaledObject` configuration is provided in `tutorials/assets/values-19-keda.yaml`. Review its contents:
+The following `ScaledObject` configuration is provided in `tutorials/assets/values-20-keda.yaml`. Review its contents:
 
 ```yaml
 apiVersion: keda.sh/v1alpha1
@@ -113,7 +116,7 @@ spec:
   minReplicaCount: 1
   maxReplicaCount: 2
   pollingInterval: 15
-  cooldownPeriod: 30
+  cooldownPeriod: 360
   triggers:
     - type: prometheus
       metadata:
@@ -127,7 +130,7 @@ Apply the ScaledObject:
 
 ```bash
 cd ../tutorials
-kubectl apply -f assets/values-19-keda.yaml
+kubectl apply -f assets/values-20-keda.yaml
 ```
 
 This tells KEDA to:
@@ -172,12 +175,114 @@ Within a few minutes, the `REPLICAS` value should increase to 2.
 
 ---
 
-### 7. Cleanup
+### 7. Scale Down to Zero
+
+Sometimes you want to scale down to zero replicas when there is no traffic. This is a capability unique to KEDA compared to the Kubernetes HPA, which always maintains at least one replica. Scale-to-zero is particularly useful for:
+
+* **Cost optimization**: Eliminate resource usage during idle periods
+* **Resource efficiency**: Free up GPU resources for other workloads
+* **Cold start scenarios**: Scale up only when requests arrive
+
+We provide this capability through a dual-trigger configuration. To configure it, modify `tutorials/assets/values-20-keda.yaml`:
+
+```yaml
+# KEDA ScaledObject for vLLM deployment with scale-to-zero capability
+# This configuration enables automatic scaling of vLLM pods based on queue length metrics
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: vllm-scaledobject
+  namespace: default
+spec:
+  scaleTargetRef:
+    name: vllm-llama3-deployment-vllm
+  minReplicaCount: 0  # Allow scaling down to zero
+  maxReplicaCount: 2
+  # How often KEDA should check the metrics (in seconds)
+  pollingInterval: 15
+  # How long to wait before scaling down after scaling up (in seconds)
+  cooldownPeriod: 360
+  # Scaling triggers configuration
+  triggers:
+    # Trigger 1: Queue-based scaling
+    - type: prometheus
+      metadata:
+        # Prometheus server address within the cluster
+        serverAddress: http://prometheus-operated.monitoring.svc:9090
+        # Name of the metric to monitor
+        metricName: vllm:num_requests_waiting
+        # Prometheus query to fetch the metric
+        query: vllm:num_requests_waiting
+        # Threshold value that triggers scaling
+        # When queue length exceeds this value, KEDA will scale up
+        threshold: '5'
+    # Trigger 2: Traffic-based "keepalive" - prevents scale-to-zero when there's active traffic
+    - type: prometheus
+      metadata:
+        serverAddress: http://prometheus-operated.monitoring.svc:9090
+        metricName: vllm:incoming_keepalive
+        # This query returns 1 if there's any incoming traffic in the last minute, 0 otherwise
+        query: sum(rate(vllm:num_incoming_requests_total[1m]) > bool 0)
+        threshold: "1"
+```
+
+**How the dual-trigger system works:**
+
+1. **Queue trigger**: Scales up when `vllm:num_requests_waiting > 5`
+2. **Traffic trigger**: Prevents scale-to-zero while there is active incoming traffic (rate > 0 in the last minute)
+3. **Scale-to-zero**: Occurs only when both triggers are below their thresholds (no queue AND no traffic)
+
+Apply the updated configuration:
+
+```bash
+kubectl apply -f assets/values-20-keda.yaml
+```
+
+**Test the scale-to-zero behavior:**
+
+1. **Monitor the pods:**
+
+   ```bash
+   kubectl get pods -w
+   ```
+
+2. **Wait for scale-down:**
+   Within a few minutes, you should see the backend pod terminate, meaning KEDA has decided to scale down to zero.
+
+3. **Test scale-up from zero:**
+
+   ```bash
+   kubectl port-forward svc/vllm-router-service 30080:80
+   ```
+
+   In a separate terminal:
+
+   ```bash
+   curl -X POST http://localhost:30080/v1/completions \
+     -H "Content-Type: application/json" \
+     -d '{
+       "model": "meta-llama/Llama-3.1-8B-Instruct",
+       "prompt": "Once upon a time,",
+       "max_tokens": 10
+     }'
+   ```
+
+   You should initially get an HTTP 503 error saying the service is temporarily unavailable. Within a few minutes, however, a fresh pod should come up and the same query should succeed.
+
+**Expected behavior:**
+
+* **Scale down**: Pods terminate when there is no traffic and no queued requests
+* **Scale up**: New pods start when requests arrive, even from zero replicas
+* **Cold start delay**: The first request after scale-to-zero experiences a delay while the pod initializes
+
+---
+
+### 8. Cleanup
 
 To remove KEDA configuration and observability components:
 
 ```bash
-kubectl delete -f assets/values-19-keda.yaml
+kubectl delete -f assets/values-20-keda.yaml
 helm uninstall keda -n keda
 kubectl delete namespace keda
 ```
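
Before trusting the keepalive trigger, you can evaluate the same PromQL expression KEDA will poll. A quick sketch against the in-cluster Prometheus (service address taken from the trigger config above; adjust if your Prometheus lives elsewhere):

```bash
# Forward the Prometheus service used by the triggers, then run the
# keepalive query: it returns 1 while any traffic arrived in the last
# minute, and 0 (or an empty result) once the router has been idle.
kubectl -n monitoring port-forward svc/prometheus-operated 9090:9090 &
curl -s 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=sum(rate(vllm:num_incoming_requests_total[1m]) > bool 0)'
```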
tutorials/assets/values-20-keda.yaml

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ spec:
   # How often KEDA should check the metrics (in seconds)
   pollingInterval: 15
   # How long to wait before scaling down after scaling up (in seconds)
-  cooldownPeriod: 30
+  cooldownPeriod: 360
   # Scaling triggers configuration
   triggers:
     - type: prometheus
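
After re-applying the file, you can verify the new cooldown and inspect what KEDA provisioned. A hedged check (the `keda-hpa-` prefix is KEDA's naming convention for the HPA it generates; namespace assumed `default`):

```bash
# Confirm the ScaledObject picked up the longer cooldown period
kubectl get scaledobject vllm-scaledobject -o jsonpath='{.spec.cooldownPeriod}'

# KEDA materializes each ScaledObject as an HPA named keda-hpa-<name>;
# its events show scaling decisions as they happen
kubectl describe hpa keda-hpa-vllm-scaledobject
```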
