
Commit 55ae3da

[Feat] Add support for scaling down to zero in KEDA (#679)
* feat: keda scale down to zero
* formatting
* Change gauge to counter
* Add note

Signed-off-by: Xiangfeng Zhu <[email protected]>
Co-authored-by: Yuhan Liu <[email protected]>
1 parent f2d3d71 commit 55ae3da

File tree: 6 files changed, +172 -15 lines changed


observability/prom-adapter.yaml

Lines changed: 11 additions & 0 deletions

@@ -18,3 +18,14 @@ rules:
     matches: ""
     as: "vllm_num_requests_waiting"
   metricsQuery: sum by(namespace) (vllm:num_requests_waiting)
+
+# Export num_incoming_requests_total by model name
+- seriesQuery: '{__name__=~"^vllm:num_incoming_requests_total$"}'
+  resources:
+    overrides:
+      namespace:
+        resource: "namespace"
+  name:
+    matches: ""
+    as: "vllm_num_incoming_requests_total"
+  metricsQuery: sum by(namespace, model) (vllm:num_incoming_requests_total)
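
Once prometheus-adapter reloads this rule, the renamed series should be queryable through the Kubernetes custom metrics API. A minimal sketch of how to check, assuming the standard prometheus-adapter setup and the `default` namespace (the namespace-scoped path shown is the adapter's usual convention; adjust names for your cluster):

```bash
# List the exported metric via the custom metrics API.
# An empty "items" list usually means the adapter has not matched
# the vllm:num_incoming_requests_total series yet.
kubectl get --raw \
  "/apis/custom.metrics.k8s.io/v1beta1/namespaces/default/metrics/vllm_num_incoming_requests_total" \
  | jq .
```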

src/vllm_router/service_discovery.py

Lines changed: 17 additions & 1 deletion

@@ -20,7 +20,7 @@
 import time
 import uuid
 from dataclasses import dataclass
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Set
 
 import aiohttp
 import requests
@@ -372,6 +372,8 @@ def __init__(
         self.port = port
         self.available_engines: Dict[str, EndpointInfo] = {}
         self.available_engines_lock = threading.Lock()
+        self.known_models: Set[str] = set()
+        self.known_models_lock = threading.Lock()
         self.label_selector = label_selector
         self.watcher_timeout_seconds = watcher_timeout_seconds
         self.health_check_timeout_seconds = health_check_timeout_seconds
@@ -662,6 +664,10 @@ def _add_engine(
         # Store model information in the endpoint info
         self.available_engines[engine_name].model_info = model_info
 
+        # Track all models we've ever seen
+        with self.known_models_lock:
+            self.known_models.update(model_names)
+
     def _delete_engine(self, engine_name: str):
         logger.info(f"Serving engine {engine_name} is deleted")
         with self.available_engines_lock:
@@ -758,6 +764,16 @@ async def initialize_client_sessions(self) -> None:
                 timeout=aiohttp.ClientTimeout(total=None),
             )
 
+    def has_ever_seen_model(self, model_name: str) -> bool:
+        """Check if we've ever seen this model, even if currently scaled to zero."""
+        with self.known_models_lock:
+            return model_name in self.known_models
+
+    def get_known_models(self) -> Set[str]:
+        """Get all models that have ever been discovered."""
+        with self.known_models_lock:
+            return self.known_models.copy()
+
 
 class K8sServiceNameServiceDiscovery(ServiceDiscovery):
     def __init__(
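
Note that `_delete_engine` removes the endpoint from `available_engines` but never touches `known_models`: the set is deliberately append-only, so the router can later distinguish a model that never existed from one that is merely scaled to zero. A standalone sketch of the same pattern (class and names here are illustrative, not the router's API):

```python
import threading
from typing import Set


class ModelRegistry:
    """Append-only, thread-safe record of every model ever discovered."""

    def __init__(self) -> None:
        self._known: Set[str] = set()
        self._lock = threading.Lock()

    def observe(self, model_name: str) -> None:
        # Called whenever an engine advertises a model; never undone,
        # so a model scaled down to zero replicas still counts as "known".
        with self._lock:
            self._known.add(model_name)

    def has_ever_seen(self, model_name: str) -> bool:
        with self._lock:
            return model_name in self._known


registry = ModelRegistry()
registry.observe("meta-llama/Llama-3.1-8B-Instruct")
assert registry.has_ever_seen("meta-llama/Llama-3.1-8B-Instruct")
assert not registry.has_ever_seen("unknown-model")
```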

src/vllm_router/services/metrics_service/__init__.py

Lines changed: 6 additions & 1 deletion

@@ -1,4 +1,4 @@
-from prometheus_client import Gauge
+from prometheus_client import Counter, Gauge
 
 # --- Prometheus Gauges ---
 # Existing metrics
@@ -33,6 +33,11 @@
 num_decoding_requests = Gauge(
     "vllm:num_decoding_requests", "Number of Decoding Requests", ["server"]
 )
+num_incoming_requests_total = Counter(
+    "vllm:num_incoming_requests",
+    "Total valid incoming requests to router (including when no backends available).",
+    ["model"],
+)
 
 # New metrics per dashboard update
 healthy_pods_total = Gauge(
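
Note the apparent mismatch between the Python name (`vllm:num_incoming_requests`) and the PromQL queries elsewhere in this commit (`vllm:num_incoming_requests_total`): `prometheus_client` appends the `_total` suffix to Counter samples at exposition time. A quick standalone sketch to confirm (not router code):

```python
from prometheus_client import CollectorRegistry, Counter, generate_latest

registry = CollectorRegistry()
requests_total = Counter(
    "vllm:num_incoming_requests",
    "Total valid incoming requests.",
    ["model"],
    registry=registry,
)
requests_total.labels(model="demo-model").inc()

# The scrape output carries the _total suffix that the PromQL queries use.
print(generate_latest(registry).decode())
# ... vllm:num_incoming_requests_total{model="demo-model"} 1.0
```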

src/vllm_router/services/request_service/request.py

Lines changed: 26 additions & 6 deletions

@@ -47,6 +47,7 @@
 except ImportError:
     semantic_cache_available = False
 
+from vllm_router.services.metrics_service import num_incoming_requests_total
 
 logger = init_logger(__name__)
 
@@ -213,6 +214,11 @@ async def route_general_request(
     request_body = replace_model_in_request_body(request_json, requested_model)
     update_content_length(request, request_body)
 
+    # Check if model has ever been seen (even if currently scaled to zero)
+    model_ever_existed = False
+    if hasattr(service_discovery, "has_ever_seen_model"):
+        model_ever_existed = service_discovery.has_ever_seen_model(requested_model)
+
     if not request_endpoint:
         endpoints = list(
             filter(
@@ -234,13 +240,27 @@ async def route_general_request(
             )
         )
 
+    # Track all valid incoming requests
+    num_incoming_requests_total.labels(model=requested_model).inc()
+
     if not endpoints:
-        return JSONResponse(
-            status_code=400,
-            content={
-                "error": f"Model {requested_model} not found or vLLM engine is sleeping."
-            },
-        )
+        if not model_ever_existed:
+            return JSONResponse(
+                status_code=404,
+                content={
+                    "error": f"Model '{requested_model}' not found. Available models can be listed at /v1/models."
+                },
+                headers={"X-Request-Id": request_id},
+            )
+        else:
+            # Model existed before but is now scaled to zero
+            return JSONResponse(
+                status_code=503,
+                content={
+                    "error": f"Model '{requested_model}' is temporarily unavailable. Please try again later."
+                },
+                headers={"X-Request-Id": request_id},
+            )
 
     logger.debug(f"Routing request {request_id} for model: {requested_model}")
     if request_endpoint:
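
The practical effect of this change is that clients can tell a typo apart from a cold start: an unknown model now returns 404, while a known model with no live backends returns a retryable 503. A hedged illustration, assuming the router is reachable on localhost:30080 as in the tutorial below:

```bash
# A model the router has never discovered -> 404
curl -s -o /dev/null -w "%{http_code}\n" \
  -X POST http://localhost:30080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "no-such-model", "prompt": "hi", "max_tokens": 1}'
# 404

# A known model whose backends are scaled to zero -> 503 (retry later)
curl -s -o /dev/null -w "%{http_code}\n" \
  -X POST http://localhost:30080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Llama-3.1-8B-Instruct", "prompt": "hi", "max_tokens": 1}'
# 503
```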

tutorials/20-keda-autoscaling.md

Lines changed: 111 additions & 6 deletions

@@ -16,11 +16,14 @@ This tutorial shows you how to automatically scale a vLLM deployment using [KEDA
 * [4. Verify Metric Export](#4-verify-metric-export)
 * [5. Configure the ScaledObject](#5-configure-the-scaledobject)
 * [6. Test Autoscaling](#6-test-autoscaling)
-* [7. Cleanup](#7-cleanup)
+* [7. Scale down to zero](#7-scale-down-to-zero)
+* [8. Cleanup](#8-cleanup)
 * [Additional Resources](#additional-resources)
 
 ---
 
+> **Note**: This tutorial only supports non-disaggregated prefill request autoscaling.
+
 ## Prerequisites
 
 * A working vLLM deployment on Kubernetes (see [01-minimal-helm-installation](01-minimal-helm-installation.md))
@@ -99,7 +102,7 @@ This means that at the given timestamp, there were 0 pending requests in the queue
 
 ### 5. Configure the ScaledObject
 
-The following `ScaledObject` configuration is provided in `tutorials/assets/values-19-keda.yaml`. Review its contents:
+The following `ScaledObject` configuration is provided in `tutorials/assets/values-20-keda.yaml`. Review its contents:
 
 ```yaml
 apiVersion: keda.sh/v1alpha1
@@ -113,7 +116,7 @@ spec:
   minReplicaCount: 1
   maxReplicaCount: 2
   pollingInterval: 15
-  cooldownPeriod: 30
+  cooldownPeriod: 360
   triggers:
     - type: prometheus
       metadata:
@@ -127,7 +130,7 @@ Apply the ScaledObject:
 
 ```bash
 cd ../tutorials
-kubectl apply -f assets/values-19-keda.yaml
+kubectl apply -f assets/values-20-keda.yaml
 ```
 
 This tells KEDA to:
@@ -172,12 +175,114 @@ Within a few minutes, the `REPLICAS` value should increase to 2.
 
 ---
 
-### 7. Cleanup
+### 7. Scale Down to Zero
+
+Sometimes you want to scale down to zero replicas when there is no traffic. This is a capability unique to KEDA compared to the Kubernetes HPA, which always maintains at least one replica. Scale-to-zero is particularly useful for:
+
+* **Cost optimization**: Eliminate resource usage during idle periods
+* **Resource efficiency**: Free up GPU resources for other workloads
+* **Cold start scenarios**: Scale up only when requests arrive
+
+We provide this capability through a dual-trigger configuration. To configure it, modify `tutorials/assets/values-20-keda.yaml`:
+
+```yaml
+# KEDA ScaledObject for vLLM deployment with scale-to-zero capability
+# This configuration enables automatic scaling of vLLM pods based on queue length metrics
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: vllm-scaledobject
+  namespace: default
+spec:
+  scaleTargetRef:
+    name: vllm-llama3-deployment-vllm
+  minReplicaCount: 0  # Allow scaling down to zero
+  maxReplicaCount: 2
+  # How often KEDA should check the metrics (in seconds)
+  pollingInterval: 15
+  # How long to wait before scaling down after scaling up (in seconds)
+  cooldownPeriod: 360
+  # Scaling triggers configuration
+  triggers:
+    # Trigger 1: Queue-based scaling
+    - type: prometheus
+      metadata:
+        # Prometheus server address within the cluster
+        serverAddress: http://prometheus-operated.monitoring.svc:9090
+        # Name of the metric to monitor
+        metricName: vllm:num_requests_waiting
+        # Prometheus query to fetch the metric
+        query: vllm:num_requests_waiting
+        # Threshold value that triggers scaling
+        # When queue length exceeds this value, KEDA will scale up
+        threshold: '5'
+    # Trigger 2: Traffic-based "keepalive" - prevents scale-to-zero when there's active traffic
+    - type: prometheus
+      metadata:
+        serverAddress: http://prometheus-operated.monitoring.svc:9090
+        metricName: vllm:incoming_keepalive
+        # This query returns 1 if there's any incoming traffic in the last minute, 0 otherwise
+        query: sum(rate(vllm:num_incoming_requests_total[1m]) > bool 0)
+        threshold: "1"
+```
+
+**How the dual-trigger system works:**
+
+1. **Queue trigger**: Scales up when `vllm:num_requests_waiting > 5`
+2. **Traffic trigger**: Prevents scale-to-zero while there is active incoming traffic (rate > 0 in the last minute)
+3. **Scale-to-zero**: Occurs only when both triggers are below their thresholds (no queue AND no traffic)
+
+Apply the updated configuration:
+
+```bash
+kubectl apply -f assets/values-20-keda.yaml
+```
+
+**Test the scale-to-zero behavior:**
+
+1. **Monitor the pods:**
+
+   ```bash
+   kubectl get pods -w
+   ```
+
+2. **Wait for scale-down:**
+   Within a few minutes, you should see the backend pod terminate, meaning KEDA has decided to scale down to zero.
+
+3. **Test scale-up from zero:**
+
+   ```bash
+   kubectl port-forward svc/vllm-router-service 30080:80
+   ```
+
+   In a separate terminal:
+
+   ```bash
+   curl -X POST http://localhost:30080/v1/completions \
+     -H "Content-Type: application/json" \
+     -d '{
+       "model": "meta-llama/Llama-3.1-8B-Instruct",
+       "prompt": "Once upon a time,",
+       "max_tokens": 10
+     }'
+   ```
+
+   You should initially get an HTTP 503 error saying the service is temporarily unavailable. Within a few minutes, however, a fresh pod should come up and the same query should succeed.
+
+**Expected behavior:**
+
+* **Scale down**: Pods terminate when there is no traffic and no queued requests
+* **Scale up**: New pods start when requests arrive, even from zero replicas
+* **Cold start delay**: The first request after scale-to-zero experiences a delay while the pod initializes
+
+---
+
+### 8. Cleanup
 
 To remove KEDA configuration and observability components:
 
 ```bash
-kubectl delete -f assets/values-19-keda.yaml
+kubectl delete -f assets/values-20-keda.yaml
 helm uninstall keda -n keda
 kubectl delete namespace keda
 ```
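
Before trusting the keepalive trigger, you can evaluate the same PromQL expression KEDA will poll. A quick sketch against the in-cluster Prometheus (service address taken from the trigger config above; adjust if your Prometheus lives elsewhere):

```bash
# Forward the Prometheus service used by the triggers, then run the
# keepalive query: it returns 1 while any traffic arrived in the last
# minute, and 0 (or an empty result) once the router has been idle.
kubectl -n monitoring port-forward svc/prometheus-operated 9090:9090 &
curl -s 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=sum(rate(vllm:num_incoming_requests_total[1m]) > bool 0)'
```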
tutorials/assets/values-20-keda.yaml

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ spec:
   # How often KEDA should check the metrics (in seconds)
   pollingInterval: 15
   # How long to wait before scaling down after scaling up (in seconds)
-  cooldownPeriod: 30
+  cooldownPeriod: 360
   # Scaling triggers configuration
   triggers:
     - type: prometheus
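
After re-applying the file, you can verify the new cooldown and inspect what KEDA provisioned. A hedged check (the `keda-hpa-` prefix is KEDA's naming convention for the HPA it generates; namespace assumed `default`):

```bash
# Confirm the ScaledObject picked up the longer cooldown period
kubectl get scaledobject vllm-scaledobject -o jsonpath='{.spec.cooldownPeriod}'

# KEDA materializes each ScaledObject as an HPA named keda-hpa-<name>;
# its events show scaling decisions as they happen
kubectl describe hpa keda-hpa-vllm-scaledobject
```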
