alibaba · jpjamespeterson · Aug 12, 2025 · Aug 12, 2025 · lingma-agents · Aug 12, 2025
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,24 @@
+name: CI
+
+on:
+  pull_request:
+  push:
+    branches: [ feat/**, main ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.22.x'
+      - name: Tidy
+        run: |
+          go mod tidy
+      - name: Unit tests
+        run: |
+          go test ./pkg/... ./test/... -run Test -v
diff --git a/README.md b/README.md
@@ -174,3 +174,14 @@ Higress would not be possible without the valuable open-source work of projects
         ↑ Back to Top ↑
     </a>
 </p>
+
+## New Enterprise Features
+
+- Multi-Cluster support (Karmada): propagate `ConfigMap` and CRDs across clusters using `ClusterPropagationPolicy`. See `docs/multicluster.md`.
+- AI Autoscaling (KEDA): scale based on LLM metrics (e.g., token usage). See `docs/autoscaling.md` and Helm `keda.*` values.
+- Gateway API surfacing: explicit gateway controller adapter in `pkg/gateway` for reconciliation wiring.
+- Enhanced Observability: Prometheus metrics for AI token usage and latency; OpenTelemetry tracing configuration. See `docs/observability.md`.
+- Multi-Tenancy: namespace-based isolation wired into the gateway controller, configurable via the `controller.tenantNamespaces` Helm value. See `docs/tenancy.md`.
+- Batch AI Workloads (Volcano): submit batch jobs via Volcano. See `docs/volcano.md`.
+
+Each feature is toggleable via Helm values and implemented modularly under `pkg/`.
diff --git a/docs/autoscaling.md b/docs/autoscaling.md
@@ -0,0 +1,14 @@
+# Autoscaling for AI Workloads (KEDA)
+
+Enable KEDA-driven autoscaling from Prometheus metrics:
+
+```bash
+helm upgrade --install higress ./helm/core \
+  --set keda.enabled=true \
+  --set keda.metric.name=higress_ai_token_usage_total \
+  --set keda.metric.threshold=100
+```
+
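+At runtime, the ai-proxy plugin emits metrics which can be scraped by Prometheus and used by a KEDA `ScaledObject` as its scaling trigger. KEDA queries the Prometheus server configured in `keda.prometheus.serverAddress` (default: `http://prometheus-server.monitoring.svc.cluster.local`).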
+
+To manage ScaledObjects programmatically, build with `-tags keda` and use `pkg/autoscaler.NewKedaScaler`.
diff --git a/docs/multicluster.md b/docs/multicluster.md
@@ -0,0 +1,28 @@
+# Multi-Cluster with Karmada
+
+This guide shows how to enable multi-cluster configuration sync for Higress using Karmada.
+
+## Prerequisites
+- A running Karmada control plane and at least one member cluster. See `https://karmada.io/docs`.
+- Higress installed via Helm.
+
+## Enable Helm options
+
+Set the following values:
+
+```bash
+helm upgrade --install higress ./helm/core \
+  --set karmada.enabled=true
+```
+
+This installs a ClusterPropagationPolicy that propagates the Higress ConfigMap to all clusters.
+
+## Programmatic usage
+
+Build Higress with Karmada integration to use the `pkg/karmada` package:
+
+```bash
+CGO_ENABLED=0 go build -tags karmada ./...
+```
+
+Then use `karmada.NewKarmadaSync(client)` and call `SyncConfigMap` / `SyncCRD`.
diff --git a/docs/observability.md b/docs/observability.md
@@ -0,0 +1,24 @@
+# Observability for AI Workloads
+
+Higress can export AI-specific metrics and traces.
+
+## Metrics (Prometheus)
+- Counter: `higress_ai_token_usage_total`
+- Histogram: `higress_ai_model_latency_milliseconds`
+
+Enable Prometheus scraping with Helm:
+
+```bash
+helm upgrade --install higress ./helm/core \
+  --set observability.prometheus.enabled=true
+```
+
+## Tracing (OpenTelemetry)
+Enable the built-in OpenTelemetry Collector via Helm:
+
+```bash
+helm upgrade --install higress ./helm/core \
+  --set observability.otelCollector.enabled=true
+```
+
+Programmatically, initialize tracing via `pkg/observability.SetupTracing`.
diff --git a/docs/tenancy.md b/docs/tenancy.md
@@ -0,0 +1,7 @@
+# Multi-Tenancy
+
+Higress can isolate resources by Kubernetes namespace. Use `pkg/tenancy.TenantManager` to validate and check namespace access.
+
+RBAC: ensure each tenant has access only to the resources in its own namespace.
+
+Future work: integrate filters in the gateway reconcilers to enforce tenant scoping.
diff --git a/docs/volcano.md b/docs/volcano.md
@@ -0,0 +1,10 @@
+# Volcano Integration for Batch AI Workloads
+
+Enable the Volcano sample job with Helm:
+
+```bash
+helm upgrade --install higress ./helm/core \
+  --set volcano.enabled=true
+```
+
+To programmatically submit jobs from code, build with `-tags volcano` and use `pkg/volcano.NewVolcanoScheduler`.
diff --git a/go.mod b/go.mod
@@ -57,6 +57,11 @@ require (
 	sigs.k8s.io/gateway-api v0.8.0
 	sigs.k8s.io/structured-merge-diff/v4 v4.3.0
 	sigs.k8s.io/yaml v1.4.0
+	// Added for Observability
+	github.com/prometheus/client_golang v1.17.0
+	go.opentelemetry.io/otel v1.26.0
+	go.opentelemetry.io/otel/sdk v1.26.0
+	go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.26.0
 )
 
 require (
@@ -191,7 +196,6 @@ require (
 	github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
-	github.com/prometheus/client_golang v1.17.0 // indirect
 	github.com/prometheus/client_model v0.5.0 // indirect
 	github.com/prometheus/common v0.45.0 // indirect
 	github.com/prometheus/procfs v0.12.0 // indirect
@@ -213,10 +217,8 @@ require (
 	github.com/yl2chen/cidranger v1.0.2 // indirect
 	github.com/zeebo/blake3 v0.2.3 // indirect
 	go.opencensus.io v0.24.0 // indirect
-	go.opentelemetry.io/otel v1.17.0 // indirect
 	go.opentelemetry.io/otel/exporters/prometheus v0.39.1-0.20230714155235-03b8c47770f2 // indirect
 	go.opentelemetry.io/otel/metric v1.17.0 // indirect
-	go.opentelemetry.io/otel/sdk v1.16.0 // indirect
 	go.opentelemetry.io/otel/sdk/metric v0.39.0 // indirect
 	go.opentelemetry.io/otel/trace v1.17.0 // indirect
 	go.opentelemetry.io/proto/otlp v1.0.0 // indirect

diff --git a/helm/core/templates/controller-deployment.yaml b/helm/core/templates/controller-deployment.yaml
@@ -65,6 +65,10 @@ spec:
               fieldRef:
                 apiVersion: v1
                 fieldPath: metadata.namespace
+          {{- if .Values.controller.tenantNamespaces }}
+          - name: HIGRESS_TENANT_NAMESPACES
+            value: {{ join "," .Values.controller.tenantNamespaces | quote }}
+          {{- end }}
           - name: SERVICE_ACCOUNT
             valueFrom:
               fieldRef:

diff --git a/helm/core/templates/karmada-cpp.yaml b/helm/core/templates/karmada-cpp.yaml
@@ -0,0 +1,14 @@
+{{- if .Values.karmada.enabled }}
+apiVersion: policy.karmada.io/v1alpha1
+kind: ClusterPropagationPolicy
+metadata:
+  name: higress-configmaps
+spec:
+  resourceSelectors:
+    - apiVersion: v1
+      kind: ConfigMap
+      name: higress-config
+      namespace: {{ .Release.Namespace }}
+  placement:
+    clusterAffinity: {}
+{{- end }}
diff --git a/helm/core/templates/keda-scaledobject.yaml b/helm/core/templates/keda-scaledobject.yaml
@@ -0,0 +1,17 @@
+{{- if .Values.keda.enabled }}
+# ScaledObject that lets KEDA scale the controller Deployment from a Prometheus query.
+# KEDA's prometheus trigger requires `query`. It defaults to the summed per-second
+# rate of keda.metric.name and can be overridden with keda.metric.query.
+apiVersion: keda.sh/v1alpha1
+kind: ScaledObject
+metadata:
+  name: {{ include "controller.name" . }}-scaledobject
+  namespace: {{ .Release.Namespace }}
+spec:
+  scaleTargetRef:
+    kind: Deployment
+    name: {{ include "controller.name" . }}
+  triggers:
+    - type: prometheus
+      metadata:
+        serverAddress: {{ .Values.keda.prometheus.serverAddress | quote }}
+        metricName: {{ .Values.keda.metric.name | quote }}
+        query: {{ .Values.keda.metric.query | default (printf "sum(rate(%s[2m]))" .Values.keda.metric.name) | quote }}
+        threshold: {{ .Values.keda.metric.threshold | quote }}
+{{- end }}
diff --git a/helm/core/templates/otel-collector.yaml b/helm/core/templates/otel-collector.yaml
@@ -0,0 +1,47 @@
+{{- if .Values.observability.otelCollector.enabled }}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "higress.fullname" . }}-otel-collector
+  namespace: {{ .Release.Namespace }}
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: {{ include "higress.fullname" . }}-otel-collector
+  template:
+    metadata:
+      labels:
+        app: {{ include "higress.fullname" . }}-otel-collector
+    spec:
+      containers:
+        - name: otel-collector
+          image: {{ .Values.observability.otelCollector.image | default "otel/opentelemetry-collector:0.102.1" | quote }}
+          args: ["--config=/etc/otel/config.yaml"]
+          volumeMounts:
+            - name: config
+              mountPath: /etc/otel
+      volumes:
+        - name: config
+          configMap:
+            name: {{ include "higress.fullname" . }}-otel-config
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "higress.fullname" . }}-otel-config
+  namespace: {{ .Release.Namespace }}
+data:
+  config.yaml: |
+    receivers:
+      otlp:
+        protocols:
+          http:
+    exporters:
+      logging: {}
+    service:
+      pipelines:
+        traces:
+          receivers: [otlp]
+          exporters: [logging]
+{{- end }}
diff --git a/helm/core/templates/prometheus.yaml b/helm/core/templates/prometheus.yaml
@@ -0,0 +1,17 @@
+{{- if .Values.observability.prometheus.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "higress.fullname" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    release: prometheus
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: {{ include "higress.name" . }}
+  endpoints:
+    - port: metrics
+      path: /metrics
+      interval: 30s
+{{- end }}
diff --git a/helm/core/templates/volcano-job.yaml b/helm/core/templates/volcano-job.yaml
@@ -0,0 +1,19 @@
+{{- if .Values.volcano.enabled }}
+apiVersion: batch.volcano.sh/v1alpha1
+kind: Job
+metadata:
+  name: {{ include "higress.fullname" . }}-sample-batch
+  namespace: {{ .Release.Namespace }}
+spec:
+  minAvailable: 1
+  tasks:
+    - replicas: 1
+      name: infer
+      template:
+        spec:
+          restartPolicy: Never
+          containers:
+            - name: infer
+              image: {{ .Values.volcano.image | default "busybox" | quote }}
+              command: ["/bin/sh", "-c", "echo sample batch job"]
+{{- end }}
diff --git a/helm/core/values.yaml b/helm/core/values.yaml
@@ -551,6 +551,7 @@ controller:
   hub: higress-registry.cn-hangzhou.cr.aliyuncs.com/higress
   tag: ""
   env: {}
+  tenantNamespaces: []
 
   labels: {}
 
@@ -795,3 +796,25 @@ pluginServer:
     limits:
       cpu: 500m
       memory: 256Mi
+
+karmada:
+  enabled: false
+
+keda:
+  enabled: false
+  prometheus:
+    serverAddress: http://prometheus-server.monitoring.svc.cluster.local
+  metric:
+    name: higress_ai_token_usage_total
+    threshold: "100"
+
+observability:
+  prometheus:
+    enabled: false
+  otelCollector:
+    enabled: false
+    image: otel/opentelemetry-collector:0.102.1
+
+volcano:
+  enabled: false
+  image: busybox
@@ -0,0 +1,58 @@
+//go:build keda
+
+package autoscaler
+
+import (
+	context "context"
+	"fmt"
+
+	kedav1alpha1 "github.com/kedacore/keda/v2/apis/keda/v1alpha1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// KedaScaler manages the KEDA ScaledObject that targets a single Deployment.
+type KedaScaler struct {
+	// Client is the controller-runtime client used to read and write ScaledObjects.
+	Client client.Client
+	// Namespace is the namespace in which the ScaledObject is looked up and created.
+	Namespace string
+	// Deployment is the name of the Deployment referenced as the scale target.
+	Deployment string
+}
+
+// NewKedaScaler returns a KedaScaler that uses c to manage the ScaledObject
+// for the named deployment in namespace.
+func NewKedaScaler(c client.Client, namespace, deployment string) *KedaScaler {
+	scaler := new(KedaScaler)
+	scaler.Client = c
+	scaler.Namespace = namespace
+	scaler.Deployment = deployment
+	return scaler
+}
+
+// ScaleByLLMMetrics creates or updates the ScaledObject named
+// "higress-<Deployment>-scaledobject" so that it targets the Deployment with a
+// Prometheus trigger on metricName, using targetValue as the threshold.
+//
+// It is a no-op that returns nil when the receiver or its Client is nil, or
+// when metricName is empty. If the ScaledObject already exists its Spec is
+// overwritten; if it does not exist it is created. Any other error from the
+// API calls is returned, wrapped with the operation that failed.
+func (s *KedaScaler) ScaleByLLMMetrics(metricName string, targetValue int) error {
+	if s == nil || s.Client == nil || metricName == "" {
+		return nil
+	}
+	ctx := context.TODO()
+	desired := &kedav1alpha1.ScaledObject{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "higress-" + s.Deployment + "-scaledobject",
+			Namespace: s.Namespace,
+		},
+		Spec: kedav1alpha1.ScaledObjectSpec{
+			ScaleTargetRef: &kedav1alpha1.ScaleTarget{
+				Kind: "Deployment",
+				Name: s.Deployment,
+			},
+			Triggers: []kedav1alpha1.ScaleTriggers{
+				{
+					Type: "prometheus",
+					// NOTE(review): KEDA's Prometheus scaler documentation lists
+					// "query" as a required field; only metricName is set here.
+					// Confirm the trigger is accepted by the deployed KEDA version.
+					Metadata: map[string]string{
+						"serverAddress": "http://prometheus-server.monitoring.svc.cluster.local",
+						"metricName":    metricName,
+						"threshold":     fmt.Sprintf("%d", targetValue),
+					},
+				},
+			},
+		},
+	}
+	key := client.ObjectKey{Name: desired.Name, Namespace: s.Namespace}
+
+	var existing kedav1alpha1.ScaledObject
+	err := s.Client.Get(ctx, key, &existing)
+	switch {
+	case err == nil:
+		// The object exists: replace only the Spec so metadata set by the API
+		// server (such as resourceVersion) is carried over from the fetched copy.
+		existing.Spec = desired.Spec
+		if err := s.Client.Update(ctx, &existing); err != nil {
+			return fmt.Errorf("updating ScaledObject %s: %w", key, err)
+		}
+		return nil
+	case client.IgnoreNotFound(err) != nil:
+		// Only a NotFound result means "create it"; surfacing every other
+		// lookup failure avoids masking it behind a misleading Create error.
+		return fmt.Errorf("getting ScaledObject %s: %w", key, err)
+	}
+
+	if err := s.Client.Create(ctx, desired); err != nil {
+		return fmt.Errorf("creating ScaledObject %s: %w", key, err)
+	}
+	return nil
+}