feat(chatterbox): support multilingual, MPS, and CPU

mudler · mudler · commit ee5c33560065 · 2025-09-09T16:57:18.000+02:00
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
@@ -111,6 +111,18 @@ jobs:
             backend: "diffusers"
             dockerfile: "./backend/Dockerfile.python"
             context: "./backend"
+          - build-type: ''
+            cuda-major-version: ""
+            cuda-minor-version: ""
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-cpu-chatterbox'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'true'
+            backend: "chatterbox"
+            dockerfile: "./backend/Dockerfile.python"
+            context: "./backend"
           # CUDA 11 additional backends
           - build-type: 'cublas'
             cuda-major-version: "11"
@@ -968,6 +980,9 @@ jobs:
           - backend: "mlx"
             tag-suffix: "-metal-darwin-arm64-mlx"
             build-type: "mps"
+          - backend: "chatterbox"
+            tag-suffix: "-metal-darwin-arm64-chatterbox"
+            build-type: "mps"
           - backend: "mlx-vlm"
             tag-suffix: "-metal-darwin-arm64-mlx-vlm"
             build-type: "mps"
diff --git a/Makefile b/Makefile
@@ -369,6 +369,9 @@ backends/kitten-tts: docker-build-kitten-tts docker-save-kitten-tts build
 backends/kokoro: docker-build-kokoro docker-save-kokoro build
 	./local-ai backends install "ocifile://$(abspath ./backend-images/kokoro.tar)"
 
+backends/chatterbox: docker-build-chatterbox docker-save-chatterbox build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/chatterbox.tar)"
+
 backends/llama-cpp-darwin: build
 	bash ./scripts/build/llama-cpp-darwin.sh
 	./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)"
@@ -493,7 +496,7 @@ docker-build-bark:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:bark -f backend/Dockerfile.python --build-arg BACKEND=bark .
 
 docker-build-chatterbox:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:chatterbox -f backend/Dockerfile.python --build-arg BACKEND=chatterbox .
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:chatterbox -f backend/Dockerfile.python --build-arg BACKEND=chatterbox ./backend
 
 docker-build-exllama2:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:exllama2 -f backend/Dockerfile.python --build-arg BACKEND=exllama2 .
diff --git a/backend/index.yaml b/backend/index.yaml
@@ -350,6 +350,8 @@
   alias: "chatterbox"
   capabilities:
     nvidia: "cuda12-chatterbox"
+    metal: "metal-chatterbox"
+    default: "cpu-chatterbox"
 - &piper
   name: "piper"
   uri: "quay.io/go-skynet/local-ai-backends:latest-piper"
@@ -1223,6 +1225,28 @@
   name: "chatterbox-development"
   capabilities:
     nvidia: "cuda12-chatterbox-development"
+    metal: "metal-chatterbox-development"
+    default: "cpu-chatterbox-development"
+- !!merge <<: *chatterbox
+  name: "cpu-chatterbox"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-chatterbox"
+  mirrors:
+    - localai/localai-backends:latest-cpu-chatterbox
+- !!merge <<: *chatterbox
+  name: "cpu-chatterbox-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-chatterbox"
+  mirrors:
+    - localai/localai-backends:master-cpu-chatterbox
+- !!merge <<: *chatterbox
+  name: "metal-chatterbox"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-chatterbox"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-chatterbox
+- !!merge <<: *chatterbox
+  name: "metal-chatterbox-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-chatterbox"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-chatterbox
 - !!merge <<: *chatterbox
   name: "cuda12-chatterbox-development"
   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-chatterbox"
diff --git a/backend/python/chatterbox/backend.py b/backend/python/chatterbox/backend.py
@@ -14,9 +14,15 @@
 import torch
 import torchaudio as ta
 from chatterbox.tts import ChatterboxTTS
-
+from chatterbox.mtl_tts import ChatterboxMultilingualTTS
 import grpc
 
+def is_float(s):  # report whether float() can parse s
+    try:
+        float(s)
+    except ValueError:
+        return False
+    return True
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
@@ -47,6 +53,27 @@ def LoadModel(self, request, context):
         if not torch.cuda.is_available() and request.CUDA:
             return backend_pb2.Result(success=False, message="CUDA is not available")
 
+
+        options = request.Options
+
+        # empty dict
+        self.options = {}
+
+        # Options arrive as "optname:optvalue" strings. Collect them in a dict
+        # so TTS() can forward them to the model when generating audio.
+        for opt in options:
+            # entries without a separator carry no value and are ignored
+            if ":" not in opt:
+                continue
+            # split on the first ":" only, so the value itself may contain colons
+            key, value = opt.split(":", 1)
+            # numeric values become int when integral, float otherwise
+            if is_float(value):
+                # str has no is_integer(); parse to float before testing it
+                number = float(value)
+                value = int(number) if number.is_integer() else number
+            self.options[key] = value
+
         self.AudioPath = None
 
         if os.path.isabs(request.AudioPath):
@@ -56,10 +83,14 @@ def LoadModel(self, request, context):
             modelFileBase = os.path.dirname(request.ModelFile)
             # modify LoraAdapter to be relative to modelFileBase
             self.AudioPath = os.path.join(modelFileBase, request.AudioPath)
-
         try:
             print("Preparing models, please wait", file=sys.stderr)
-            self.model = ChatterboxTTS.from_pretrained(device=device)
+            # pop the flag so TTS() does not forward it to generate() as a keyword
+            multilingual = self.options.pop("multilingual", None)
+            if multilingual is not None:
+                self.model = ChatterboxMultilingualTTS.from_pretrained(device=device)
+            else:
+                self.model = ChatterboxTTS.from_pretrained(device=device)
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
         # Implement your logic here for the LoadModel service
@@ -68,12 +99,18 @@ def LoadModel(self, request, context):
 
     def TTS(self, request, context):
         try:
-            # Generate audio using ChatterboxTTS
+            kwargs = {}
+
+            if "language" in self.options:
+                kwargs["language_id"] = self.options["language"]
             if self.AudioPath is not None:
-                wav = self.model.generate(request.text, audio_prompt_path=self.AudioPath)
-            else:
-                wav = self.model.generate(request.text)
-            
+                kwargs["audio_prompt_path"] = self.AudioPath
+
+            # Forward the remaining options; "language" was already mapped to
+            # language_id above. NOTE(review): assumes generate() rejects an
+            # extra "language" keyword -- confirm against the chatterbox API.
+            kwargs.update({k: v for k, v in self.options.items() if k != "language"})
+            wav = self.model.generate(request.text, **kwargs)
             # Save the generated audio
             ta.save(request.dst, wav, self.model.sr)
             
diff --git a/backend/python/chatterbox/requirements-cpu.txt b/backend/python/chatterbox/requirements-cpu.txt
@@ -1,3 +1,4 @@
+--extra-index-url https://download.pytorch.org/whl/cpu
 accelerate
 torch==2.6.0
 torchaudio==2.6.0
diff --git a/backend/python/chatterbox/requirements.txt b/backend/python/chatterbox/requirements.txt
@@ -2,4 +2,5 @@ grpcio==1.71.0
 protobuf
 certifi
 packaging
-setuptools
+setuptools
+numpy>=1.24.0,<1.26.0

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+--extra-index-url https://download.pytorch.org/whl/cpu`
`1`	`2`	`accelerate`
`2`	`3`	`torch==2.6.0`
`3`	`4`	`torchaudio==2.6.0`