FluidInference · Alex-Wengg · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026
diff --git a/Documentation/TTS/PocketTTS.md b/Documentation/TTS/PocketTTS.md
@@ -145,6 +145,29 @@ fluidaudio tts "Hello world" --backend pocket --voice-file my_voice.bin
 - The `mimi_encoder.mlmodelc` model is downloaded automatically on first use
 - Supports any audio format that AVFoundation can read
 
+### Cloning Across Languages
+
+The Mimi encoder is language-agnostic — voice cloning produces a generic
+acoustic embedding that any language pack's `cond_step` model can consume.
+You can:
+
+- Clone a voice once and reuse the same `PocketTtsVoiceData` across managers
+  configured with different languages.
+- Clone a voice with a Spanish-only manager without pulling in the English
+  language pack — only the encoder subtree is downloaded.
+
+```swift
+// Clone with a Spanish manager
+let esManager = PocketTtsManager(language: .spanish)
+try await esManager.initialize()
+let voiceData = try await esManager.cloneVoice(from: speakerAudioURL)
+
+// Use the same cloned voice with a French manager
+let frManager = PocketTtsManager(language: .french24L)
+try await frManager.initialize()
+let frAudio = try await frManager.synthesize(text: "Bonjour", voiceData: voiceData)
+```
+
 ## Pipeline and Pronunciation Control
 
 ```
@@ -214,6 +237,57 @@ for try await frame in session.frames {
 | Streaming playback | `synthesizeStreaming()` |
 | Streaming text or custom chunking | `makeSession()` |
 
+## Languages
+
+PocketTTS ships with multiple language packs converted from
+[kyutai/pocket-tts](https://huggingface.co/kyutai/pocket-tts). Pick the one
+that matches your input text — there is no automatic language detection.
+
+| ID | Layers | HF Path |
+|----|--------|---------|
+| `english` | 6 | repo root (legacy layout) |
+| `german` | 6 | `v2/german/` |
+| `german_24l` | 24 | `v2/german_24l/` |
+| `italian` | 6 | `v2/italian/` |
+| `italian_24l` | 24 | `v2/italian_24l/` |
+| `portuguese` | 6 | `v2/portuguese/` |
+| `portuguese_24l` | 24 | `v2/portuguese_24l/` |
+| `spanish` | 6 | `v2/spanish/` |
+| `spanish_24l` | 24 | `v2/spanish_24l/` |
+| `french_24l` | 24 | `v2/french_24l/` |
+
+Notes:
+- French ships only as a 24-layer pack upstream (there is no 6-layer variant).
+- 24-layer packs are higher quality but slower and larger.
+- The 21 built-in voice names (alba, anna, eve, michael, …) are shared across
+  languages, but their underlying acoustic embeddings are per-language.
+- Mimi encoder weights (used for voice cloning) are language-agnostic and
+  always live at the repo root.
+
+### Swift API
+
+```swift
+let manager = PocketTtsManager(language: .spanish)
+try await manager.initialize()
+let audio = try await manager.synthesize(text: "Hola mundo")
+```
+
+`PocketTtsManager.language` is immutable per instance. To support multiple
+languages in one app, instantiate one manager per language.
+
+### CLI Usage
+
+```bash
+# Default (English)
+fluidaudio tts "Hello world" --backend pocket --output en.wav
+
+# Spanish (6L)
+fluidaudio tts "Hola mundo" --backend pocket --language spanish --output es.wav
+
+# French (24L only)
+fluidaudio tts "Bonjour" --backend pocket --language french_24l --output fr.wav
+```
+
 ## Usage
 
 PocketTTS is part of core `FluidAudio` - no GPL dependencies required.

diff --git a/README.md b/README.md
@@ -37,7 +37,7 @@ Want to convert your own model? Check [möbius](https://github.com/FluidInferenc
 
 - **Automatic Speech Recognition (ASR)**: [Parakeet TDT v3](Documentation/Models.md#batch-transcription-near-real-time) (0.6b) and other TDT/CTC models for batch transcription supporting 25 European languages, Japanese, and Chinese; [Parakeet EOU](Documentation/Models.md#streaming-transcription-true-real-time) (120m) for streaming ASR with end-of-utterance detection (English only). See all [ASR models](Documentation/Models.md#asr-models).
 - **Inverse Text Normalization (ITN)**: Post-process ASR output to convert spoken-form to written-form ("two hundred" → "200"). See [text-processing-rs](https://github.com/FluidInference/text-processing-rs)
-- **Text-to-Speech (TTS)**: Kokoro (82m) for parallel synthesis with SSML and pronunciation control across 9 languages (EN, ES, FR, HI, IT, JA, PT, ZH); PocketTTS for streaming TTS with voice cloning support (English only)
+- **Text-to-Speech (TTS)**: Kokoro (82m) for parallel synthesis with SSML and pronunciation control across 9 languages (EN, ES, FR, HI, IT, JA, PT, ZH); PocketTTS for streaming TTS with voice cloning support (EN, DE, ES, FR, IT, PT; 6-layer and 24-layer packs, availability varies by language)
 - **Speaker Diarization (Online + Offline)**: Speaker separation and identification across audio streams. Streaming pipeline for real-time processing and offline batch pipeline with advanced clustering.
 - **Speaker Embedding Extraction**: Generate speaker embeddings for voice comparison and clustering, you can use this for speaker identification
 - **Voice Activity Detection (VAD)**: Voice activity detection with Silero models
@@ -556,25 +556,36 @@ FluidAudio ships two TTS backends:
 ### PocketTTS
 
 Streaming-friendly TTS with voice cloning support from short audio samples.
+Available language packs: `english` (default), `german`, `german_24l`,
+`italian`, `italian_24l`, `portuguese`, `portuguese_24l`, `spanish`,
+`spanish_24l`, `french_24l` (24-layer only — no 6-layer French upstream).
 
 ```swift
 import FluidAudio
 
 Task {
-    let manager = try await PocketTtsManager()
-    let audioData = try await manager.synthesize("Hello from FluidAudio.")
+    let manager = PocketTtsManager(language: .spanish)
+    try await manager.initialize()
+    let audioData = try await manager.synthesize(text: "Hola, mundo.")
     try audioData.write(to: URL(fileURLWithPath: "out.wav"))
 }
 ```
 
 ```bash
-# Synthesize with default voice
+# English (default)
 swift run fluidaudiocli tts "Hello from FluidAudio." --output out.wav --backend pocket
 
-# Clone a voice from an audio sample
+# Other languages
+swift run fluidaudiocli tts "Hola mundo" --backend pocket --language spanish --output es.wav
+swift run fluidaudiocli tts "Bonjour" --backend pocket --language french_24l --output fr.wav
+
+# Clone a voice from an audio sample (works with any language pack)
 swift run fluidaudiocli tts "Hello world." --output out.wav --backend pocket --clone-voice speaker.wav
 ```
 
+See [Documentation/TTS/PocketTTS.md](Documentation/TTS/PocketTTS.md#languages)
+for the full language table.
+
 ### Kokoro
 
 High-quality parallel TTS with SSML and phoneme-level pronunciation control. Uses a CoreML G2P (grapheme-to-phoneme) model for out-of-vocabulary words — no external dependencies required.

diff --git a/Sources/FluidAudio/DownloadUtils.swift b/Sources/FluidAudio/DownloadUtils.swift
@@ -575,8 +575,10 @@ public class DownloadUtils {
     public static func downloadSubdirectory(
         _ repo: Repo,
         subdirectory: String,
-        to repoDirectory: URL
+        to repoDirectory: URL,
+        progressHandler: ProgressHandler? = nil
     ) async throws {
+        progressHandler?(DownloadProgress(fractionCompleted: 0.0, phase: .listing))
         var filesToDownload: [(path: String, size: Int)] = []
 
         func listFiles(at path: String) async throws {
@@ -611,12 +613,22 @@ public class DownloadUtils {
         }
 
         try await listFiles(at: subdirectory)
-        logger.info("Found \(filesToDownload.count) files in \(subdirectory)")
+        let totalFiles = filesToDownload.count
+        logger.info("Found \(totalFiles) files in \(subdirectory)")
+        progressHandler?(
+            DownloadProgress(
+                fractionCompleted: totalFiles == 0 ? 1.0 : 0.0,
+                phase: .downloading(completedFiles: 0, totalFiles: totalFiles)))
 
         for (index, file) in filesToDownload.enumerated() {
             let destPath = repoDirectory.appendingPathComponent(file.path)
 
             if FileManager.default.fileExists(atPath: destPath.path) {
+                progressHandler?(
+                    DownloadProgress(
+                        fractionCompleted: Double(index + 1) / Double(totalFiles),
+                        phase: .downloading(
+                            completedFiles: index + 1, totalFiles: totalFiles)))
                 continue
             }
 
@@ -658,8 +670,14 @@ public class DownloadUtils {
             }
             try FileManager.default.moveItem(at: tempURL, to: destPath)
 
-            if (index + 1) % 5 == 0 || index == filesToDownload.count - 1 {
-                logger.info("Downloaded \(index + 1)/\(filesToDownload.count) \(subdirectory) files")
+            progressHandler?(
+                DownloadProgress(
+                    fractionCompleted: Double(index + 1) / Double(totalFiles),
+                    phase: .downloading(
+                        completedFiles: index + 1, totalFiles: totalFiles)))
+
+            if (index + 1) % 5 == 0 || index == totalFiles - 1 {
+                logger.info("Downloaded \(index + 1)/\(totalFiles) \(subdirectory) files")
             }
         }
 

diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift
@@ -565,25 +565,41 @@ public enum ModelNames {
         public static let condStep = "cond_step"
         public static let flowlmStep = "flowlm_step"
         public static let flowDecoder = "flow_decoder"
-        public static let mimiDecoder = "mimi_decoder_v2"
+        /// Legacy English (root-of-repo) Mimi decoder file basename.
+        public static let mimiDecoderLegacy = "mimi_decoder_v2"
+        /// New per-language `v2/<lang>/` Mimi decoder file basename.
+        public static let mimiDecoderV2 = "mimi_decoder"
         public static let mimiEncoder = "mimi_encoder"
 
         public static let condStepFile = condStep + ".mlmodelc"
         public static let flowlmStepFile = flowlmStep + ".mlmodelc"
         public static let flowDecoderFile = flowDecoder + ".mlmodelc"
-        public static let mimiDecoderFile = mimiDecoder + ".mlmodelc"
+        public static let mimiDecoderLegacyFile = mimiDecoderLegacy + ".mlmodelc"
+        public static let mimiDecoderV2File = mimiDecoderV2 + ".mlmodelc"
         public static let mimiEncoderFile = mimiEncoder + ".mlmodelc"
 
         /// Directory containing binary constants, tokenizer, and voice data.
         public static let constantsBinDir = "constants_bin"
 
-        public static let requiredModels: Set<String> = [
-            condStepFile,
-            flowlmStepFile,
-            flowDecoderFile,
-            mimiDecoderFile,
-            constantsBinDir,
-        ]
+        /// Returns the Mimi decoder filename for `language`: the legacy name for `.english`, the v2 name otherwise.
+        public static func mimiDecoderFile(for language: PocketTtsLanguage) -> String {
+            language == .english ? mimiDecoderLegacyFile : mimiDecoderV2File
+        }
+
+        /// Returns the names of the entries required inside the language root for `language`.
+        ///
+        /// The set holds the `cond_step`, `flowlm_step`, and `flow_decoder` model files,
+        /// the Mimi decoder file chosen by `mimiDecoderFile(for:)` (its name differs
+        /// between `.english` and the other languages), and the `constants_bin` directory.
+        public static func requiredModels(for language: PocketTtsLanguage) -> Set<String> {
+            [
+                condStepFile,
+                flowlmStepFile,
+                flowDecoderFile,
+                mimiDecoderFile(for: language),
+                constantsBinDir,
+            ]
+        }
 
         /// Models required for voice cloning (optional feature).
         public static let voiceCloningModels: Set<String> = [
@@ -743,7 +759,7 @@ public enum ModelNames {
             return ttsModels.union(ModelNames.G2P.requiredModels)
                 .union(ModelNames.MultilingualG2P.requiredModels)
         case .pocketTts:
-            return ModelNames.PocketTTS.requiredModels
+            return ModelNames.PocketTTS.requiredModels(for: .english)
         case .sortformer:
             if let variant = variant {
                 return [variant]