From 4daa0fc1cdd4f63e1eea28f897fb93d8c51ab25a Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Tue, 21 Apr 2026 16:38:59 -0400 Subject: [PATCH 01/17] feat(tts): add CosyVoice3 Mandarin TTS port (Phase 1 + Phase 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Swift port of CosyVoice3 zero-shot Mandarin TTS targeting the four validated CoreML mlpackages hosted at FluidInference/CosyVoice3-0.5B-coreml. Mirrors the Kokoro manager API shape (public actor, init, initialize, synthesize → Data). Phase 1 — parity harness - CosyVoice3ModelStore loads LLM-Prefill-T256-M768, LLM-Decode-M768, Flow-N250-fp32, HiFT-T500-fp16 from a local build dir or HF repo - SafetensorsReader: pure-Swift mmap + typed accessors (fp16/fp32/i32) - CosyVoice3RasSampler: top-p / top-k / repetition mask, with seedTokens() bypass for parity tests - CosyVoice3Synthesizer: prefill → decode loop with in-place KV-cache passthrough [24,1,2,768,64] fp16 → Flow (N=250) → HiFT (T=500) - Speech embedding lazy mmap (6761×896 fp16) - Frontend fixture ingest for parity against Python reference WAV Phase 2 — native Mandarin frontend - Qwen2 byte-level BPE tokenizer (tiktoken-compatible), 151 936 vocab - Qwen2 text embedding table lookup (151 936×896 fp16 mmap) - CosyVoice3TextFrontend: special-token splitting, lm_input assembly - CosyVoice3ChineseNormalizer: minimal regex-free TN port of frontend_utils.py (replace_blank, corner marks, brackets, digit spellout, trailing comma collapse). Callers can pass prenormalized: true to bypass. 
- CosyVoice3PromptMel: 24 kHz log-mel matching matcha audio.py (n_fft=1920, hop=480, win=1920, num_mels=80, reflect-pad 720, center=False, Slaney norm, log floor 1e-5, magnitude eps 1e-9) Public API - CosyVoice3TtsManager: actor with init(directory:), initialize(), synthesize(text:promptAssets:options:prenormalized:), and downloadAndCreate(from repo:) - CosyVoice3PromptAssets: prompt text + speech IDs + mel + speaker embedding bundle, loadable from safetensors CLI (Sources/FluidAudioCLI/Commands/) - cosyvoice3-parity: fixture → WAV, compares to reference - cosyvoice3-text: text → audio via full frontend - cosyvoice3-tokenizer: Qwen2 BPE parity harness - cosyvoice3-frontend: dump assembled lm_input for debugging Integration - TtsBackend.swift: +case cosyvoice3 - ModelNames.swift: +CosyVoice3 enum + Repo.cosyvoice3 Tests (XCTest) - CosyVoice3ChineseNormalizerTests (8 cases, end-to-end parity) - CosyVoice3PromptMelTests (8 cases: frame count, zero clamp, sine argmax, reflect pad, Hann, mel basis, trim-to-token-ratio) Full swift test: 1435 tests, 24 skipped, 0 failures. 
Models on HF: https://huggingface.co/FluidInference/CosyVoice3-0.5B-coreml Conversion pipeline: FluidInference/mobius PR #42 Co-Authored-By: Claude --- Sources/FluidAudio/ModelNames.swift | 48 +++ .../Assets/CosyVoice3ModelStore.swift | 165 ++++++++ .../Assets/CosyVoice3ResourceDownloader.swift | 218 +++++++++++ .../TTS/CosyVoice3/CosyVoice3Constants.swift | 62 +++ .../TTS/CosyVoice3/CosyVoice3Error.swift | 37 ++ .../TTS/CosyVoice3/CosyVoice3Models.swift | 17 + .../TTS/CosyVoice3/CosyVoice3TtsManager.swift | 292 ++++++++++++++ .../CosyVoice3ChineseNormalizer.swift | 145 +++++++ .../CosyVoice3FrontendFixture.swift | 101 +++++ .../Preprocess/CosyVoice3PromptAssets.swift | 115 ++++++ .../Preprocess/CosyVoice3PromptMel.swift | 307 +++++++++++++++ .../Preprocess/CosyVoice3TextEmbeddings.swift | 142 +++++++ .../Preprocess/CosyVoice3TextFrontend.swift | 63 ++++ .../Preprocess/Qwen2BpeTokenizer.swift | 230 ++++++++++++ .../Preprocess/Qwen2ByteEncoder.swift | 56 +++ .../Synthesize/CosyVoice3RasSampler.swift | 175 +++++++++ .../CosyVoice3SpeechEmbeddings.swift | 65 ++++ .../Synthesize/CosyVoice3Synthesizer.swift | 355 ++++++++++++++++++ .../Pipeline/Synthesize/CosyVoice3Types.swift | 50 +++ .../CosyVoice3/Shared/SafetensorsReader.swift | 167 ++++++++ Sources/FluidAudio/TTS/TtsBackend.swift | 2 + .../CosyVoice3FrontendParityCommand.swift | 146 +++++++ .../Commands/CosyVoice3ParityCommand.swift | 195 ++++++++++ .../Commands/CosyVoice3TextCommand.swift | 135 +++++++ .../CosyVoice3TokenizerParityCommand.swift | 70 ++++ .../FluidAudioCLI/Commands/TTSCommand.swift | 167 ++++++++ .../CosyVoice3ChineseNormalizerTests.swift | 81 ++++ .../TTS/CosyVoice3PromptMelTests.swift | 101 +++++ 28 files changed, 3707 insertions(+) create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift create 
mode 100644 Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Error.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Models.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3TtsManager.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3ChineseNormalizer.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3FrontendFixture.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptAssets.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptMel.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3TextEmbeddings.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3TextFrontend.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2BpeTokenizer.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2ByteEncoder.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3RasSampler.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3SpeechEmbeddings.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3Synthesizer.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3Types.swift create mode 100644 Sources/FluidAudio/TTS/CosyVoice3/Shared/SafetensorsReader.swift create mode 100644 Sources/FluidAudioCLI/Commands/CosyVoice3FrontendParityCommand.swift create mode 100644 Sources/FluidAudioCLI/Commands/CosyVoice3ParityCommand.swift create mode 100644 Sources/FluidAudioCLI/Commands/CosyVoice3TextCommand.swift create mode 100644 Sources/FluidAudioCLI/Commands/CosyVoice3TokenizerParityCommand.swift create mode 100644 Tests/FluidAudioTests/TTS/CosyVoice3ChineseNormalizerTests.swift create mode 100644 
Tests/FluidAudioTests/TTS/CosyVoice3PromptMelTests.swift diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 95276c177..b689801b2 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -25,6 +25,7 @@ public enum Repo: String, CaseIterable, Sendable { case qwen3AsrInt8 = "FluidInference/qwen3-asr-0.6b-coreml/int8" case multilingualG2p = "FluidInference/charsiu-g2p-byt5-coreml" case parakeetTdtCtc110m = "FluidInference/parakeet-tdt-ctc-110m-coreml" + case cosyvoice3 = "FluidInference/CosyVoice3-0.5B-coreml" /// Repository slug (without owner) public var name: String { @@ -75,6 +76,8 @@ public enum Repo: String, CaseIterable, Sendable { return "charsiu-g2p-byt5-coreml" case .parakeetTdtCtc110m: return "parakeet-tdt-ctc-110m-coreml" + case .cosyvoice3: + return "CosyVoice3-0.5B-coreml" } } @@ -159,6 +162,8 @@ public enum Repo: String, CaseIterable, Sendable { return "parakeet-ja" case .parakeetTdtCtc110m: return "parakeet-tdt-ctc-110m" + case .cosyvoice3: + return "cosyvoice3" default: return name.replacingOccurrences(of: "-coreml", with: "") } @@ -560,6 +565,47 @@ public enum ModelNames { ] } + /// CosyVoice3 (Mandarin) model names. Files live on HuggingFace at + /// `FluidInference/CosyVoice3-0.5B-coreml` (see `Repo.cosyvoice3`). The + /// expected local directory layout is encoded in `CosyVoice3Constants.Files`. 
+ public enum CosyVoice3 { + public static let llmPrefill = "LLM-Prefill-T256-M768-fp16" + public static let llmDecode = "LLM-Decode-M768-fp16" + public static let flow = "Flow-N250-fp32" + public static let hift = "HiFT-T500-fp16" + public static let speechEmbeddings = "speech_embedding-fp16.safetensors" + + public static let llmPrefillFile = llmPrefill + ".mlmodelc" + public static let llmDecodeFile = llmDecode + ".mlmodelc" + public static let flowFile = flow + ".mlmodelc" + public static let hiftFile = hift + ".mlmodelc" + + public static let requiredModels: Set<String> = [ + llmPrefillFile, + llmDecodeFile, + flowFile, + hiftFile, + ] + + /// Sidecar assets living under subdirectories of the HF repo (not part + /// of `requiredModels`; pulled via `downloadSubdirectory` / direct file + /// fetch by `CosyVoice3ResourceDownloader`). + public enum Sidecar { + public static let embeddingsDir = "embeddings" + public static let tokenizerDir = "tokenizer" + public static let voicesDir = "voices" + + public static let speechEmbeddings = "speech_embedding-fp16.safetensors" + public static let runtimeEmbeddings = "embeddings-runtime-fp32.safetensors" + public static let specialTokens = "special_tokens.json" + public static let vocab = "vocab.json" + public static let merges = "merges.txt" + public static let tokenizerConfig = "tokenizer_config.json" + + public static let defaultVoiceId = "cosyvoice3-default-zh" + } + } + /// Multilingual G2P (CharsiuG2P ByT5) model names public enum MultilingualG2P { public static let encoder = "MultilingualG2PEncoder" @@ -688,6 +734,8 @@ public enum ModelNames { return ModelNames.Qwen3ASR.requiredModelsFull case .multilingualG2p: return ModelNames.MultilingualG2P.requiredModels + case .cosyvoice3: + return ModelNames.CosyVoice3.requiredModels } } } diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift new file mode 100644 index 000000000..75c6d7b69
--- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift @@ -0,0 +1,165 @@ +@preconcurrency import CoreML +import Foundation + +/// Actor-based store for the four CosyVoice3 CoreML models. +/// +/// Two on-disk layouts are accepted: +/// +/// 1. **HuggingFace cache** (flat): `/.mlmodelc` (or +/// `.mlpackage`) at repo root, with `/embeddings/speech_embedding-fp16.safetensors`. +/// This is what `CosyVoice3ResourceDownloader` produces. +/// +/// 2. **Local mobius build dir**: `//.mlpackage` as +/// emitted by `models/tts/cosyvoice3/coreml/convert-coreml.py` (with +/// `llm-fp16/`, `flow-fp32-n250/`, `hift-fp16-t500/` subdirs). +/// +/// The store probes layout (1) first, then falls back to (2). CoreML +/// auto-compiles `.mlpackage` on first load and caches the compiled bundle on +/// disk. +public actor CosyVoice3ModelStore { + + private let logger = AppLogger(subsystem: "com.fluidaudio.tts", category: "CosyVoice3ModelStore") + + public nonisolated let directory: URL + private let computeUnits: MLComputeUnits + + private var loadedModels: CosyVoice3Models? + private var speechEmbeddingsURL: URL? + + /// - Parameters: + /// - directory: Base build directory that contains + /// `llm-fp16/`, `flow-fp32-n250/`, `hift-fp16-t500/`, `embeddings/`. + /// - computeUnits: Defaults to `.cpuAndNeuralEngine`. Tests force + /// `.cpuOnly` for tight tolerance parity against the Python reference. + public init(directory: URL, computeUnits: MLComputeUnits = .cpuAndNeuralEngine) { + self.directory = directory + self.computeUnits = computeUnits + } + + /// Load all four CoreML models. Idempotent. 
+ public func loadIfNeeded() async throws { + guard loadedModels == nil else { return } + + let config = MLModelConfiguration() + config.computeUnits = computeUnits + + let loadStart = Date() + logger.info("Loading CosyVoice3 CoreML models from \(directory.path)...") + + let prefillURL = try resolveModel( + subdir: CosyVoice3Constants.Files.llmPrefillSubdir, + baseName: ModelNames.CosyVoice3.llmPrefill) + let decodeURL = try resolveModel( + subdir: CosyVoice3Constants.Files.llmDecodeSubdir, + baseName: ModelNames.CosyVoice3.llmDecode) + let flowURL = try resolveModel( + subdir: CosyVoice3Constants.Files.flowSubdir, + baseName: ModelNames.CosyVoice3.flow) + let hiftURL = try resolveModel( + subdir: CosyVoice3Constants.Files.hiftSubdir, + baseName: ModelNames.CosyVoice3.hift) + let embeddingsURL = try resolveAsset( + subdir: CosyVoice3Constants.Files.speechEmbeddingsSubdir, + file: CosyVoice3Constants.Files.speechEmbeddings) + + let prefill = try await compileAndLoad(prefillURL, configuration: config) + logger.info("Loaded \(CosyVoice3Constants.Files.llmPrefill)") + + let decode = try await compileAndLoad(decodeURL, configuration: config) + logger.info("Loaded \(CosyVoice3Constants.Files.llmDecode)") + + // Flow is fp32; ANE cannot run the full graph. If the caller asked for + // CPU-only (parity harness), honor it so results match the Python + // reference byte-for-byte. Otherwise use CPU+GPU to avoid silent ANE + // fallback warnings. + let flowConfig = MLModelConfiguration() + flowConfig.computeUnits = (computeUnits == .cpuOnly) ? 
.cpuOnly : .cpuAndGPU + let flow = try await compileAndLoad(flowURL, configuration: flowConfig) + logger.info("Loaded \(CosyVoice3Constants.Files.flow)") + + let hift = try await compileAndLoad(hiftURL, configuration: config) + logger.info("Loaded \(CosyVoice3Constants.Files.hift)") + + loadedModels = CosyVoice3Models(prefill: prefill, decode: decode, flow: flow, hift: hift) + speechEmbeddingsURL = embeddingsURL + + let elapsed = Date().timeIntervalSince(loadStart) + logger.info("All CosyVoice3 models loaded in \(String(format: "%.2f", elapsed))s") + } + + public func models() throws -> CosyVoice3Models { + guard let models = loadedModels else { + throw CosyVoice3Error.notInitialized + } + return models + } + + public func speechEmbeddingsFileURL() throws -> URL { + guard let url = speechEmbeddingsURL else { + throw CosyVoice3Error.notInitialized + } + return url + } + + // MARK: - Helpers + + /// Resolve a CoreML model accepting either `.mlmodelc` or `.mlpackage` + /// extensions and both layouts: flat (HF) or subdir (local build). + private func resolveModel(subdir: String, baseName: String) throws -> URL { + let candidates: [URL] = [ + // HF flat layout prefers the precompiled .mlmodelc. + directory.appendingPathComponent("\(baseName).mlmodelc"), + directory.appendingPathComponent("\(baseName).mlpackage"), + // Local build layout (mobius convert-coreml.py output). + directory.appendingPathComponent(subdir).appendingPathComponent("\(baseName).mlmodelc"), + directory.appendingPathComponent(subdir).appendingPathComponent("\(baseName).mlpackage"), + ] + for url in candidates where FileManager.default.fileExists(atPath: url.path) { + return url + } + let probed = candidates.map { $0.path }.joined(separator: ", ") + throw CosyVoice3Error.modelFileNotFound(probed) + } + + /// Resolve a plain sidecar file (e.g. `speech_embedding-fp16.safetensors`). + /// Probes `//` then `/`. 
+ private func resolveAsset(subdir: String, file: String) throws -> URL { + let candidates: [URL] = [ + directory.appendingPathComponent(subdir).appendingPathComponent(file), + directory.appendingPathComponent(file), + ] + for url in candidates where FileManager.default.fileExists(atPath: url.path) { + return url + } + let probed = candidates.map { $0.path }.joined(separator: ", ") + throw CosyVoice3Error.modelFileNotFound(probed) + } + + /// Compile an .mlpackage to .mlmodelc (cached in a persistent temp dir + /// next to the original package) and load it. Skips compilation if an + /// already-compiled .mlmodelc exists next to the package. + private func compileAndLoad( + _ url: URL, + configuration: MLModelConfiguration + ) async throws -> MLModel { + if url.pathExtension == "mlmodelc" { + return try MLModel(contentsOf: url, configuration: configuration) + } + let base = url.deletingPathExtension().lastPathComponent + let compiledName = base + ".mlmodelc" + let cached = url.deletingLastPathComponent().appendingPathComponent(compiledName) + if FileManager.default.fileExists(atPath: cached.path) { + return try MLModel(contentsOf: cached, configuration: configuration) + } + let compiledURL = try await MLModel.compileModel(at: url) + // Move into place next to the package so subsequent loads are fast. + try? FileManager.default.removeItem(at: cached) + do { + try FileManager.default.moveItem(at: compiledURL, to: cached) + return try MLModel(contentsOf: cached, configuration: configuration) + } catch { + // If the move fails (e.g. cross-device), load from the temp URL. 
+ return try MLModel(contentsOf: compiledURL, configuration: configuration) + } + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift new file mode 100644 index 000000000..0776b2b86 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift @@ -0,0 +1,218 @@ +import Foundation + +/// Pulls CosyVoice3 CoreML models + runtime assets from the +/// `FluidInference/CosyVoice3-0.5B-coreml` HuggingFace repo. +/// +/// Layout produced on disk (relative to `ensureCoreModels(...)`'s return URL): +/// +/// ``` +/// / +/// ├── LLM-Prefill-T256-M768-fp16.mlmodelc/ +/// ├── LLM-Decode-M768-fp16.mlmodelc/ +/// ├── Flow-N250-fp32.mlmodelc/ +/// ├── HiFT-T500-fp16.mlmodelc/ +/// ├── embeddings/ +/// │ ├── speech_embedding-fp16.safetensors +/// │ └── embeddings-runtime-fp32.safetensors (text-mode only) +/// ├── tokenizer/ +/// │ ├── vocab.json, merges.txt, tokenizer_config.json, special_tokens.json +/// └── voices/ +/// ├── cosyvoice3-default-zh.safetensors + .json (default voice, eager) +/// └── .safetensors + .json (optional, on-demand) +/// ``` +public enum CosyVoice3ResourceDownloader { + + private static let logger = AppLogger( + subsystem: "com.fluidaudio.tts", category: "CosyVoice3ResourceDownloader") + + /// Path bundle produced by `ensureTextFrontendAssets`. + public struct TextFrontendPaths: Sendable { + public let tokenizerDirectory: URL + public let runtimeEmbeddingsFile: URL + public let specialTokensFile: URL + } + + // MARK: - Core models + speech embedding table + + /// Ensure the four `.mlmodelc` bundles and `speech_embedding-fp16.safetensors` + /// are cached locally. Returns the repository root directory. + /// + /// - Parameters: + /// - directory: Optional base cache dir. When `nil`, defaults to + /// `~/.cache/fluidaudio` (macOS) or `Caches/fluidaudio` (iOS). 
+ /// - progressHandler: Forwarded to `DownloadUtils.downloadRepo`. + @discardableResult + public static func ensureCoreModels( + directory: URL? = nil, + progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> URL { + let targetDir = try directory ?? cacheDirectory() + let modelsDirectory = targetDir.appendingPathComponent( + CosyVoice3Constants.defaultModelsSubdirectory) + let repoDir = modelsDirectory.appendingPathComponent(Repo.cosyvoice3.folderName) + + // 1. Fetch the four .mlmodelc bundles via the standard repo downloader. + let modelsPresent = ModelNames.CosyVoice3.requiredModels.allSatisfy { name in + FileManager.default.fileExists( + atPath: repoDir.appendingPathComponent(name).path) + } + if !modelsPresent { + logger.info("Downloading CosyVoice3 .mlmodelc bundles from HuggingFace...") + try await DownloadUtils.downloadRepo( + .cosyvoice3, + to: modelsDirectory, + progressHandler: progressHandler) + } else { + logger.info("CosyVoice3 .mlmodelc bundles found in cache") + } + + // 2. Fetch the small speech-embedding table (sidecar, not a model). + _ = try await ensureSidecarFile( + subdir: ModelNames.CosyVoice3.Sidecar.embeddingsDir, + name: ModelNames.CosyVoice3.Sidecar.speechEmbeddings, + repoDirectory: repoDir, + description: "CosyVoice3 speech embedding table") + + return repoDir + } + + // MARK: - Text-mode assets (tokenizer + 542 MB runtime embeddings) + + /// Ensure tokenizer assets + `embeddings-runtime-fp32.safetensors` are on + /// disk. Only required when using `CosyVoice3TtsManager.synthesize(text:…)`; + /// fixture-mode callers may skip this. + public static func ensureTextFrontendAssets( + repoDirectory: URL + ) async throws -> TextFrontendPaths { + // Tokenizer subdirectory: vocab.json + merges.txt + special_tokens.json + // + tokenizer_config.json. `downloadSubdirectory` walks the tree and + // skips files already on disk. 
+ let tokenizerDir = repoDirectory.appendingPathComponent( + ModelNames.CosyVoice3.Sidecar.tokenizerDir) + let tokenizerRequired = [ + ModelNames.CosyVoice3.Sidecar.vocab, + ModelNames.CosyVoice3.Sidecar.merges, + ModelNames.CosyVoice3.Sidecar.specialTokens, + ] + let tokenizerPresent = tokenizerRequired.allSatisfy { name in + FileManager.default.fileExists( + atPath: tokenizerDir.appendingPathComponent(name).path) + } + if !tokenizerPresent { + logger.info("Downloading CosyVoice3 tokenizer assets…") + try await DownloadUtils.downloadSubdirectory( + .cosyvoice3, + subdirectory: ModelNames.CosyVoice3.Sidecar.tokenizerDir, + to: repoDirectory) + } + + // Runtime text-embedding table (542 MB). Pulled as a file download so + // it never has to sit in RAM during transfer. + let runtimeEmbeddings = try await ensureSidecarFile( + subdir: ModelNames.CosyVoice3.Sidecar.embeddingsDir, + name: ModelNames.CosyVoice3.Sidecar.runtimeEmbeddings, + repoDirectory: repoDirectory, + description: "CosyVoice3 runtime text embedding table (542 MB)") + + return TextFrontendPaths( + tokenizerDirectory: tokenizerDir, + runtimeEmbeddingsFile: runtimeEmbeddings, + specialTokensFile: tokenizerDir.appendingPathComponent( + ModelNames.CosyVoice3.Sidecar.specialTokens)) + } + + // MARK: - Voice bundles + + /// Ensure the requested zero-shot voice bundle (`.safetensors` + + /// `.json`) is cached. Returns the `.safetensors` URL that + /// `CosyVoice3PromptAssets.load(from:)` expects — the loader derives the + /// `.json` sidecar path from it. 
+ @discardableResult + public static func ensureVoice( + voiceId: String = ModelNames.CosyVoice3.Sidecar.defaultVoiceId, + repoDirectory: URL + ) async throws -> URL { + let sanitized = voiceId.filter { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" } + guard !sanitized.isEmpty, sanitized == voiceId else { + throw CosyVoice3Error.invalidShape("invalid voice id: \(voiceId)") + } + + let voicesDir = repoDirectory.appendingPathComponent( + ModelNames.CosyVoice3.Sidecar.voicesDir) + try FileManager.default.createDirectory( + at: voicesDir, withIntermediateDirectories: true) + + let tensorsURL = voicesDir.appendingPathComponent("\(voiceId).safetensors") + let metadataURL = voicesDir.appendingPathComponent("\(voiceId).json") + + for (local, remoteName, desc) in [ + (tensorsURL, "\(voiceId).safetensors", "voice tensors"), + (metadataURL, "\(voiceId).json", "voice metadata"), + ] { + if FileManager.default.fileExists(atPath: local.path) { continue } + let remotePath = "\(ModelNames.CosyVoice3.Sidecar.voicesDir)/\(remoteName)" + let remoteURL = try ModelRegistry.resolveModel( + Repo.cosyvoice3.remotePath, remotePath) + let descriptor = AssetDownloader.Descriptor( + description: "\(voiceId) \(desc)", + remoteURL: remoteURL, + destinationURL: local, + transferMode: .file()) + _ = try await AssetDownloader.ensure(descriptor, logger: logger) + } + + return tensorsURL + } + + // MARK: - Helpers + + private static func ensureSidecarFile( + subdir: String, + name: String, + repoDirectory: URL, + description: String + ) async throws -> URL { + let localDir = repoDirectory.appendingPathComponent(subdir) + try FileManager.default.createDirectory( + at: localDir, withIntermediateDirectories: true) + let localURL = localDir.appendingPathComponent(name) + if FileManager.default.fileExists(atPath: localURL.path) { + return localURL + } + let remotePath = "\(subdir)/\(name)" + let remoteURL = try ModelRegistry.resolveModel( + Repo.cosyvoice3.remotePath, remotePath) + let descriptor 
= AssetDownloader.Descriptor( + description: description, + remoteURL: remoteURL, + destinationURL: localURL, + transferMode: .file()) + return try await AssetDownloader.ensure(descriptor, logger: logger) + } + + /// `~/.cache/fluidaudio` (macOS) / `Caches/fluidaudio` (iOS) — matches the + /// convention used by `TtsResourceDownloader` and `PocketTtsResourceDownloader`. + private static func cacheDirectory() throws -> URL { + let baseDirectory: URL + #if os(macOS) + baseDirectory = FileManager.default.homeDirectoryForCurrentUser + .appendingPathComponent(".cache") + #else + guard + let first = FileManager.default.urls( + for: .cachesDirectory, in: .userDomainMask + ).first + else { + throw CosyVoice3Error.invalidShape("failed to locate caches directory") + } + baseDirectory = first + #endif + + let cacheDirectory = baseDirectory.appendingPathComponent("fluidaudio") + if !FileManager.default.fileExists(atPath: cacheDirectory.path) { + try FileManager.default.createDirectory( + at: cacheDirectory, withIntermediateDirectories: true) + } + return cacheDirectory + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift new file mode 100644 index 000000000..a7d03a450 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift @@ -0,0 +1,62 @@ +import Foundation + +/// Central constants for the CosyVoice3 (Mandarin) CoreML pipeline. 
+/// +/// Shipping config (frozen): +/// - LLM-Prefill-T256-M768-fp16 +/// - LLM-Decode-M768-fp16 +/// - Flow-N250-fp32 (fp16 causes NaN; fused `layer_norm` cannot be pinned) +/// - HiFT-T500-fp16 +public enum CosyVoice3Constants { + + // MARK: - LLM shapes + public static let prefillLength = 256 + public static let kvMaxLength = 768 + public static let embedDim = 896 + public static let numLayers = 24 + public static let kvHeads = 2 + public static let headDim = 64 + + // MARK: - Flow / HiFT shapes + public static let flowTotalTokens = 250 + public static let tokenMelRatio = 2 + public static let hiftMaxFrames = 500 + public static let hiftSamplesPerFrame = 480 + public static let sampleRate = 24_000 + public static let melBins = 80 + public static let speakerEmbeddingDim = 192 + + // MARK: - Speech token vocab + public static let speechVocab = 6_761 + public static let speechTokenSize = 6_561 + public static let sosId: Int32 = 6_561 + public static let eosId: Int32 = 6_562 + public static let taskId: Int32 = 6_563 + /// Any token id in this range is treated as a stop signal. + public static let stopRange: ClosedRange<Int32> = 6_561...6_760 + + // MARK: - Sampler + public static let topP: Float = 0.8 + public static let topK: Int = 25 + public static let rasWindow: Int = 10 + public static let rasTauR: Float = 0.1 + + // MARK: - Cache layout + /// Subdirectory under the shared `~/.cache/fluidaudio/` (or iOS Caches) dir + /// where every TTS backend stores its HF-mirrored models.
+ public static let defaultModelsSubdirectory = "Models" + + // MARK: - Files (local build dir layout) + public enum Files { + public static let llmPrefill = "LLM-Prefill-T256-M768-fp16.mlpackage" + public static let llmPrefillSubdir = "llm-fp16" + public static let llmDecode = "LLM-Decode-M768-fp16.mlpackage" + public static let llmDecodeSubdir = "llm-fp16" + public static let flow = "Flow-N250-fp32.mlpackage" + public static let flowSubdir = "flow-fp32-n250" + public static let hift = "HiFT-T500-fp16.mlpackage" + public static let hiftSubdir = "hift-fp16-t500" + public static let speechEmbeddings = "speech_embedding-fp16.safetensors" + public static let speechEmbeddingsSubdir = "embeddings" + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Error.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Error.swift new file mode 100644 index 000000000..0ebe782f5 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Error.swift @@ -0,0 +1,37 @@ +import Foundation + +/// Errors surfaced by the CosyVoice3 Swift pipeline. +public enum CosyVoice3Error: LocalizedError, Sendable { + case notInitialized + case modelFileNotFound(String) + case invalidFixture(String) + case invalidSafetensors(String) + case prefillTooLong(Int) + case sequenceTooLong(Int) + case predictionFailed(String) + case embeddingTableMissing(String) + case invalidShape(String) + + public var errorDescription: String? { + switch self { + case .notInitialized: + return "CosyVoice3 pipeline not initialized — call loadIfNeeded() first." 
+ case .modelFileNotFound(let path): + return "CosyVoice3 model file not found at: \(path)" + case .invalidFixture(let reason): + return "Invalid CosyVoice3 fixture: \(reason)" + case .invalidSafetensors(let reason): + return "Invalid safetensors file: \(reason)" + case .prefillTooLong(let length): + return "Prefill sequence length \(length) exceeds max \(CosyVoice3Constants.prefillLength)" + case .sequenceTooLong(let length): + return "KV cache length \(length) exceeds max \(CosyVoice3Constants.kvMaxLength)" + case .predictionFailed(let stage): + return "CosyVoice3 prediction failed at stage: \(stage)" + case .embeddingTableMissing(let name): + return "CosyVoice3 embedding table missing: \(name)" + case .invalidShape(let detail): + return "CosyVoice3 shape mismatch: \(detail)" + } + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Models.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Models.swift new file mode 100644 index 000000000..4ea678f8a --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Models.swift @@ -0,0 +1,17 @@ +@preconcurrency import CoreML +import Foundation + +/// Four CoreML models for the CosyVoice3 inference pipeline. +public struct CosyVoice3Models: @unchecked Sendable { + public let prefill: MLModel + public let decode: MLModel + public let flow: MLModel + public let hift: MLModel + + public init(prefill: MLModel, decode: MLModel, flow: MLModel, hift: MLModel) { + self.prefill = prefill + self.decode = decode + self.flow = flow + self.hift = hift + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3TtsManager.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3TtsManager.swift new file mode 100644 index 000000000..b3e21bae5 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3TtsManager.swift @@ -0,0 +1,292 @@ +@preconcurrency import CoreML +import Foundation + +/// Public entry point for the CosyVoice3 (Mandarin) TTS pipeline. +/// +/// Two synthesis paths are exposed: +/// +/// 1. 
/// Public entry point for the CosyVoice3 (Mandarin) TTS pipeline.
///
/// Two synthesis paths are exposed:
///
/// 1. `synthesizeFromFixture` — Phase 1 parity harness that replays a
///    Python-generated fixture against the Swift CoreML pipeline.
///
/// 2. `synthesize(text:promptAssets:)` — Phase 2 text-driven synthesis. The
///    user supplies a Mandarin `text` plus a `CosyVoice3PromptAssets` bundle
///    (precomputed `llm_prompt_speech_ids`, `prompt_mel`, `spk_embedding`,
///    plus the prompt text containing `<|endofprompt|>`). The manager
///    tokenizes with the on-device Qwen2 BPE tokenizer, assembles
///    `lm_input_embeds` from the mmap'd runtime embedding tables, and runs
///    prefill → decode → Flow → HiFT exactly like the fixture path.
///
/// Text-mode requires three extra resources that must be provided at init:
/// - `tokenizerDirectory`: HuggingFace Qwen2 assets (`vocab.json` + `merges.txt`).
/// - `textEmbeddingsFile`: safetensors file containing the Qwen2
///   `text_embedding` and CosyVoice3 `speech_embedding` tables at runtime dtype.
/// - `specialTokensFile`: JSON map `{"<|endofprompt|>": 151646, ...}` covering
///   the runtime-added special tokens. Same format that
///   `tokenizer_fixture.json` dumps under its `special_tokens` key.
public actor CosyVoice3TtsManager {

    private let logger = AppLogger(subsystem: "com.fluidaudio.tts", category: "CosyVoice3TtsManager")

    // Model store plus the three optional text-frontend resources
    // (all three are nil in fixture-only mode).
    private let store: CosyVoice3ModelStore
    private let tokenizerDirectory: URL?
    private let textEmbeddingsFile: URL?
    private let specialTokensFile: URL?

    // Built lazily by `initialize()`; nil until then.
    private var synthesizer: CosyVoice3Synthesizer?
    private var textFrontend: CosyVoice3TextFrontend?

    /// Fixture-only (Phase 1) constructor. Text-mode `synthesize` will throw
    /// `notInitialized` because no tokenizer/embedding resources are configured.
    public init(directory: URL, computeUnits: MLComputeUnits = .cpuAndNeuralEngine) {
        self.store = CosyVoice3ModelStore(directory: directory, computeUnits: computeUnits)
        self.tokenizerDirectory = nil
        self.textEmbeddingsFile = nil
        self.specialTokensFile = nil
    }

    /// Text-mode (Phase 2) constructor. Pass `modelsDirectory` plus the three
    /// tokenizer-frontend resources. `synthesizeFromFixture` still works
    /// without initializing the frontend.
    public init(
        modelsDirectory: URL,
        tokenizerDirectory: URL,
        textEmbeddingsFile: URL,
        specialTokensFile: URL,
        computeUnits: MLComputeUnits = .cpuAndNeuralEngine
    ) {
        self.store = CosyVoice3ModelStore(directory: modelsDirectory, computeUnits: computeUnits)
        self.tokenizerDirectory = tokenizerDirectory
        self.textEmbeddingsFile = textEmbeddingsFile
        self.specialTokensFile = specialTokensFile
    }

    /// Convenience factory that downloads all required assets from HuggingFace
    /// into the shared FluidAudio cache, then returns a text-mode–ready manager.
    ///
    /// - Parameters:
    ///   - cacheDirectory: Optional override for the base cache root. When
    ///     `nil`, the downloader's default cache location is used.
    ///   - includeDefaultVoice: When `true` (default), also fetches the default
    ///     voice bundle so the first `synthesize(...)` call works without any
    ///     additional downloads.
    ///   - computeUnits: CoreML compute units for the manager's models.
    ///   - progressHandler: Forwarded to the HF downloader for UI updates.
    /// - Returns: An uninitialized manager; the caller must still invoke
    ///   `initialize()` to compile + load models.
    public static func downloadAndCreate(
        cacheDirectory: URL? = nil,
        includeDefaultVoice: Bool = true,
        computeUnits: MLComputeUnits = .cpuAndNeuralEngine,
        progressHandler: DownloadUtils.ProgressHandler? = nil
    ) async throws -> CosyVoice3TtsManager {
        let repoDir = try await CosyVoice3ResourceDownloader.ensureCoreModels(
            directory: cacheDirectory, progressHandler: progressHandler)
        let frontend = try await CosyVoice3ResourceDownloader.ensureTextFrontendAssets(
            repoDirectory: repoDir)
        if includeDefaultVoice {
            // Result deliberately ignored: only the cached presence matters here.
            _ = try await CosyVoice3ResourceDownloader.ensureVoice(
                repoDirectory: repoDir)
        }
        return CosyVoice3TtsManager(
            modelsDirectory: repoDir,
            tokenizerDirectory: frontend.tokenizerDirectory,
            textEmbeddingsFile: frontend.runtimeEmbeddingsFile,
            specialTokensFile: frontend.specialTokensFile,
            computeUnits: computeUnits)
    }

    /// Ensure the given voice id is cached locally, and return the loaded
    /// prompt bundle ready to pass into `synthesize(text:promptAssets:)`.
    public func loadVoice(
        _ voiceId: String = ModelNames.CosyVoice3.Sidecar.defaultVoiceId
    ) async throws -> CosyVoice3PromptAssets {
        let tensorsURL = try await CosyVoice3ResourceDownloader.ensureVoice(
            voiceId: voiceId,
            repoDirectory: modelsDirectory)
        return try CosyVoice3PromptAssets.load(from: tensorsURL)
    }

    /// Repo root directory (cache location after `downloadAndCreate(...)`).
    /// Pass this to `CosyVoice3ResourceDownloader.ensureVoice(voiceId:repoDirectory:)`
    /// when fetching additional voice bundles on demand.
    public nonisolated var modelsDirectory: URL {
        store.directory
    }

    /// Load all four CoreML models + (if configured) the text frontend.
    /// Idempotent: already-built components are not rebuilt.
    public func initialize() async throws {
        if synthesizer == nil {
            try await store.loadIfNeeded()
            let models = try await store.models()
            let embeddingsURL = try await store.speechEmbeddingsFileURL()
            let embeddings = try CosyVoice3SpeechEmbeddings(url: embeddingsURL)
            self.synthesizer = CosyVoice3Synthesizer(models: models, embeddings: embeddings)
            logger.info("CosyVoice3 synthesizer ready")
        }
        // The text frontend is optional: only built when all three resources
        // were supplied by the text-mode initializer.
        if textFrontend == nil,
            let tokDir = tokenizerDirectory,
            let embURL = textEmbeddingsFile,
            let specURL = specialTokensFile
        {
            let tokStart = Date()
            let specialTokens = try Self.loadSpecialTokens(url: specURL)
            let tokenizer = try Qwen2BpeTokenizer.load(
                directory: tokDir, specialTokens: specialTokens)
            let textEmbeddings = try CosyVoice3TextEmbeddings(url: embURL)
            self.textFrontend = CosyVoice3TextFrontend(
                tokenizer: tokenizer, embeddings: textEmbeddings)
            logger.info(
                "CosyVoice3 text frontend ready in \(String(format: "%.2fs", Date().timeIntervalSince(tokStart)))"
            )
        }
    }

    /// Phase 1 parity entry point: replay a Python-generated fixture.
    public func synthesizeFromFixture(
        fixtureURL: URL,
        options: CosyVoice3ParityOptions = CosyVoice3ParityOptions()
    ) async throws -> CosyVoice3SynthesisResult {
        guard let synthesizer = synthesizer else {
            throw CosyVoice3Error.notInitialized
        }
        let fixture = try CosyVoice3FrontendFixture.load(from: fixtureURL)
        return try await synthesizer.synthesize(fixture: fixture, options: options)
    }

    /// Phase 2 text-driven synthesis.
    ///
    /// - Parameters:
    ///   - text: Mandarin (or mixed) input text.
    ///   - promptAssets: Bundle with prompt text + precomputed speech prompt
    ///     tokens + prompt mel + speaker embedding.
    ///   - options: Sampling / seed controls.
    ///   - prenormalized: When `true`, skip the built-in minimal Chinese
    ///     normalizer and feed `text` straight to the tokenizer. Set this if
    ///     you've already run wetext (or equivalent) server-side.
    /// - Throws: `CosyVoice3Error.notInitialized` when `initialize()` has not
    ///   been called (or was called without text-frontend resources).
    public func synthesize(
        text: String,
        promptAssets: CosyVoice3PromptAssets,
        options: CosyVoice3SynthesisOptions = CosyVoice3SynthesisOptions(),
        prenormalized: Bool = false
    ) async throws -> CosyVoice3SynthesisResult {
        guard let synthesizer = synthesizer else {
            throw CosyVoice3Error.notInitialized
        }
        guard let frontend = textFrontend else {
            throw CosyVoice3Error.notInitialized
        }

        // Skip normalization if the caller set `prenormalized`, if the input
        // contains SSML-ish markers (mirrors Python's `'<|' in text and '|>'`
        // bypass), or if there are no CJK characters at all.
        let ssmlLike = text.contains("<|") && text.contains("|>")
        let normalized: String
        if prenormalized || ssmlLike || !CosyVoice3ChineseNormalizer.containsChinese(text) {
            normalized = text
        } else {
            normalized = CosyVoice3ChineseNormalizer.normalize(text)
        }

        let assembled = try frontend.assemble(
            promptText: promptAssets.promptText,
            ttsText: normalized,
            promptSpeechIds: promptAssets.promptSpeechIds)

        let lmInputEmbedsFlat = try Self.flattenLmEmbeds(
            assembled.lmInputEmbeds, tPre: assembled.tPre)

        // Build an in-memory fixture adapter so we can reuse the Phase 1
        // synthesize(fixture:) path without a second code path.
        let fixture = CosyVoice3FrontendFixture(
            lmInputEmbeds: lmInputEmbedsFlat,
            tPre: assembled.tPre,
            promptSpeechIds: promptAssets.promptSpeechIds,
            promptMel: promptAssets.promptMel,
            promptMelFrames: promptAssets.promptMelFrames,
            spkEmbedding: promptAssets.spkEmbedding,
            decodedTokens: [],
            seed: Int32(truncatingIfNeeded: options.seed),
            numPromptMel: 0,
            audioLengthSamples: 0)

        let parityOptions = CosyVoice3ParityOptions(
            maxNewTokens: options.maxNewTokens,
            seed: options.seed,
            replayDecodedTokens: false)

        return try await synthesizer.synthesize(fixture: fixture, options: parityOptions)
    }

    // MARK: - Helpers

    /// Flatten `[1, tPre, 896]` MLMultiArray fp32 into `[tPre * 896]` Float,
    /// honoring non-compact strides.
    private static func flattenLmEmbeds(
        _ array: MLMultiArray, tPre: Int
    ) throws -> [Float] {
        guard
            array.dataType == .float32,
            array.shape.count == 3,
            array.shape[0].intValue == 1,
            array.shape[1].intValue == tPre,
            array.shape[2].intValue == CosyVoice3Constants.embedDim
        else {
            throw CosyVoice3Error.invalidShape(
                "lmInputEmbeds expects [1, \(tPre), \(CosyVoice3Constants.embedDim)] fp32, got shape=\(array.shape) dtype=\(array.dataType.rawValue)"
            )
        }
        let dim = CosyVoice3Constants.embedDim
        let strides = array.strides.map { $0.intValue }
        let src = array.dataPointer.bindMemory(to: Float.self, capacity: array.count)
        var out = [Float](repeating: 0, count: tPre * dim)
        out.withUnsafeMutableBufferPointer { dst in
            // NOTE(review): copy loop reconstructed from a garbled source span
            // (memcpy fast path for compact rows, per-element gather
            // otherwise) — confirm against the original file.
            for t in 0..<tPre {
                let rowSrc = src.advanced(by: t * strides[1])
                let rowDst = dst.baseAddress!.advanced(by: t * dim)
                if strides[2] == 1 {
                    memcpy(rowDst, rowSrc, dim * MemoryLayout<Float>.size)
                } else {
                    for d in 0..<dim {
                        rowDst[d] = rowSrc[d * strides[2]]
                    }
                }
            }
        }
        return out
    }

    /// Parse a special-tokens JSON file into a `name → id` map.
    /// Accepts either the tokenizer_fixture.json shape
    /// (`{"special_tokens": {...}, "cases": [...]}`) or a flat map.
    private static func loadSpecialTokens(url: URL) throws -> [String: Int32] {
        let data = try Data(contentsOf: url)
        let json = try JSONSerialization.jsonObject(with: data)
        let raw: [String: Any]
        if let obj = json as? [String: Any], let nested = obj["special_tokens"] as? [String: Any] {
            raw = nested
        } else if let obj = json as? [String: Any] {
            raw = obj
        } else {
            throw CosyVoice3Error.invalidShape(
                "special tokens file must be a JSON object, got \(type(of: json))")
        }
        var out: [String: Int32] = [:]
        out.reserveCapacity(raw.count)
        for (k, v) in raw {
            if let n = v as? Int {
                out[k] = Int32(n)
            } else if let n = v as? NSNumber {
                out[k] = n.int32Value
            }
            // Non-numeric values are silently skipped; the emptiness guard
            // below catches a fully non-numeric file.
        }
        guard !out.isEmpty else {
            throw CosyVoice3Error.invalidShape(
                "special tokens file parsed to an empty map at \(url.path)")
        }
        return out
    }
}
/// Minimal Mandarin text normalizer ported from CosyVoice's
/// `cosyvoice/utils/frontend_utils.py` + the Chinese branch of
/// `cosyvoice/cli/frontend.py:text_normalize`.
///
/// **Scope (intentional):** regex-free character-level rules plus digit
/// spellout. The full `wetext.ZhNormalizer` is **not** ported; callers that
/// need production-quality TN should run wetext server-side and pass the
/// result via `synthesize(text:prenormalized: true, ...)`.
///
/// Rules applied (in order):
/// 1. strip newlines, leading/trailing whitespace
/// 2. `replaceCornerMark` — `²` → `平方`, `³` → `立方`
/// 3. ASCII digits → 零一二三四五六七八九 (per-digit fallback)
/// 4. `.` → `。`, ` - ` → `,`
/// 5. `replaceBlank` — remove spaces between CJK chars; keep spaces between
///    ASCII tokens. Runs *after* the ASCII→CJK substitutions above.
/// 6. `removeBracket` — drop `()【】` and backticks, `——` → space
/// 7. trailing `,` / `,` / `、` sequences → `。`
public enum CosyVoice3ChineseNormalizer {

    /// Apply all normalization rules, in the documented order.
    public static func normalize(_ text: String) -> String {
        var s = text
        s = s.replacingOccurrences(of: "\n", with: "")
        s = s.trimmingCharacters(in: .whitespaces)
        s = replaceCornerMark(s)
        s = spellOutDigitsZh(s)
        s = s.replacingOccurrences(of: ".", with: "。")
        s = s.replacingOccurrences(of: " - ", with: ",")
        s = replaceBlank(s)
        s = removeBracket(s)
        s = stripTrailingCommaLikes(s)
        return s
    }

    /// True if `text` contains at least one CJK Unified Ideograph
    /// (U+4E00..U+9FFF), matching `contains_chinese` in frontend_utils.py.
    public static func containsChinese(_ text: String) -> Bool {
        for scalar in text.unicodeScalars where (0x4E00...0x9FFF).contains(scalar.value) {
            return true
        }
        return false
    }

    /// Characters considered "punctuation-like" by `isOnlyPunctuation`.
    /// Hoisted to a stored constant so the union is built once, not per call.
    private static let punctuationLikeSet: CharacterSet = {
        var s = CharacterSet.punctuationCharacters
        s.formUnion(.symbols)
        s.formUnion(.whitespaces)
        return s
    }()

    /// True if `text` is empty or consists only of Unicode punctuation /
    /// symbol / whitespace characters. Mirrors `is_only_punctuation`.
    public static func isOnlyPunctuation(_ text: String) -> Bool {
        if text.isEmpty { return true }
        for scalar in text.unicodeScalars where !punctuationLikeSet.contains(scalar) {
            return false
        }
        return true
    }

    // MARK: - Individual rules

    /// Drop spaces between non-ASCII chars; keep spaces that sit between two
    /// ASCII tokens (e.g. "hello world" stays, "中 国" → "中国").
    static func replaceBlank(_ text: String) -> String {
        let chars = Array(text)
        var out: [Character] = []
        out.reserveCapacity(chars.count)
        for i in 0..<chars.count {
            let c = chars[i]
            if c == " " {
                // Out-of-bounds neighbors are treated as spaces, so edge
                // spaces are dropped.
                let prev = i > 0 ? chars[i - 1] : Character(" ")
                let next = i + 1 < chars.count ? chars[i + 1] : Character(" ")
                let prevOk = prev.isASCII && prev != " "
                let nextOk = next.isASCII && next != " "
                if prevOk && nextOk {
                    out.append(c)
                }
            } else {
                out.append(c)
            }
        }
        return String(out)
    }

    /// `²` → `平方`, `³` → `立方`.
    static func replaceCornerMark(_ text: String) -> String {
        var s = text
        s = s.replacingOccurrences(of: "²", with: "平方")
        s = s.replacingOccurrences(of: "³", with: "立方")
        return s
    }

    /// Drop brackets and backticks; turn the CJK em-dash pair into a space.
    static func removeBracket(_ text: String) -> String {
        var s = text
        s = s.replacingOccurrences(of: "(", with: "")
        s = s.replacingOccurrences(of: ")", with: "")
        s = s.replacingOccurrences(of: "【", with: "")
        s = s.replacingOccurrences(of: "】", with: "")
        s = s.replacingOccurrences(of: "`", with: "")
        s = s.replacingOccurrences(of: "——", with: " ")
        return s
    }

    /// Replace each ASCII digit in `text` with its Chinese reading. Lossy
    /// per-digit fallback (e.g. `2024` → `二零二四`); correct for years / IDs
    /// but wrong for decimals or large cardinals. Acceptable as a placeholder
    /// while wetext remains server-side.
    static func spellOutDigitsZh(_ text: String) -> String {
        let map: [Character: String] = [
            "0": "零", "1": "一", "2": "二", "3": "三", "4": "四",
            "5": "五", "6": "六", "7": "七", "8": "八", "9": "九",
        ]
        var out = ""
        out.reserveCapacity(text.count)
        for ch in text {
            if let zh = map[ch] {
                out += zh
            } else {
                out.append(ch)
            }
        }
        return out
    }

    /// Collapse a run of trailing `,` / `,` / `、` into a single `。`.
    /// Equivalent to the Python `re.sub(r'[,,、]+$', '。', text)` rule.
    static func stripTrailingCommaLikes(_ text: String) -> String {
        let commaLikes: Set<Character> = [",", ",", "、"]
        var chars = Array(text)
        var end = chars.count
        while end > 0, commaLikes.contains(chars[end - 1]) {
            end -= 1
        }
        if end == chars.count {
            return text
        }
        chars = Array(chars[0..<end])
        chars.append("。")
        return String(chars)
    }
}
    /// Load a Phase 1 parity fixture from a safetensors file, validating
    /// every tensor's shape/dtype before constructing the fixture.
    ///
    /// NOTE(review): the declaration line was lost to source mangling; the
    /// signature is reconstructed from the call site
    /// `CosyVoice3FrontendFixture.load(from: fixtureURL)` in
    /// CosyVoice3TtsManager — confirm against the original file.
    ///
    /// - Parameter url: URL to the fixture `.safetensors` file.
    /// - Throws: `CosyVoice3Error.invalidFixture` on any shape/dtype mismatch,
    ///   `CosyVoice3Error.prefillTooLong` when `t_pre` exceeds the prefill
    ///   window.
    public static func load(from url: URL) throws -> CosyVoice3FrontendFixture {
        let file = try SafetensorsFile(url: url)

        // lm_input_embeds: [1, t_pre, 896] fp32, t_pre bounded by the
        // LLM-Prefill window.
        let lmInfo = try file.info("lm_input_embeds")
        guard
            lmInfo.dtype == .f32,
            lmInfo.shape.count == 3,
            lmInfo.shape[0] == 1,
            lmInfo.shape[2] == CosyVoice3Constants.embedDim
        else {
            throw CosyVoice3Error.invalidFixture(
                "lm_input_embeds expects [1, t_pre, 896] fp32, got shape=\(lmInfo.shape) dtype=\(lmInfo.dtype.rawValue)"
            )
        }
        let lmInputEmbeds = try file.asFloat32("lm_input_embeds")
        let tPre = lmInfo.shape[1]
        guard tPre > 0 && tPre <= CosyVoice3Constants.prefillLength else {
            throw CosyVoice3Error.prefillTooLong(tPre)
        }

        // llm_prompt_speech_ids: [1, N] int32.
        let promptIdsInfo = try file.info("llm_prompt_speech_ids")
        guard
            promptIdsInfo.shape.count == 2,
            promptIdsInfo.shape[0] == 1
        else {
            throw CosyVoice3Error.invalidFixture(
                "llm_prompt_speech_ids expects [1, N], got \(promptIdsInfo.shape)")
        }
        let promptSpeechIds = try file.asInt32("llm_prompt_speech_ids")

        // prompt_mel: [1, frames, 80] fp32.
        let promptMelInfo = try file.info("prompt_mel")
        guard
            promptMelInfo.dtype == .f32,
            promptMelInfo.shape.count == 3,
            promptMelInfo.shape[0] == 1,
            promptMelInfo.shape[2] == CosyVoice3Constants.melBins
        else {
            throw CosyVoice3Error.invalidFixture(
                "prompt_mel expects [1, frames, 80] fp32, got \(promptMelInfo.shape)")
        }
        let promptMel = try file.asFloat32("prompt_mel")
        let promptMelFrames = promptMelInfo.shape[1]

        // spk_embedding: [1, 192] fp32.
        let spkInfo = try file.info("spk_embedding")
        guard
            spkInfo.dtype == .f32,
            spkInfo.shape == [1, CosyVoice3Constants.speakerEmbeddingDim]
        else {
            throw CosyVoice3Error.invalidFixture(
                "spk_embedding expects [1, 192] fp32, got \(spkInfo.shape)")
        }
        let spkEmbedding = try file.asFloat32("spk_embedding")

        // Replay metadata: decoded tokens + RNG seed (defaults to 0 when the
        // seed tensor is empty).
        let decodedTokens = try file.asInt32("decoded_tokens")
        let seedValue = try file.asInt32("seed").first ?? 0

        let numPromptMel = try file.asInt("num_prompt_mel")
        let audioLengthSamples = try file.asInt("audio_length_samples")

        return CosyVoice3FrontendFixture(
            lmInputEmbeds: lmInputEmbeds,
            tPre: tPre,
            promptSpeechIds: promptSpeechIds,
            promptMel: promptMel,
            promptMelFrames: promptMelFrames,
            spkEmbedding: spkEmbedding,
            decodedTokens: decodedTokens,
            seed: seedValue,
            numPromptMel: numPromptMel,
            audioLengthSamples: audioLengthSamples)
    }
/// Zero-shot prompt assets bundled alongside CosyVoice3 inference.
///
/// Phase 2 keeps SpeechTokenizer and CAMPPlus Python-side: `llmPromptSpeechIds`
/// and `spkEmbedding` are precomputed from a reference prompt WAV and shipped
/// as a single safetensors file with a JSON sidecar carrying the prompt text.
///
/// The shipping layout mirrors what `verify/export_swift_fixture.py` produces,
/// so the Phase 1 fixture doubles as a valid prompt-assets bundle:
///
/// ```
/// <name>.safetensors
///     llm_prompt_speech_ids int32   [1, N_speech]
///     prompt_mel            float32 [1, 2*N_speech, 80]
///     spk_embedding         float32 [1, 192]
///     (any other tensors are ignored)
/// <name>.json
///     { "prompt_text": "...", "tts_text": "..." }
/// ```
public struct CosyVoice3PromptAssets: Sendable {

    /// Prompt text seed. MUST contain `<|endofprompt|>` (id 151646).
    public let promptText: String

    /// Discrete speech token prefix fed to Flow AND used to build the LLM
    /// prefill embed table.
    public let promptSpeechIds: [Int32]

    /// Mel frames computed from the prompt WAV (`[1, 2*N_speech, 80]` fp32),
    /// flattened row-major as `[frames * 80]`.
    public let promptMel: [Float]
    /// Number of mel frames in `promptMel`.
    public let promptMelFrames: Int

    /// CAMPPlus speaker embedding for the prompt voice (`[1, 192]` fp32).
    public let spkEmbedding: [Float]

    public init(
        promptText: String,
        promptSpeechIds: [Int32],
        promptMel: [Float],
        promptMelFrames: Int,
        spkEmbedding: [Float]
    ) {
        self.promptText = promptText
        self.promptSpeechIds = promptSpeechIds
        self.promptMel = promptMel
        self.promptMelFrames = promptMelFrames
        self.spkEmbedding = spkEmbedding
    }

    /// Load from `<name>.safetensors` + `<name>.json` sidecar.
    ///
    /// - Parameter url: URL to the `.safetensors` file. The sidecar is expected
    ///   next to it with the same basename and `.json` extension.
    /// - Throws: `CosyVoice3Error.invalidFixture` on shape/dtype mismatch or a
    ///   missing/undecodable sidecar.
    public static func load(from url: URL) throws -> CosyVoice3PromptAssets {
        let file = try SafetensorsFile(url: url)

        let idsInfo = try file.info("llm_prompt_speech_ids")
        guard idsInfo.shape.count == 2, idsInfo.shape[0] == 1 else {
            throw CosyVoice3Error.invalidFixture(
                "llm_prompt_speech_ids expects [1, N], got \(idsInfo.shape)")
        }
        let promptSpeechIds = try file.asInt32("llm_prompt_speech_ids")

        let melInfo = try file.info("prompt_mel")
        guard
            melInfo.dtype == .f32,
            melInfo.shape.count == 3,
            melInfo.shape[0] == 1,
            melInfo.shape[2] == CosyVoice3Constants.melBins
        else {
            throw CosyVoice3Error.invalidFixture(
                "prompt_mel expects [1, frames, 80] fp32, got \(melInfo.shape)")
        }
        let promptMel = try file.asFloat32("prompt_mel")
        let promptMelFrames = melInfo.shape[1]

        let spkInfo = try file.info("spk_embedding")
        guard
            spkInfo.dtype == .f32,
            spkInfo.shape == [1, CosyVoice3Constants.speakerEmbeddingDim]
        else {
            throw CosyVoice3Error.invalidFixture(
                "spk_embedding expects [1, 192] fp32, got \(spkInfo.shape)")
        }
        let spkEmbedding = try file.asFloat32("spk_embedding")

        // Sidecar JSON next to the tensors; only `prompt_text` is read,
        // extra keys are ignored by the decoder.
        let sidecarURL = url.deletingPathExtension().appendingPathExtension("json")
        guard FileManager.default.fileExists(atPath: sidecarURL.path) else {
            throw CosyVoice3Error.invalidFixture(
                "prompt sidecar JSON not found next to \(url.lastPathComponent) — expected \(sidecarURL.lastPathComponent)"
            )
        }
        struct Sidecar: Decodable { let prompt_text: String }
        let sidecar: Sidecar
        do {
            sidecar = try JSONDecoder().decode(
                Sidecar.self, from: try Data(contentsOf: sidecarURL))
        } catch {
            throw CosyVoice3Error.invalidFixture(
                "failed to decode \(sidecarURL.lastPathComponent): \(error)")
        }

        return CosyVoice3PromptAssets(
            promptText: sidecar.prompt_text,
            promptSpeechIds: promptSpeechIds,
            promptMel: promptMel,
            promptMelFrames: promptMelFrames,
            spkEmbedding: spkEmbedding)
    }
}
/// On-device mel spectrogram extractor for CosyVoice3 prompt audio.
///
/// Matches `matcha.utils.audio.mel_spectrogram` as invoked from
/// `cosyvoice/cli/frontend.py:_extract_speech_feat` with the CosyVoice3
/// config: n_fft=1920, num_mels=80, sr=24000, hop=480, win=1920, fmin=0,
/// fmax=12000, center=False.
///
/// Pipeline (from the Python reference):
/// 1. reflect-pad the waveform by `(n_fft - hop_size) / 2 = 720` per side
/// 2. framed STFT with periodic Hann window, `center=False`
/// 3. magnitude = `sqrt(real² + imag² + 1e-9)`
/// 4. `mel = mel_basis @ magnitude` using a Slaney-normalized filterbank
/// 5. `log_mel = log(clamp(mel, min=1e-5))`
///
/// Output is flattened `[T, 80]` row-major fp32 — the layout
/// `CosyVoice3PromptAssets.promptMel` stores. Use `trimToTokenRatio(...)`
/// to enforce `frames == 2 * N_speech` before passing to Flow.
public final class CosyVoice3PromptMel: @unchecked Sendable {

    public static let sampleRate = 24_000
    public static let nFFT = 1_920
    public static let hopSize = 480
    public static let winSize = 1_920
    public static let numMels = 80
    public static let fMin: Float = 0
    public static let fMax: Float = 12_000  // sr / 2
    /// Reflect-pad each side by `(n_fft - hop_size) / 2`.
    public static let padLength = (nFFT - hopSize) / 2  // 720
    /// Magnitude epsilon before sqrt (kept for bit parity with the reference).
    private static let magEps: Float = 1e-9
    /// Log floor clamp applied inside `log(clamp(x, min=1e-5))`.
    private static let logFloor: Float = 1e-5

    // Precomputed resources
    private let hannWindow: [Float]
    private let melBasis: [Float]  // flat [numMels * numFreqBins]
    private let numFreqBins: Int
    private var fftSetup: vDSP_DFT_Setup?

    // Reusable scratch buffers (not thread-safe; wrap with a queue if shared).
    private var frameBuf: [Float]
    private var realIn: [Float]
    private var imagIn: [Float]
    private var realOut: [Float]
    private var imagOut: [Float]
    private var magnitude: [Float]
    private var imagSq: [Float]

    public init() {
        self.numFreqBins = Self.nFFT / 2 + 1
        // torch.hann_window(N) defaults to periodic=True — sample i is
        // `0.5 * (1 - cos(2πi/N))`, matching Matcha's torch.stft path.
        self.hannWindow = Self.hannWindowPeriodic(length: Self.winSize)
        self.melBasis = Self.buildSlaneyMelBasis(
            sampleRate: Self.sampleRate,
            nFFT: Self.nFFT,
            numMels: Self.numMels,
            fMin: Self.fMin,
            fMax: Self.fMax)
        self.fftSetup = vDSP_DFT_zop_CreateSetup(nil, vDSP_Length(Self.nFFT), .FORWARD)
        self.frameBuf = [Float](repeating: 0, count: Self.nFFT)
        self.realIn = [Float](repeating: 0, count: Self.nFFT)
        self.imagIn = [Float](repeating: 0, count: Self.nFFT)
        self.realOut = [Float](repeating: 0, count: Self.nFFT)
        self.imagOut = [Float](repeating: 0, count: Self.nFFT)
        self.magnitude = [Float](repeating: 0, count: numFreqBins)
        self.imagSq = [Float](repeating: 0, count: numFreqBins)
    }

    deinit {
        if let setup = fftSetup {
            vDSP_DFT_DestroySetup(setup)
        }
    }

    public struct Result: Sendable {
        /// `[frames * numMels]` row-major, fp32.
        public let mel: [Float]
        public let frames: Int
    }

    /// Compute the log-mel spectrogram for a 24 kHz mono waveform.
    ///
    /// - Parameter audio: fp32 PCM samples at 24 kHz, range ≈ [-1, 1].
    /// - Returns: `[T * 80]` row-major fp32 mel, where
    ///   `T = floor((len + 2·padLength - nFFT) / hopSize) + 1`.
    /// - Throws: `CosyVoice3Error.invalidShape` when the DFT setup failed.
    public func compute(audio: [Float]) throws -> Result {
        guard let setup = fftSetup else {
            throw CosyVoice3Error.invalidShape("vDSP_DFT setup failed")
        }
        guard audio.count > 0 else {
            return Result(mel: [], frames: 0)
        }

        let padded = Self.reflectPad(audio, pad: Self.padLength)
        let paddedCount = padded.count
        let frames = max(0, (paddedCount - Self.nFFT) / Self.hopSize + 1)
        guard frames > 0 else {
            return Result(mel: [], frames: 0)
        }

        var mel = [Float](repeating: 0, count: frames * Self.numMels)

        for frameIdx in 0..<frames {
            // NOTE(review): frame extraction + windowing reconstructed from a
            // garbled source span (copy nFFT samples starting at
            // frameIdx*hop, multiply by the periodic Hann window into the
            // DFT real input) — confirm against the original file.
            let start = frameIdx * Self.hopSize
            padded.withUnsafeBufferPointer { p in
                frameBuf.withUnsafeMutableBufferPointer { f in
                    memcpy(
                        f.baseAddress!,
                        p.baseAddress!.advanced(by: start),
                        Self.nFFT * MemoryLayout<Float>.size)
                }
            }
            vDSP_vmul(frameBuf, 1, hannWindow, 1, &realIn, 1, vDSP_Length(Self.nFFT))
            vDSP_vclr(&imagIn, 1, vDSP_Length(Self.nFFT))
            vDSP_DFT_Execute(setup, realIn, imagIn, &realOut, &imagOut)

            // magnitude = sqrt(real² + imag² + 1e-9) over one-sided bins.
            vDSP_vsq(realOut, 1, &magnitude, 1, vDSP_Length(numFreqBins))
            vDSP_vsq(imagOut, 1, &imagSq, 1, vDSP_Length(numFreqBins))
            vDSP_vadd(magnitude, 1, imagSq, 1, &magnitude, 1, vDSP_Length(numFreqBins))
            var eps = Self.magEps
            vDSP_vsadd(magnitude, 1, &eps, &magnitude, 1, vDSP_Length(numFreqBins))
            var n = Int32(numFreqBins)
            vvsqrtf(&magnitude, magnitude, &n)

            // mel = melBasis[80, numFreqBins] @ magnitude[numFreqBins]
            var melFrame = [Float](repeating: 0, count: Self.numMels)
            melBasis.withUnsafeBufferPointer { basisPtr in
                magnitude.withUnsafeBufferPointer { magPtr in
                    melFrame.withUnsafeMutableBufferPointer { outPtr in
                        vDSP_mmul(
                            basisPtr.baseAddress!, 1,
                            magPtr.baseAddress!, 1,
                            outPtr.baseAddress!, 1,
                            vDSP_Length(Self.numMels),
                            vDSP_Length(1),
                            vDSP_Length(numFreqBins))
                    }
                }
            }

            // log(clamp(x, min=1e-5))
            for m in 0..<Self.numMels {
                mel[frameIdx * Self.numMels + m] = log(max(melFrame[m], Self.logFloor))
            }
        }

        return Result(mel: mel, frames: frames)
    }

    /// Clamp a computed mel to `2 * tokenCount` frames, matching the
    /// `speech_feat[:, :2 * token_len]` clamp in the Python frontend.
    ///
    /// NOTE(review): parameter list reconstructed from a garbled source span —
    /// confirm the exact signature against the original file.
    public static func trimToTokenRatio(
        mel: [Float], frames: Int, tokenCount: Int
    ) throws -> (mel: [Float], frames: Int) {
        let targetFrames = 2 * tokenCount
        guard frames >= targetFrames else {
            throw CosyVoice3Error.invalidShape(
                "prompt mel has \(frames) frames but tokenCount=\(tokenCount) requires \(targetFrames)"
            )
        }
        if frames == targetFrames {
            return (mel, frames)
        }
        let trimmed = Array(mel.prefix(targetFrames * numMels))
        return (trimmed, targetFrames)
    }

    // MARK: - Helpers

    /// PyTorch `F.pad(..., mode="reflect")` on a 1-D signal:
    /// left mirror `[y[pad], ..., y[1]]`, core `y`, right mirror
    /// `[y[n-2], ..., y[n-1-pad]]`.
    static func reflectPad(_ y: [Float], pad: Int) -> [Float] {
        let n = y.count
        if pad <= 0 { return y }
        // PyTorch requires pad < n for reflect. Guard loudly for a silently
        // bad prompt (very short audio).
        precondition(pad < n, "reflect pad=\(pad) requires signal length > \(pad), got \(n)")
        var out = [Float](repeating: 0, count: n + 2 * pad)
        for i in 0..<pad {
            out[i] = y[pad - i]
        }
        for i in 0..<n {
            out[pad + i] = y[i]
        }
        for i in 0..<pad {
            out[pad + n + i] = y[n - 2 - i]
        }
        return out
    }

    /// Periodic Hann window of `length` samples: `0.5 * (1 - cos(2πi/N))`.
    static func hannWindowPeriodic(length: Int) -> [Float] {
        var w = [Float](repeating: 0, count: length)
        let divisor = Float(length)
        for i in 0..<length {
            w[i] = 0.5 * (1 - cos(2 * Float.pi * Float(i) / divisor))
        }
        return w
    }

    /// Slaney-normalized triangular mel filterbank (librosa defaults:
    /// HTK=False, norm='slaney'), flattened `[numMels * numFreqBins]`.
    ///
    /// NOTE(review): the fft-frequency grid and the `2/(fRight-fLeft)` Slaney
    /// area normalization were reconstructed from a garbled source span —
    /// confirm against the original file.
    static func buildSlaneyMelBasis(
        sampleRate: Int, nFFT: Int, numMels: Int, fMin: Float, fMax: Float
    ) -> [Float] {
        let numFreqBins = nFFT / 2 + 1

        let melMin = hzToMelSlaney(fMin)
        let melMax = hzToMelSlaney(fMax)

        // numMels + 2 edge frequencies, equally spaced on the mel scale.
        var melPoints = [Float](repeating: 0, count: numMels + 2)
        for i in 0..<(numMels + 2) {
            let mel = melMin + Float(i) * (melMax - melMin) / Float(numMels + 1)
            melPoints[i] = melToHzSlaney(mel)
        }

        var fftFreqs = [Float](repeating: 0, count: numFreqBins)
        for i in 0..<numFreqBins {
            fftFreqs[i] = Float(i) * Float(sampleRate) / Float(nFFT)
        }

        var basis = [Float](repeating: 0, count: numMels * numFreqBins)
        for m in 0..<numMels {
            let fLeft = melPoints[m]
            let fCenter = melPoints[m + 1]
            let fRight = melPoints[m + 2]
            let norm: Float = 2.0 / (fRight - fLeft)
            for f in 0..<numFreqBins {
                let freq = fftFreqs[f]
                var w: Float = 0
                if freq >= fLeft && freq < fCenter {
                    w = norm * (freq - fLeft) / (fCenter - fLeft)
                } else if freq >= fCenter && freq <= fRight {
                    w = norm * (fRight - freq) / (fRight - fCenter)
                }
                basis[m * numFreqBins + f] = w
            }
        }
        return basis
    }

    /// Slaney Hz → mel: linear below 1 kHz, logarithmic above.
    static func hzToMelSlaney(_ hz: Float) -> Float {
        let fSp: Float = 200.0 / 3.0
        let minLogHz: Float = 1_000.0
        let minLogMel: Float = minLogHz / fSp
        let logStep: Float = log(6.4) / 27.0
        return hz >= minLogHz
            ? minLogMel + log(hz / minLogHz) / logStep
            : hz / fSp
    }

    /// Slaney mel → Hz: inverse of `hzToMelSlaney`.
    static func melToHzSlaney(_ mel: Float) -> Float {
        let fSp: Float = 200.0 / 3.0
        let minLogHz: Float = 1_000.0
        let minLogMel: Float = minLogHz / fSp
        let logStep: Float = log(6.4) / 27.0
        return mel >= minLogMel
            ? minLogHz * exp(logStep * (mel - minLogMel))
            : fSp * mel
    }
}
/// mmap'd reader for the Qwen2 `text_embedding` and CosyVoice3
/// `speech_embedding` tables (both fp32, `[vocab, 896]`). Used by the Phase 2
/// text frontend to assemble `lm_input_embeds` natively in Swift.
///
/// `@unchecked Sendable`: all stored properties are immutable after init.
public final class CosyVoice3TextEmbeddings: @unchecked Sendable {

    private let file: SafetensorsFile
    // Raw fp32 row-major bytes of each table, backed by the mmap'd file.
    private let textBytes: Data
    private let speechBytes: Data
    public let textVocab: Int
    public let speechVocab: Int
    public let embedDim: Int

    /// Open the safetensors file and validate both tables' shape/dtype.
    /// - Throws: `embeddingTableMissing` / `invalidShape` on any mismatch.
    public init(url: URL) throws {
        let file = try SafetensorsFile(url: url)
        guard let text = file.tensors["text_embedding"] else {
            throw CosyVoice3Error.embeddingTableMissing("text_embedding")
        }
        guard let speech = file.tensors["speech_embedding"] else {
            throw CosyVoice3Error.embeddingTableMissing("speech_embedding")
        }
        guard text.dtype == .f32, text.shape.count == 2 else {
            throw CosyVoice3Error.invalidShape(
                "text_embedding expects [vocab, 896] fp32, got shape=\(text.shape) dtype=\(text.dtype.rawValue)"
            )
        }
        guard speech.dtype == .f32, speech.shape.count == 2 else {
            throw CosyVoice3Error.invalidShape(
                "speech_embedding expects [vocab, 896] fp32, got shape=\(speech.shape) dtype=\(speech.dtype.rawValue)"
            )
        }
        guard text.shape[1] == speech.shape[1] else {
            throw CosyVoice3Error.invalidShape(
                "text_embedding dim=\(text.shape[1]) != speech_embedding dim=\(speech.shape[1])"
            )
        }
        self.file = file
        self.textBytes = try file.rawBytes("text_embedding")
        self.speechBytes = try file.rawBytes("speech_embedding")
        self.textVocab = text.shape[0]
        self.speechVocab = speech.shape[0]
        self.embedDim = text.shape[1]
        guard self.embedDim == CosyVoice3Constants.embedDim else {
            throw CosyVoice3Error.invalidShape(
                "embed_dim=\(embedDim) does not match CosyVoice3Constants.embedDim=\(CosyVoice3Constants.embedDim)"
            )
        }
    }

    /// Assemble LLM-Prefill input:
    /// `lm_input = concat([sos, text_embedding[text_ids], task_id, speech_embedding[prompt_speech_ids]], dim=1)`
    ///
    /// Returns a `[1, T_pre, 896]` fp32 MLMultiArray and
    /// `T_pre = 1 + N_text + 1 + N_speech`. The LLM-Prefill model expects T
    /// padded to 256; this method returns the unpadded tensor — callers must
    /// pad or pass `T_pre` separately.
    public func assembleLmInput(
        textTokenIds: [Int32],
        promptSpeechIds: [Int32],
        sos: Int32 = CosyVoice3Constants.sosId,
        taskId: Int32 = CosyVoice3Constants.taskId
    ) throws -> (embeds: MLMultiArray, tPre: Int) {
        let nText = textTokenIds.count
        let nSpeech = promptSpeechIds.count
        let tPre = 1 + nText + 1 + nSpeech
        let dim = embedDim
        let array = try MLMultiArray(
            shape: [1, NSNumber(value: tPre), NSNumber(value: dim)],
            dataType: .float32)
        let strides = array.strides.map { $0.intValue }
        let dst = array.dataPointer.bindMemory(to: Float.self, capacity: array.count)

        // Row t (within the T_pre axis) → destination pointer.
        func row(_ t: Int) -> UnsafeMutablePointer<Float> {
            dst.advanced(by: t * strides[1])
        }

        // 1) sos
        try copySpeechRow(sos, into: row(0), stride: strides[2])
        // 2) text_embedding[text_ids]
        for (i, id) in textTokenIds.enumerated() {
            try copyTextRow(id, into: row(1 + i), stride: strides[2])
        }
        // 3) task_id
        try copySpeechRow(taskId, into: row(1 + nText), stride: strides[2])
        // 4) speech_embedding[prompt_speech_ids]
        for (i, id) in promptSpeechIds.enumerated() {
            try copySpeechRow(id, into: row(1 + nText + 1 + i), stride: strides[2])
        }

        return (array, tPre)
    }

    // MARK: - Row copy

    /// Copy one fp32 row of the text table into `dst` with element `stride`.
    /// NOTE(review): the strided fallback loop was reconstructed from a
    /// garbled source span — confirm against the original file.
    private func copyTextRow(
        _ id: Int32, into dst: UnsafeMutablePointer<Float>, stride: Int
    ) throws {
        guard id >= 0 && Int(id) < textVocab else {
            throw CosyVoice3Error.invalidShape(
                "text token id \(id) out of range [0, \(textVocab))")
        }
        let rowStart = Int(id) * embedDim * 4
        textBytes.withUnsafeBytes { src in
            let basePtr = src.baseAddress!.advanced(by: rowStart)
                .assumingMemoryBound(to: Float.self)
            if stride == 1 {
                memcpy(dst, basePtr, embedDim * 4)
            } else {
                for i in 0..<embedDim {
                    dst[i * stride] = basePtr[i]
                }
            }
        }
    }

    /// Copy one fp32 row of the speech table into `dst` with element `stride`.
    /// Mirror of `copyTextRow` over the speech table.
    private func copySpeechRow(
        _ id: Int32, into dst: UnsafeMutablePointer<Float>, stride: Int
    ) throws {
        guard id >= 0 && Int(id) < speechVocab else {
            throw CosyVoice3Error.invalidShape(
                "speech token id \(id) out of range [0, \(speechVocab))")
        }
        let rowStart = Int(id) * embedDim * 4
        speechBytes.withUnsafeBytes { src in
            let basePtr = src.baseAddress!.advanced(by: rowStart)
                .assumingMemoryBound(to: Float.self)
            if stride == 1 {
                memcpy(dst, basePtr, embedDim * 4)
            } else {
                for i in 0..<embedDim {
                    dst[i * stride] = basePtr[i]
                }
            }
        }
    }
}
CosyVoice3Error.invalidShape( + "speech token id \(id) out of range [0, \(speechVocab))") + } + let rowStart = Int(id) * embedDim * 4 + speechBytes.withUnsafeBytes { src in + let basePtr = src.baseAddress!.advanced(by: rowStart) + .assumingMemoryBound(to: Float.self) + if stride == 1 { + memcpy(dst, basePtr, embedDim * 4) + } else { + for i in 0..` token + /// (id 151646). The Python pipeline asserts this in + /// `cosyvoice/llm.py:478`. + public func assemble( + promptText: String, + ttsText: String, + promptSpeechIds: [Int32] + ) throws -> Assembled { + let promptIds = tokenizer.encode(promptText) + let ttsIds = tokenizer.encode(ttsText) + // Python asserts 151646 is present somewhere in the combined token + // stream. Enforce here to avoid silent parity breakage. + let endOfPrompt: Int32 = 151_646 + guard promptIds.contains(endOfPrompt) || ttsIds.contains(endOfPrompt) else { + throw CosyVoice3Error.invalidShape( + "<|endofprompt|> (id 151646) not present in promptText or ttsText") + } + let combined = promptIds + ttsIds + + let (embeds, tPre) = try embeddings.assembleLmInput( + textTokenIds: combined, + promptSpeechIds: promptSpeechIds) + guard tPre <= CosyVoice3Constants.prefillLength else { + throw CosyVoice3Error.invalidShape( + "assembled T_pre=\(tPre) exceeds LLM-Prefill length \(CosyVoice3Constants.prefillLength)" + ) + } + return Assembled(lmInputEmbeds: embeds, tPre: tPre, textTokenIds: combined) + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2BpeTokenizer.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2BpeTokenizer.swift new file mode 100644 index 000000000..0154ecae1 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2BpeTokenizer.swift @@ -0,0 +1,230 @@ +import Foundation + +/// Qwen2 byte-level BPE tokenizer. 
Mirrors +/// `transformers.models.qwen2.tokenization_qwen2.Qwen2Tokenizer` on the slow +/// path used by CosyVoice3 (`AutoTokenizer.from_pretrained(...)` + runtime +/// `add_special_tokens(...)` as done in `CosyVoice3Tokenizer`). +/// +/// Encoding pipeline: +/// 1. Split input on registered special tokens (longest-match first). Special +/// chunks map 1:1 to their fixed ID. +/// 2. Pretokenize non-special chunks with Qwen2's regex. +/// 3. UTF-8 encode each match and remap bytes via `Qwen2ByteEncoder`. +/// 4. Apply BPE merges (lowest rank wins, all occurrences merged per pass). +/// 5. Look up the resulting symbols in `vocab.json` to get token IDs. +/// +/// Loader accepts the standard HuggingFace asset layout: +/// /vocab.json — {"symbol": id, ...} +/// /merges.txt — first line is a header or the first merge; +/// subsequent lines are "A B" pairs, rank = line idx. +/// Special tokens are passed in separately (from a JSON map exported alongside +/// the CosyVoice3 fixtures — the runtime add_special_tokens list in Python is +/// not encoded in the HF assets). +public final class Qwen2BpeTokenizer: @unchecked Sendable { + + public enum Error: Swift.Error, LocalizedError { + case fileNotFound(URL) + case invalidJSON(String) + case missingField(String) + case regexCompileFailed + + public var errorDescription: String? { + switch self { + case .fileNotFound(let url): return "file not found: \(url.path)" + case .invalidJSON(let m): return "invalid JSON: \(m)" + case .missingField(let f): return "missing field: \(f)" + case .regexCompileFailed: return "failed to compile pretokenize regex" + } + } + } + + /// Qwen2 pretokenize regex (see `transformers` PRETOKENIZE_REGEX). + /// Matches: contractions, letter words, single digits, punctuation runs, + /// newline-led whitespace, trailing whitespace. 
+ public static let pretokenizePattern = + #"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"# + + private let vocab: [String: Int32] + private let mergeRanks: [String: Int] // "firstSpace second" -> rank + private let specialTokens: [String: Int32] + private let specialPattern: NSRegularExpression? + private let pretokenizeRegex: NSRegularExpression + + public init( + vocab: [String: Int32], + merges: [(String, String)], + specialTokens: [String: Int32] + ) throws { + self.vocab = vocab + var ranks: [String: Int] = [:] + ranks.reserveCapacity(merges.count) + for (i, pair) in merges.enumerated() { + ranks["\(pair.0) \(pair.1)"] = i + } + self.mergeRanks = ranks + self.specialTokens = specialTokens + + if !specialTokens.isEmpty { + // Longest-first so `<|endofprompt|>` wins over `<|end`. + let ordered = specialTokens.keys.sorted { $0.count > $1.count } + let alternation = ordered.map { NSRegularExpression.escapedPattern(for: $0) } + .joined(separator: "|") + self.specialPattern = try NSRegularExpression(pattern: alternation) + } else { + self.specialPattern = nil + } + + do { + self.pretokenizeRegex = try NSRegularExpression(pattern: Self.pretokenizePattern) + } catch { + throw Error.regexCompileFailed + } + } + + /// Load vocab.json + merges.txt from a directory and attach the runtime + /// special-token map (must be supplied externally; Python `AutoTokenizer` + /// adds these at import time via `add_special_tokens`). 
+ public static func load( + directory: URL, + specialTokens: [String: Int32] + ) throws -> Qwen2BpeTokenizer { + let vocabURL = directory.appendingPathComponent("vocab.json") + let mergesURL = directory.appendingPathComponent("merges.txt") + guard FileManager.default.fileExists(atPath: vocabURL.path) else { + throw Error.fileNotFound(vocabURL) + } + guard FileManager.default.fileExists(atPath: mergesURL.path) else { + throw Error.fileNotFound(mergesURL) + } + + let vocabData = try Data(contentsOf: vocabURL) + guard let raw = try JSONSerialization.jsonObject(with: vocabData) as? [String: Int] else { + throw Error.invalidJSON("vocab.json is not {String: Int}") + } + var vocab: [String: Int32] = [:] + vocab.reserveCapacity(raw.count) + for (k, v) in raw { vocab[k] = Int32(v) } + + let mergesText = try String(contentsOf: mergesURL, encoding: .utf8) + var merges: [(String, String)] = [] + merges.reserveCapacity(140_000) + var isFirst = true + for line in mergesText.split(separator: "\n", omittingEmptySubsequences: true) { + if isFirst { + isFirst = false + // Typical merges.txt header: "#version: 0.2". Skip it. + if line.hasPrefix("#") { continue } + } + let parts = line.split(separator: " ", maxSplits: 1) + guard parts.count == 2 else { continue } + merges.append((String(parts[0]), String(parts[1]))) + } + + return try Qwen2BpeTokenizer(vocab: vocab, merges: merges, specialTokens: specialTokens) + } + + /// Encode text to token IDs. + public func encode(_ text: String) -> [Int32] { + var out: [Int32] = [] + splitBySpecial(text) { chunk, isSpecial in + if isSpecial { + if let id = specialTokens[chunk] { out.append(id) } + return + } + pretokenize(chunk) { piece in + let mapped = Qwen2ByteEncoder.encode(piece.utf8) + let bpeTokens = bpe(mapped) + for tok in bpeTokens { + if let id = vocab[tok] { + out.append(id) + } else if let id = specialTokens[tok] { + out.append(id) + } + // Unknown token: Qwen2 has no . 
Drop silently as + // upstream never produces one for valid UTF-8 input. + } + } + } + return out + } + + // MARK: - Special token split + + private func splitBySpecial(_ text: String, _ handle: (String, Bool) -> Void) { + guard let regex = specialPattern, !text.isEmpty else { + if !text.isEmpty { handle(text, false) } + return + } + let ns = text as NSString + let range = NSRange(location: 0, length: ns.length) + var cursor = 0 + regex.enumerateMatches(in: text, options: [], range: range) { match, _, _ in + guard let m = match else { return } + if m.range.location > cursor { + let sub = ns.substring(with: NSRange(location: cursor, length: m.range.location - cursor)) + if !sub.isEmpty { handle(sub, false) } + } + handle(ns.substring(with: m.range), true) + cursor = m.range.location + m.range.length + } + if cursor < ns.length { + let sub = ns.substring(with: NSRange(location: cursor, length: ns.length - cursor)) + if !sub.isEmpty { handle(sub, false) } + } + } + + // MARK: - Pretokenize + + private func pretokenize(_ text: String, _ handle: (String) -> Void) { + guard !text.isEmpty else { return } + let ns = text as NSString + let range = NSRange(location: 0, length: ns.length) + pretokenizeRegex.enumerateMatches(in: text, options: [], range: range) { match, _, _ in + guard let m = match else { return } + if m.range.length > 0 { + handle(ns.substring(with: m.range)) + } + } + } + + // MARK: - BPE + + /// Standard GPT-2 BPE: repeatedly merge the lowest-rank adjacent pair + /// until no pair is mergeable, then return the final symbol list. 
+ private func bpe(_ text: String) -> [String] { + if text.isEmpty { return [] } + var symbols = text.map { String($0) } + if symbols.count < 2 { return symbols } + + while true { + var bestRank = Int.max + var bestIndex = -1 + for i in 0..<(symbols.count - 1) { + let key = "\(symbols[i]) \(symbols[i + 1])" + if let r = mergeRanks[key], r < bestRank { + bestRank = r + bestIndex = i + } + } + if bestIndex < 0 { break } + + let first = symbols[bestIndex] + let second = symbols[bestIndex + 1] + var merged: [String] = [] + merged.reserveCapacity(symbols.count - 1) + var i = 0 + while i < symbols.count { + if i < symbols.count - 1 && symbols[i] == first && symbols[i + 1] == second { + merged.append(first + second) + i += 2 + } else { + merged.append(symbols[i]) + i += 1 + } + } + symbols = merged + if symbols.count < 2 { break } + } + return symbols + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2ByteEncoder.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2ByteEncoder.swift new file mode 100644 index 000000000..7a1116b9d --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2ByteEncoder.swift @@ -0,0 +1,56 @@ +import Foundation + +/// GPT-2 style reversible byte-to-unicode mapping used by Qwen2 BPE. +/// +/// Mirrors `transformers.models.qwen2.tokenization_qwen2.bytes_to_unicode`: +/// - Printable ASCII, Latin-1 supplement (¡..¬), and (®..ÿ) map to themselves. +/// - The 68 "unprintable" bytes are remapped to code points 256..323. +/// +/// After mapping, every byte of a UTF-8 string becomes a single-code-point +/// unicode character that vocab/merges.txt expect. +public enum Qwen2ByteEncoder { + + /// byte (0..255) → single Unicode scalar. 
+ public static let byteToUnicode: [Character] = { + var map = [Character](repeating: Character(" "), count: 256) + var printable = [Int]() + printable.reserveCapacity(188) + printable.append(contentsOf: Int(Character("!").asciiValue!)...Int(Character("~").asciiValue!)) + printable.append(contentsOf: 0xA1...0xAC) + printable.append(contentsOf: 0xAE...0xFF) + + for b in printable { + map[b] = Character(UnicodeScalar(b)!) + } + + var extra = 0 + for b in 0..<256 { + if !printable.contains(b) { + let scalar = UnicodeScalar(256 + extra)! + map[b] = Character(scalar) + extra += 1 + } + } + return map + }() + + /// Inverse table: Unicode scalar value → byte (0..255). Built lazily. + public static let unicodeToByte: [UInt32: UInt8] = { + var dict: [UInt32: UInt8] = [:] + dict.reserveCapacity(256) + for (b, ch) in byteToUnicode.enumerated() { + let scalar = ch.unicodeScalars.first!.value + dict[scalar] = UInt8(b) + } + return dict + }() + + /// Encode a UTF-8 byte sequence as a string of mapped characters. + public static func encode(_ bytes: some Sequence) -> String { + var out = "" + for b in bytes { + out.append(byteToUnicode[Int(b)]) + } + return out + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3RasSampler.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3RasSampler.swift new file mode 100644 index 000000000..e54f14fa5 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3RasSampler.swift @@ -0,0 +1,175 @@ +import Foundation + +/// RAS (Repetition-Aware Sampling) — top-p nucleus sampling with a repetition +/// mask that re-samples if a token fires too often in the recent window. +/// +/// Mirrors `ras_sampling` in +/// `mobius/.../verify/test_coreml_e2e_fp16.py`: +/// 1. softmax(logp) → stable-sort desc → pick up to `topK` ids until +/// cumulative mass ≥ `topP` +/// 2. multinomial draw within that candidate set +/// 3. 
if the drawn id appears in the last `winSize` decoded tokens at least +/// `winSize * tauR` times, mask it to -inf and re-sample across the full +/// vocab +/// +/// A `seedTokens` mode bypasses the RNG entirely — the sampler just emits the +/// pre-recorded Python token stream one id at a time. This is how the parity +/// harness bit-matches despite the `torch.multinomial` RNG mismatch between +/// PyTorch and Swift. +public final class CosyVoice3RasSampler: @unchecked Sendable { + + public let topP: Float + public let topK: Int + public let winSize: Int + public let tauR: Float + public let vocabSize: Int + + private var rng: SeedableRng + private var seedQueue: [Int32] + private var seedIdx: Int = 0 + + public init( + topP: Float = CosyVoice3Constants.topP, + topK: Int = CosyVoice3Constants.topK, + winSize: Int = CosyVoice3Constants.rasWindow, + tauR: Float = CosyVoice3Constants.rasTauR, + vocabSize: Int = CosyVoice3Constants.speechVocab, + seed: UInt64 = 42 + ) { + self.topP = topP + self.topK = topK + self.winSize = winSize + self.tauR = tauR + self.vocabSize = vocabSize + self.rng = SeedableRng(seed: seed) + self.seedQueue = [] + } + + /// Pre-load a token stream to replay (for parity harness). + public func seedTokens(_ tokens: [Int32]) { + self.seedQueue = tokens + self.seedIdx = 0 + } + + /// Given `logits` of shape `[vocabSize]`, return the sampled token id. + /// `decodedSoFar` is the running decoded stream for repetition checking. + public func sample(logits: [Float], decodedSoFar: [Int32]) -> Int32 { + // Seeded parity replay bypasses sampling. + if seedIdx < seedQueue.count { + let id = seedQueue[seedIdx] + seedIdx += 1 + return id + } + precondition(logits.count == vocabSize, "logits count must match vocabSize") + + // Pass 1: nucleus sampling. + let probs = logits.softmax() + let top = nucleus(probs: probs) + var sampled = top + + // Pass 2: repetition mask. 
+ let windowStart = max(0, decodedSoFar.count - winSize) + let recent = decodedSoFar[windowStart..= Float(winSize) * tauR { + var masked = probs + masked[Int(sampled)] = 0 + // Re-normalize + multinomial across full vocab. + let sum = masked.reduce(0, +) + if sum > 0 { + for i in 0.. Int32 { + // Stable sort descending with index. + let sorted = probs.enumerated().sorted { + if $0.element != $1.element { return $0.element > $1.element } + return $0.offset < $1.offset + } + var cum: Float = 0 + var selIdx: [Int] = [] + var selProb: [Float] = [] + for entry in sorted { + if cum < topP && selProb.count < topK { + cum += entry.element + selProb.append(entry.element) + selIdx.append(entry.offset) + } else { + break + } + } + // Normalize selected candidates and multinomial pick. + let sum = selProb.reduce(0, +) + guard sum > 0 else { return Int32(selIdx.first ?? 0) } + for i in 0.. Int32 { + let u = rng.nextFloat() + var cum: Float = 0 + for (i, p) in probs.enumerated() { + cum += p + if u < cum { return Int32(i) } + } + return Int32(probs.count - 1) + } + + private func multinomialInSet(probs: [Float], ids: [Int]) -> Int { + let u = rng.nextFloat() + var cum: Float = 0 + for (j, p) in probs.enumerated() { + cum += p + if u < cum { return ids[j] } + } + return ids.last ?? 0 + } +} + +// MARK: - Simple deterministic RNG + +/// Linear-congruential PRNG wrapping SplitMix64. Used only as a fallback when +/// parity replay isn't active; the parity harness seeds an explicit token list +/// to dodge `torch.multinomial` divergence. +private struct SeedableRng { + private var state: UInt64 + init(seed: UInt64) { self.state = seed == 0 ? 
0xdead_beef : seed } + mutating func nextUInt64() -> UInt64 { + state &+= 0x9E37_79B9_7F4A_7C15 + var z = state + z = (z ^ (z >> 30)) &* 0xBF58_476D_1CE4_E5B9 + z = (z ^ (z >> 27)) &* 0x94D0_49BB_1331_11EB + return z ^ (z >> 31) + } + mutating func nextFloat() -> Float { + // 24-bit mantissa → [0, 1) + let bits = UInt32(truncatingIfNeeded: nextUInt64() >> 40) + return Float(bits) / Float(1 << 24) + } +} + +// MARK: - Array softmax + +extension Array where Element == Float { + fileprivate func softmax() -> [Float] { + guard let m = self.max() else { return self } + var exps = [Float](repeating: 0, count: self.count) + var sum: Float = 0 + for i in 0.. 0 { + for i in 0.. MLMultiArray { + guard tokenId >= 0 && Int(tokenId) < numTokens else { + throw CosyVoice3Error.invalidShape( + "speech token id \(tokenId) out of range [0, \(numTokens))") + } + let array = try MLMultiArray( + shape: [1, 1, NSNumber(value: embedDim)], + dataType: .float32) + let rowStart = Int(tokenId) * rowByteSize + let dim = embedDim + let lastStride = array.strides.last?.intValue ?? 1 + tableBytes.withUnsafeBytes { src in + let basePtr = src.baseAddress!.advanced(by: rowStart) + let fp16Ptr = basePtr.assumingMemoryBound(to: Float16.self) + let dstPtr = array.dataPointer.bindMemory(to: Float.self, capacity: array.count) + for i in 0.. CosyVoice3SynthesisResult { + + let nPrompt = fixture.promptSpeechIds.count + let roomForNew = CosyVoice3Constants.flowTotalTokens - nPrompt + guard roomForNew > 0 else { + throw CosyVoice3Error.sequenceTooLong(nPrompt) + } + let maxNew: Int = { + if let cap = options.maxNewTokens, cap > 0 { return min(cap, roomForNew) } + return roomForNew + }() + + // Sampler. Parity harness seeds the Python-recorded decode stream. 
+ let sampler = CosyVoice3RasSampler(seed: options.seed) + if options.replayDecodedTokens { + sampler.seedTokens(fixture.decodedTokens) + } + + // 1) Prefill + let (prefillLogits, initialKvK, initialKvV) = try await runPrefill(fixture: fixture) + var kvK = initialKvK + var kvV = initialKvV + + // First token from prefill tail logits. + var decoded: [Int32] = [] + let firstLogits = sliceLastStepLogits( + from: prefillLogits, + tPre: fixture.tPre, + vocab: CosyVoice3Constants.speechVocab) + var topId = sampler.sample(logits: firstLogits, decodedSoFar: decoded) + if CosyVoice3Constants.stopRange.contains(topId) { + logger.info("First token \(topId) is a stop token; no speech generated") + } else { + decoded.append(topId) + } + + // 2) Decode loop + var curLen = fixture.tPre + for step in 1.. (logits: MLMultiArray, kvK: MLMultiArray, kvV: MLMultiArray) { + guard fixture.tPre <= CosyVoice3Constants.prefillLength else { + throw CosyVoice3Error.prefillTooLong(fixture.tPre) + } + // Pad lm_input_embeds from [1, tPre, 896] to [1, 256, 896]. + // Strides may be non-compact (e.g. [T*D_padded, D_padded, 1]). + let embeds = try MLMultiArray( + shape: [ + 1, + NSNumber(value: CosyVoice3Constants.prefillLength), + NSNumber(value: CosyVoice3Constants.embedDim), + ], + dataType: .float32) + let embedDim = CosyVoice3Constants.embedDim + let embedsStrides = embeds.strides.map { $0.intValue } + let dst = embeds.dataPointer.bindMemory(to: Float.self, capacity: embeds.count) + let physicalCount = embedsStrides[0] * embeds.shape[0].intValue + dst.initialize(repeating: 0, count: physicalCount) + for t in 0.. 
(logits: [Float], kvK: MLMultiArray, kvV: MLMultiArray) { + let curLenArr = try MLMultiArray(shape: [1], dataType: .int32) + curLenArr[0] = NSNumber(value: curLen) + + let features: [String: Any] = [ + "inputs_embeds": inputsEmbeds, + "kv_k": kvK, + "kv_v": kvV, + "cur_len": curLenArr, + ] + let provider = try MLDictionaryFeatureProvider(dictionary: features) + let output = try await models.decode.compatPrediction( + from: provider, options: MLPredictionOptions()) + + guard + let logitsArr = output.featureValue(for: "speech_logits")?.multiArrayValue, + let newKvK = output.featureValue(for: "kv_k_out")?.multiArrayValue, + let newKvV = output.featureValue(for: "kv_v_out")?.multiArrayValue + else { + throw CosyVoice3Error.predictionFailed("decode: missing outputs") + } + // logits shape = [1, 1, 6761] fp32; strides may be non-compact. + let count = CosyVoice3Constants.speechVocab + var logits = [Float](repeating: 0, count: count) + let strides = logitsArr.strides.map { $0.intValue } + let vocabStride = strides.last ?? 1 + let base = logitsArr.dataPointer.bindMemory(to: Float.self, capacity: logitsArr.count) + for i in 0.. (mel: MLMultiArray, numPromptMel: Int) { + let N = CosyVoice3Constants.flowTotalTokens + let nPrompt = promptSpeechIds.count + let nNew = decodedTokens.count + let nTotal = nPrompt + nNew + guard nTotal <= N else { + throw CosyVoice3Error.sequenceTooLong(nTotal) + } + // token_total: [1, 250] int32, zero-padded. Respect strides. + let tokenTotal = try MLMultiArray( + shape: [1, NSNumber(value: N)], + dataType: .int32) + let ttStrides = tokenTotal.strides.map { $0.intValue } + let ttPtr = tokenTotal.dataPointer.bindMemory(to: Int32.self, capacity: tokenTotal.count) + let ttPhysical = ttStrides[0] * tokenTotal.shape[0].intValue + ttPtr.initialize(repeating: 0, count: ttPhysical) + for i in 0.. [Float] { + // fullMel logical shape = [1, 80, 500] fp32. Physical strides may be + // non-compact (e.g. [40960, 512, 1]) — use logical indexing. 
+ let hiftFrames = CosyVoice3Constants.hiftMaxFrames + let melBins = CosyVoice3Constants.melBins + let validFrames = min(newMelFrames, hiftFrames) + + let melInput = try MLMultiArray( + shape: [1, NSNumber(value: melBins), NSNumber(value: hiftFrames)], + dataType: .float32) + // melInput strides may also be non-compact — use logical indexing. + let melInputStrides = melInput.strides.map { $0.intValue } + let dstBase = melInput.dataPointer.bindMemory(to: Float.self, capacity: melInput.count) + // Zero-fill entire physical extent (handles padded strides). + let totalPhysical = melInputStrides[0] * melInput.shape[0].intValue + dstBase.initialize(repeating: 0, count: totalPhysical) + + let srcStrides = fullMel.strides.map { $0.intValue } + let srcBase = fullMel.dataPointer.bindMemory(to: Float.self, capacity: fullMel.count) + // fullMel logical: [1, 80, 500]; copy new slice → melInput [1, 80, 500]. + for b in 0.. [Float] { + let strides = logits.strides.map { $0.intValue } + // shape = [1, T, V]; row (time) stride is strides[1], vocab stride is strides[2]. + let rowStride = strides[1] + let vocabStride = strides[2] + let ptr = logits.dataPointer.bindMemory(to: Float.self, capacity: logits.count) + let base = (tPre - 1) * rowStride + var out = [Float](repeating: 0, count: vocab) + for i in 0..": {"dtype": "...", "shape": [...], "data_offsets": [start, end]}, ... }` +/// - raw tensor payload (referenced by offsets above) +/// +/// Used for Phase 1 fixture + speech embedding table mmap. 
+public final class SafetensorsFile: @unchecked Sendable { + + public enum DType: String, Sendable { + case f16 = "F16" + case bf16 = "BF16" + case f32 = "F32" + case f64 = "F64" + case i8 = "I8" + case i16 = "I16" + case i32 = "I32" + case i64 = "I64" + case u8 = "U8" + case u16 = "U16" + case u32 = "U32" + case u64 = "U64" + case bool = "BOOL" + + public var byteSize: Int { + switch self { + case .f16, .bf16, .i16, .u16: return 2 + case .f32, .i32, .u32: return 4 + case .f64, .i64, .u64: return 8 + case .i8, .u8, .bool: return 1 + } + } + } + + public struct TensorInfo: Sendable { + public let dtype: DType + public let shape: [Int] + public let dataStart: Int // absolute offset in file + public let dataEnd: Int + public var byteCount: Int { dataEnd - dataStart } + } + + private let data: Data + private let payloadStart: Int + public let tensors: [String: TensorInfo] + + public init(url: URL) throws { + let data = try Data(contentsOf: url, options: [.alwaysMapped]) + guard data.count >= 8 else { + throw CosyVoice3Error.invalidSafetensors("file smaller than 8 byte header: \(url.path)") + } + self.data = data + + let headerLen: UInt64 = data.withUnsafeBytes { buf in + var v: UInt64 = 0 + memcpy(&v, buf.baseAddress!, 8) + return UInt64(littleEndian: v) + } + let headerEnd = 8 + Int(headerLen) + guard headerEnd <= data.count else { + throw CosyVoice3Error.invalidSafetensors( + "header length \(headerLen) exceeds file size \(data.count)") + } + let headerData = data.subdata(in: 8.. Data { + guard let info = tensors[name] else { + throw CosyVoice3Error.invalidSafetensors("tensor not found: \(name)") + } + return data.subdata(in: info.dataStart.. 
TensorInfo { + guard let info = tensors[name] else { + throw CosyVoice3Error.invalidSafetensors("tensor not found: \(name)") + } + return info + } + + // MARK: - Typed accessors (copying) + + public func asFloat32(_ name: String) throws -> [Float] { + let info = try self.info(name) + let bytes = try rawBytes(name) + switch info.dtype { + case .f32: + return bytes.withUnsafeBytes { buf -> [Float] in + let count = buf.count / 4 + let ptr = buf.bindMemory(to: Float.self) + return Array(UnsafeBufferPointer(start: ptr.baseAddress, count: count)) + } + case .f64: + return bytes.withUnsafeBytes { buf -> [Float] in + let count = buf.count / 8 + let ptr = buf.bindMemory(to: Double.self) + return (0.. [Int32] { + let info = try self.info(name) + let bytes = try rawBytes(name) + switch info.dtype { + case .i32: + return bytes.withUnsafeBytes { buf -> [Int32] in + let count = buf.count / 4 + let ptr = buf.bindMemory(to: Int32.self) + return Array(UnsafeBufferPointer(start: ptr.baseAddress, count: count)) + } + case .i64: + return bytes.withUnsafeBytes { buf -> [Int32] in + let count = buf.count / 8 + let ptr = buf.bindMemory(to: Int64.self) + return (0.. Int { + let values = try asInt32(name) + guard let first = values.first else { + throw CosyVoice3Error.invalidSafetensors("tensor \(name) is empty") + } + return Int(first) + } +} diff --git a/Sources/FluidAudio/TTS/TtsBackend.swift b/Sources/FluidAudio/TTS/TtsBackend.swift index e230bc4cc..dbeb94246 100644 --- a/Sources/FluidAudio/TTS/TtsBackend.swift +++ b/Sources/FluidAudio/TTS/TtsBackend.swift @@ -6,4 +6,6 @@ public enum TtsBackend: Sendable { case kokoro /// PocketTTS — flow-matching language model, autoregressive streaming synthesis. case pocketTts + /// CosyVoice3 — Mandarin zero-shot voice cloning via Qwen2 LM + Flow CFM + HiFT. 
+ case cosyvoice3 } diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3FrontendParityCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3FrontendParityCommand.swift new file mode 100644 index 000000000..70520a561 --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3FrontendParityCommand.swift @@ -0,0 +1,146 @@ +import CoreML +import FluidAudio +import Foundation + +/// Phase 2 text-frontend parity harness. +/// +/// Loads `shipping.safetensors` (expected `lm_input_embeds`, `llm_prompt_speech_ids`) +/// plus its JSON sidecar (`prompt_text`, `tts_text`), tokenizes the text via +/// `Qwen2BpeTokenizer`, assembles via `CosyVoice3TextFrontend`, and compares +/// element-wise against the fixture. +/// +/// Usage: +/// ``` +/// fluidaudio tts --backend cosyvoice3-frontend-parity \ +/// --tokenizer-dir .../cosyvoice3_dl/CosyVoice-BlankEN \ +/// --embeddings-file .../build/embeddings/embeddings-fp32.safetensors \ +/// --fixture .../build/frontend/shipping.safetensors \ +/// --tok-fixture .../build/frontend/tokenizer_fixture.json +/// ``` +enum CosyVoice3FrontendParityCLI { + + private static let logger = AppLogger(category: "CosyVoice3FrontendParityCLI") + + static func run( + tokenizerDir: String, + embeddingsFile: String, + fixturePath: String, + tokFixturePath: String + ) async { + let tokURL = URL( + fileURLWithPath: (tokenizerDir as NSString).expandingTildeInPath, + isDirectory: true) + let embURL = URL(fileURLWithPath: (embeddingsFile as NSString).expandingTildeInPath) + let fixURL = URL(fileURLWithPath: (fixturePath as NSString).expandingTildeInPath) + let tokFixURL = URL(fileURLWithPath: (tokFixturePath as NSString).expandingTildeInPath) + let sidecarURL = fixURL.deletingPathExtension().appendingPathExtension("json") + + struct TokFix: Decodable { + let special_tokens: [String: Int32] + } + struct Sidecar: Decodable { + let prompt_text: String + let tts_text: String + } + + do { + let tokFix = try JSONDecoder().decode( + TokFix.self, from: try 
Data(contentsOf: tokFixURL)) + let sidecar = try JSONDecoder().decode( + Sidecar.self, from: try Data(contentsOf: sidecarURL)) + + let tStart = Date() + let tokenizer = try Qwen2BpeTokenizer.load( + directory: tokURL, specialTokens: tokFix.special_tokens) + let embeddings = try CosyVoice3TextEmbeddings(url: embURL) + logger.info( + "Loaded tokenizer + text_embedding table in \(String(format: "%.2fs", Date().timeIntervalSince(tStart)))" + ) + + let fixture = try CosyVoice3FrontendFixture.load(from: fixURL) + logger.info("Fixture: T_pre=\(fixture.tPre) N_prompt_speech=\(fixture.promptSpeechIds.count)") + + let frontend = CosyVoice3TextFrontend(tokenizer: tokenizer, embeddings: embeddings) + let assembled = try frontend.assemble( + promptText: sidecar.prompt_text, + ttsText: sidecar.tts_text, + promptSpeechIds: fixture.promptSpeechIds) + + print("") + print(" swift T_pre : \(assembled.tPre)") + print(" fixture T_pre : \(fixture.tPre)") + + guard assembled.tPre == fixture.tPre else { + print("T_pre mismatch — tokenization diverged.") + exit(1) + } + + // Element-wise comparison: fixture is compact fp32, swift array + // may have padded strides. + let dim = CosyVoice3Constants.embedDim + let strides = assembled.lmInputEmbeds.strides.map { $0.intValue } + let ptr = assembled.lmInputEmbeds.dataPointer.bindMemory( + to: Float.self, capacity: assembled.lmInputEmbeds.count) + var maxAbs: Double = 0 + var maxAt: (t: Int, d: Int) = (0, 0) + var sumAbs: Double = 0 + var rowMax = [Double](repeating: 0, count: assembled.tPre) + let n = assembled.tPre * dim + for t in 0.. rowMax[t] { rowMax[t] = a } + if a > maxAbs { + maxAbs = a + maxAt = (t, d) + } + } + } + let mae = sumAbs / Double(n) + print(" MAE : \(String(format: "%.6e", mae))") + print(" max|Δ| : \(String(format: "%.6e", maxAbs)) at (t=\(maxAt.t), d=\(maxAt.d))") + + // Show the top-5 worst rows to see if divergence is concentrated + // at sos (t=0), task_id (t=1+nText), or specific text/speech rows. 
+ let N_speech = fixture.promptSpeechIds.count + let nText = assembled.tPre - 2 - N_speech + print( + " layout : sos@0 text@1..\(nText) task@\(1 + nText) speech@\(2 + nText)..\(assembled.tPre - 1)" + ) + let ranked = rowMax.enumerated().sorted { $0.element > $1.element }.prefix(5) + print(" top rows:") + for (t, m) in ranked { + let slot: String + if t == 0 { + slot = "sos" + } else if t == 1 + nText { + slot = "task_id" + } else if t < 1 + nText { + slot = "text[\(t - 1)]" + } else { + slot = "speech[\(t - 2 - nText)]" + } + print( + " t=\(t) \(slot.padding(toLength: 12, withPad: " ", startingAt: 0)) max|Δ|=\(String(format: "%.6e", m))" + ) + } + + // Compare Swift's reconstructed token ids for sanity. + print(" swift textToken ids (first 10): \(assembled.textTokenIds.prefix(10).map { $0 })") + print(" swift textToken ids (last 5) : \(assembled.textTokenIds.suffix(5).map { $0 })") + + if maxAbs > 1e-4 { + print("parity tolerance exceeded (max|Δ| > 1e-4)") + exit(1) + } + print("frontend parity OK") + } catch { + logger.error("Frontend parity failed: \(error)") + exit(2) + } + } +} diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3ParityCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3ParityCommand.swift new file mode 100644 index 000000000..1b017f123 --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3ParityCommand.swift @@ -0,0 +1,195 @@ +import CoreML +import FluidAudio +import Foundation + +/// Phase 1 parity harness CLI for the CosyVoice3 Swift port. 
+/// +/// Usage: +/// ``` +/// fluidaudio tts --backend cosyvoice3-parity \ +/// --fixture .../build/frontend/shipping.safetensors \ +/// --models-dir .../coreml/build \ +/// --reference .../build/wavs/e2e_shipping.wav \ +/// --output .../build/swift_e2e.wav \ +/// --seed 42 +/// ``` +enum CosyVoice3ParityCLI { + + private static let logger = AppLogger(category: "CosyVoice3ParityCLI") + + static func run( + fixturePath: String, + modelsDir: String, + referencePath: String?, + outputPath: String, + seed: UInt64, + cpuOnly: Bool, + replayTokens: Bool + ) async { + let fixtureURL = URL(fileURLWithPath: (fixturePath as NSString).expandingTildeInPath) + let modelsURL = URL( + fileURLWithPath: (modelsDir as NSString).expandingTildeInPath, isDirectory: true) + let outputURL = URL(fileURLWithPath: (outputPath as NSString).expandingTildeInPath) + + let computeUnits: MLComputeUnits = cpuOnly ? .cpuOnly : .cpuAndNeuralEngine + let manager = CosyVoice3TtsManager(directory: modelsURL, computeUnits: computeUnits) + + do { + let tLoad = Date() + try await manager.initialize() + logger.info( + "Loaded CosyVoice3 models in \(String(format: "%.2f", Date().timeIntervalSince(tLoad)))s" + ) + + let options = CosyVoice3ParityOptions( + maxNewTokens: nil, seed: seed, replayDecodedTokens: replayTokens) + + let tSynth = Date() + let result = try await manager.synthesizeFromFixture( + fixtureURL: fixtureURL, options: options) + let synthElapsed = Date().timeIntervalSince(tSynth) + logger.info( + "Synthesized \(result.samples.count) samples (\(String(format: "%.2fs", Double(result.samples.count) / Double(result.sampleRate)))) in \(String(format: "%.2fs", synthElapsed))" + ) + + try writeWAV(samples: result.samples, sampleRate: result.sampleRate, to: outputURL) + logger.info("Wrote WAV: \(outputURL.path)") + + if let refPath = referencePath { + let refURL = URL( + fileURLWithPath: (refPath as NSString).expandingTildeInPath) + let refSamples = try readWAVMono(url: refURL) + let metrics = 
compareWaveforms( + swift: result.samples, reference: refSamples) + print("") + print( + " reference samples : \(refSamples.count) swift samples : \(result.samples.count)" + ) + print( + " MAE : \(String(format: "%.6f", metrics.mae))") + print( + " max|Δ| : \(String(format: "%.6f", metrics.maxAbsDiff))") + print(" SNR : \(String(format: "%.2f dB", metrics.snrDb))") + if metrics.maxAbsDiff > 1e-3 { + logger.warning( + "Parity tolerance exceeded: max|Δ|=\(metrics.maxAbsDiff) > 1e-3") + exit(1) + } + } + } catch { + logger.error("CosyVoice3 parity harness failed: \(error)") + exit(2) + } + } + + // MARK: - WAV IO (un-normalized) + + private static func writeWAV(samples: [Float], sampleRate: Int, to url: URL) throws { + // Clamp to [-1, 1] to avoid int16 overflow; do NOT rescale to max=1. + let numSamples = samples.count + let byteRate = sampleRate * 2 + let dataSize = numSamples * 2 + var header = Data() + header.append("RIFF".data(using: .ascii)!) + header.appendUInt32LE(UInt32(36 + dataSize)) + header.append("WAVE".data(using: .ascii)!) + header.append("fmt ".data(using: .ascii)!) + header.appendUInt32LE(16) + header.appendUInt16LE(1) // PCM + header.appendUInt16LE(1) // mono + header.appendUInt32LE(UInt32(sampleRate)) + header.appendUInt32LE(UInt32(byteRate)) + header.appendUInt16LE(2) // block align + header.appendUInt16LE(16) // bits/sample + header.append("data".data(using: .ascii)!) + header.appendUInt32LE(UInt32(dataSize)) + + var pcm = Data(capacity: dataSize) + for s in samples { + let clipped = max(-1.0, min(1.0, s)) + let i16 = Int16(clipped * 32_767.0) + var le = i16.littleEndian + Swift.withUnsafeBytes(of: &le) { pcm.append(contentsOf: $0) } + } + try (header + pcm).write(to: url) + } + + private static func readWAVMono(url: URL) throws -> [Float] { + let data = try Data(contentsOf: url) + guard data.count > 44 else { + throw CocoaError(.fileReadCorruptFile) + } + // Find 'data' chunk. 
+ var offset = 12 + var dataStart = -1 + var dataSize = 0 + while offset + 8 <= data.count { + let id = data.subdata(in: offset.. 0 else { throw CocoaError(.fileReadCorruptFile) } + let pcm = data.subdata(in: dataStart.. WaveformMetrics { + let n = min(swift.count, reference.count) + guard n > 0 else { return WaveformMetrics(mae: .infinity, maxAbsDiff: .infinity, snrDb: -.infinity) } + var sumAbs: Double = 0 + var maxAbs: Double = 0 + var sumSigSq: Double = 0 + var sumErrSq: Double = 0 + for i in 0.. maxAbs { maxAbs = a } + sumSigSq += Double(reference[i]) * Double(reference[i]) + sumErrSq += diff * diff + } + let snr = sumErrSq > 0 ? 10 * log10(sumSigSq / sumErrSq) : .infinity + return WaveformMetrics(mae: sumAbs / Double(n), maxAbsDiff: maxAbs, snrDb: snr) + } +} + +// MARK: - Data helpers + +extension Data { + fileprivate mutating func appendUInt32LE(_ v: UInt32) { + var le = v.littleEndian + Swift.withUnsafeBytes(of: &le) { self.append(contentsOf: $0) } + } + fileprivate mutating func appendUInt16LE(_ v: UInt16) { + var le = v.littleEndian + Swift.withUnsafeBytes(of: &le) { self.append(contentsOf: $0) } + } + fileprivate func readUInt32LE() -> UInt32 { + self.withUnsafeBytes { buf -> UInt32 in + var v: UInt32 = 0 + memcpy(&v, buf.baseAddress!, 4) + return UInt32(littleEndian: v) + } + } +} diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3TextCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3TextCommand.swift new file mode 100644 index 000000000..c4389ad52 --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3TextCommand.swift @@ -0,0 +1,135 @@ +import CoreML +import FluidAudio +import Foundation + +/// Phase 2 text-driven synthesis CLI for the CosyVoice3 Swift port. +/// +/// Drives `CosyVoice3TtsManager.synthesize(text:promptAssets:options:)` end +/// to end: tokenizer + frontend + LLM + Flow + HiFT, writing a 24 kHz WAV. 
+/// +/// Usage: +/// ``` +/// fluidaudio tts --backend cosyvoice3-text \ +/// --text "希望你以后能够做的比我还好用" \ +/// --models-dir .../coreml/build \ +/// --tokenizer-dir .../cosyvoice3_dl/CosyVoice-BlankEN \ +/// --embeddings-file .../build/embeddings/embeddings-runtime-fp32.safetensors \ +/// --special-tokens-file .../build/frontend/tokenizer_fixture.json \ +/// --prompt-assets .../build/frontend/shipping.safetensors \ +/// --output .../build/swift_cv3_text.wav \ +/// --seed 42 +/// ``` +enum CosyVoice3TextCLI { + + private static let logger = AppLogger(category: "CosyVoice3TextCLI") + + static func run( + text: String, + modelsDir: String, + tokenizerDir: String, + embeddingsFile: String, + specialTokensFile: String, + promptAssetsPath: String, + outputPath: String, + seed: UInt64, + maxNewTokens: Int?, + cpuOnly: Bool + ) async { + let modelsURL = URL( + fileURLWithPath: (modelsDir as NSString).expandingTildeInPath, isDirectory: true) + let tokURL = URL( + fileURLWithPath: (tokenizerDir as NSString).expandingTildeInPath, isDirectory: true) + let embURL = URL(fileURLWithPath: (embeddingsFile as NSString).expandingTildeInPath) + let specURL = URL(fileURLWithPath: (specialTokensFile as NSString).expandingTildeInPath) + let promptURL = URL(fileURLWithPath: (promptAssetsPath as NSString).expandingTildeInPath) + let outputURL = URL(fileURLWithPath: (outputPath as NSString).expandingTildeInPath) + + let computeUnits: MLComputeUnits = cpuOnly ? 
.cpuOnly : .cpuAndNeuralEngine + let manager = CosyVoice3TtsManager( + modelsDirectory: modelsURL, + tokenizerDirectory: tokURL, + textEmbeddingsFile: embURL, + specialTokensFile: specURL, + computeUnits: computeUnits) + + do { + let tLoad = Date() + try await manager.initialize() + logger.info( + "Loaded CosyVoice3 models + frontend in \(String(format: "%.2f", Date().timeIntervalSince(tLoad)))s" + ) + + let tPrompt = Date() + let promptAssets = try CosyVoice3PromptAssets.load(from: promptURL) + logger.info( + "Loaded prompt assets in \(String(format: "%.2f", Date().timeIntervalSince(tPrompt)))s — N_speech=\(promptAssets.promptSpeechIds.count), mel_frames=\(promptAssets.promptMelFrames)" + ) + + let options = CosyVoice3SynthesisOptions( + maxNewTokens: maxNewTokens, seed: seed) + + let tSynth = Date() + let result = try await manager.synthesize( + text: text, promptAssets: promptAssets, options: options) + let synthElapsed = Date().timeIntervalSince(tSynth) + let audioSecs = Double(result.samples.count) / Double(result.sampleRate) + let rtfx = synthElapsed > 0 ? audioSecs / synthElapsed : 0 + logger.info( + "Synthesized \(result.samples.count) samples (\(String(format: "%.2fs", audioSecs))) in \(String(format: "%.2fs", synthElapsed)) — RTFx \(String(format: "%.2fx", rtfx))" + ) + logger.info("Generated \(result.generatedTokenCount) speech tokens") + + try FileManager.default.createDirectory( + at: outputURL.deletingLastPathComponent(), + withIntermediateDirectories: true) + try writeWAV(samples: result.samples, sampleRate: result.sampleRate, to: outputURL) + logger.info("Wrote WAV: \(outputURL.path)") + } catch { + logger.error("CosyVoice3 text synthesis failed: \(error)") + exit(2) + } + } + + private static func writeWAV(samples: [Float], sampleRate: Int, to url: URL) throws { + let numSamples = samples.count + let byteRate = sampleRate * 2 + let dataSize = numSamples * 2 + var header = Data() + header.append("RIFF".data(using: .ascii)!) 
+ header.appendUInt32LE(UInt32(36 + dataSize)) + header.append("WAVE".data(using: .ascii)!) + header.append("fmt ".data(using: .ascii)!) + header.appendUInt32LE(16) + header.appendUInt16LE(1) // PCM + header.appendUInt16LE(1) // mono + header.appendUInt32LE(UInt32(sampleRate)) + header.appendUInt32LE(UInt32(byteRate)) + header.appendUInt16LE(2) // block align + header.appendUInt16LE(16) // bits/sample + header.append("data".data(using: .ascii)!) + header.appendUInt32LE(UInt32(dataSize)) + + var pcm = Data(capacity: dataSize) + for s in samples { + let clipped = max(-1.0, min(1.0, s)) + let i16 = Int16(clipped * 32_767.0) + var le = i16.littleEndian + Swift.withUnsafeBytes(of: &le) { pcm.append(contentsOf: $0) } + } + try (header + pcm).write(to: url) + } +} + +// MARK: - Data helpers (file-scoped duplicate of the helpers in +// CosyVoice3ParityCommand.swift; kept here so this file compiles on its own). + +extension Data { + fileprivate mutating func appendUInt32LE(_ v: UInt32) { + var le = v.littleEndian + Swift.withUnsafeBytes(of: &le) { self.append(contentsOf: $0) } + } + fileprivate mutating func appendUInt16LE(_ v: UInt16) { + var le = v.littleEndian + Swift.withUnsafeBytes(of: &le) { self.append(contentsOf: $0) } + } +} diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3TokenizerParityCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3TokenizerParityCommand.swift new file mode 100644 index 000000000..d5550c60c --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3TokenizerParityCommand.swift @@ -0,0 +1,70 @@ +import FluidAudio +import Foundation + +/// Phase 2 tokenizer parity harness. +/// +/// Loads the Python-exported tokenizer_fixture.json (special token map + test +/// cases) and asserts the Swift Qwen2BpeTokenizer produces the same ID stream +/// for every case. 
+/// +/// Usage: +/// ``` +/// fluidaudio tts --backend cosyvoice3-tokenizer-parity \ +/// --tokenizer-dir .../cosyvoice3_dl/CosyVoice-BlankEN \ +/// --fixture .../build/frontend/tokenizer_fixture.json +/// ``` +enum CosyVoice3TokenizerParityCLI { + + private static let logger = AppLogger(category: "CosyVoice3TokenizerParityCLI") + + static func run(tokenizerDir: String, fixturePath: String) async { + let tokURL = URL( + fileURLWithPath: (tokenizerDir as NSString).expandingTildeInPath, + isDirectory: true) + let fixURL = URL(fileURLWithPath: (fixturePath as NSString).expandingTildeInPath) + + struct Fixture: Decodable { + let special_tokens: [String: Int32] + let cases: [Case] + struct Case: Decodable { + let text: String + let ids: [Int32] + } + } + + do { + let data = try Data(contentsOf: fixURL) + let fixture = try JSONDecoder().decode(Fixture.self, from: data) + let tokenizer = try Qwen2BpeTokenizer.load( + directory: tokURL, specialTokens: fixture.special_tokens) + + var passed = 0 + var failed = 0 + var firstFail: (String, [Int32], [Int32])? = nil + for tc in fixture.cases { + let got = tokenizer.encode(tc.text) + if got == tc.ids { + passed += 1 + } else { + failed += 1 + if firstFail == nil { + firstFail = (tc.text, tc.ids, got) + } + } + } + + print("cases: \(passed + failed) passed: \(passed) failed: \(failed)") + if let (text, expected, got) = firstFail { + print("") + print("first mismatch:") + print(" text : \(text.debugDescription)") + print(" expected : \(expected)") + print(" got : \(got)") + } + if failed > 0 { exit(1) } + } catch { + logger.error("Tokenizer parity failed: \(error)") + exit(2) + } + } +} diff --git a/Sources/FluidAudioCLI/Commands/TTSCommand.swift b/Sources/FluidAudioCLI/Commands/TTSCommand.swift index 2037a94d2..06d291f06 100644 --- a/Sources/FluidAudioCLI/Commands/TTSCommand.swift +++ b/Sources/FluidAudioCLI/Commands/TTSCommand.swift @@ -146,6 +146,25 @@ public struct TTS { var cloneVoicePath: String? 
= nil var voiceFilePath: String? = nil var saveVoicePath: String? = nil + // CosyVoice3 Phase 1 parity harness args. + var cv3FixturePath: String? = nil + var cv3ModelsDir: String? = nil + var cv3ReferencePath: String? = nil + var cv3Seed: UInt64 = 42 + var cv3CpuOnly: Bool = false + var cv3ReplayTokens: Bool = true + // CosyVoice3 Phase 2 tokenizer parity args. + var cv3TokenizerDir: String? = nil + var cv3TokenizerParityMode: Bool = false + // CosyVoice3 Phase 2 frontend parity args. + var cv3FrontendParityMode: Bool = false + var cv3EmbeddingsFile: String? = nil + var cv3TokFixturePath: String? = nil + // CosyVoice3 Phase 2 text-driven synthesis args. + var cv3TextMode: Bool = false + var cv3SpecialTokensFile: String? = nil + var cv3PromptAssetsPath: String? = nil + var cv3MaxNewTokens: Int? = nil var i = 0 while i < arguments.count { @@ -200,11 +219,81 @@ public struct TTS { backend = .kokoro case "pocket", "pockettts": backend = .pocketTts + case "cosyvoice3", "cosyvoice3-parity", "cv3": + backend = .cosyvoice3 + case "cosyvoice3-tokenizer-parity", "cv3-tokenizer": + backend = .cosyvoice3 + cv3TokenizerParityMode = true + case "cosyvoice3-frontend-parity", "cv3-frontend": + backend = .cosyvoice3 + cv3FrontendParityMode = true + case "cosyvoice3-text", "cv3-text": + backend = .cosyvoice3 + cv3TextMode = true default: logger.warning("Unknown backend '\(arguments[i + 1])'; using kokoro") } i += 1 } + case "--fixture": + if i + 1 < arguments.count { + cv3FixturePath = arguments[i + 1] + i += 1 + } + case "--models-dir": + if i + 1 < arguments.count { + cv3ModelsDir = arguments[i + 1] + i += 1 + } + case "--reference": + if i + 1 < arguments.count { + cv3ReferencePath = arguments[i + 1] + i += 1 + } + case "--seed": + if i + 1 < arguments.count { + cv3Seed = UInt64(arguments[i + 1]) ?? 
42 + i += 1 + } + case "--cpu-only": + cv3CpuOnly = true + case "--no-replay": + cv3ReplayTokens = false + case "--tokenizer-dir": + if i + 1 < arguments.count { + cv3TokenizerDir = arguments[i + 1] + i += 1 + } + case "--embeddings-file": + if i + 1 < arguments.count { + cv3EmbeddingsFile = arguments[i + 1] + i += 1 + } + case "--tok-fixture": + if i + 1 < arguments.count { + cv3TokFixturePath = arguments[i + 1] + i += 1 + } + case "--special-tokens-file": + if i + 1 < arguments.count { + cv3SpecialTokensFile = arguments[i + 1] + i += 1 + } + case "--prompt-assets": + if i + 1 < arguments.count { + cv3PromptAssetsPath = arguments[i + 1] + i += 1 + } + case "--text": + if i + 1 < arguments.count { + text = arguments[i + 1] + i += 1 + } + case "--max-new-tokens": + if i + 1 < arguments.count { + cv3MaxNewTokens = Int(arguments[i + 1]) + i += 1 + } case "--auto-download": // No-op: downloads are always ensured by the CLI () @@ -249,6 +338,84 @@ public struct TTS { return } + if backend == .cosyvoice3 && cv3TokenizerParityMode { + guard let tokDir = cv3TokenizerDir, let fixture = cv3FixturePath else { + logger.error( + "cosyvoice3-tokenizer-parity requires --tokenizer-dir <.../CosyVoice-BlankEN> and --fixture " + ) + return + } + await CosyVoice3TokenizerParityCLI.run( + tokenizerDir: tokDir, fixturePath: fixture) + return + } + + if backend == .cosyvoice3 && cv3FrontendParityMode { + guard + let tokDir = cv3TokenizerDir, + let embFile = cv3EmbeddingsFile, + let fixture = cv3FixturePath, + let tokFix = cv3TokFixturePath + else { + logger.error( + "cosyvoice3-frontend-parity requires --tokenizer-dir, --embeddings-file, --fixture , --tok-fixture" + ) + return + } + await CosyVoice3FrontendParityCLI.run( + tokenizerDir: tokDir, + embeddingsFile: embFile, + fixturePath: fixture, + tokFixturePath: tokFix) + return + } + + if backend == .cosyvoice3 && cv3TextMode { + guard + let inputText = text, + let modelsDir = cv3ModelsDir, + let tokDir = cv3TokenizerDir, + let embFile 
= cv3EmbeddingsFile, + let specFile = cv3SpecialTokensFile, + let promptAssets = cv3PromptAssetsPath + else { + logger.error( + "cosyvoice3-text requires --text , --models-dir, --tokenizer-dir, --embeddings-file, --special-tokens-file, --prompt-assets" + ) + return + } + await CosyVoice3TextCLI.run( + text: inputText, + modelsDir: modelsDir, + tokenizerDir: tokDir, + embeddingsFile: embFile, + specialTokensFile: specFile, + promptAssetsPath: promptAssets, + outputPath: output, + seed: cv3Seed, + maxNewTokens: cv3MaxNewTokens, + cpuOnly: cv3CpuOnly) + return + } + + if backend == .cosyvoice3 { + guard let fixture = cv3FixturePath, let modelsDir = cv3ModelsDir else { + logger.error( + "cosyvoice3-parity requires --fixture and --models-dir " + ) + return + } + await CosyVoice3ParityCLI.run( + fixturePath: fixture, + modelsDir: modelsDir, + referencePath: cv3ReferencePath, + outputPath: output, + seed: cv3Seed, + cpuOnly: cv3CpuOnly, + replayTokens: cv3ReplayTokens) + return + } + guard let text = text else { printUsage() return diff --git a/Tests/FluidAudioTests/TTS/CosyVoice3ChineseNormalizerTests.swift b/Tests/FluidAudioTests/TTS/CosyVoice3ChineseNormalizerTests.swift new file mode 100644 index 000000000..e94184c40 --- /dev/null +++ b/Tests/FluidAudioTests/TTS/CosyVoice3ChineseNormalizerTests.swift @@ -0,0 +1,81 @@ +import XCTest + +@testable import FluidAudio + +final class CosyVoice3ChineseNormalizerTests: XCTestCase { + + func testContainsChinese() { + XCTAssertTrue(CosyVoice3ChineseNormalizer.containsChinese("你好")) + XCTAssertTrue(CosyVoice3ChineseNormalizer.containsChinese("hello 世界")) + XCTAssertFalse(CosyVoice3ChineseNormalizer.containsChinese("hello world")) + XCTAssertFalse(CosyVoice3ChineseNormalizer.containsChinese("")) + } + + func testReplaceBlankDropsCjkInteriorSpaces() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceBlank("中 国"), "中国") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceBlank("hello world"), "hello world") + // Mixed: 
space between ASCII and CJK is dropped (one side non-ASCII). + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceBlank("hi 你好"), "hi你好") + } + + func testReplaceCornerMark() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceCornerMark("面积 5m²"), + "面积 5m平方") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceCornerMark("体积 2m³"), + "体积 2m立方") + } + + func testRemoveBracket() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.removeBracket("你好(世界)"), + "你好世界") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.removeBracket("【注意】请勿触摸"), + "注意请勿触摸") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.removeBracket("a——b"), + "a b") + } + + func testSpellOutDigitsZh() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.spellOutDigitsZh("2024年"), + "二零二四年") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.spellOutDigitsZh("abc"), + "abc") + } + + func testStripTrailingCommaLikes() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.stripTrailingCommaLikes("你好,,"), + "你好。") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.stripTrailingCommaLikes("你好、,,"), + "你好。") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.stripTrailingCommaLikes("你好。"), + "你好。") + } + + func testNormalizeEndToEnd() { + let input = "希望你以后能够做的比我还好用. 2024年,," + let out = CosyVoice3ChineseNormalizer.normalize(input) + // Period becomes 。, trailing commas collapse to a single 。, digits + // spelled out per-char, internal spaces between CJK stripped. 
+ XCTAssertEqual(out, "希望你以后能够做的比我还好用。二零二四年。") + } + + func testIsOnlyPunctuation() { + XCTAssertTrue(CosyVoice3ChineseNormalizer.isOnlyPunctuation("")) + XCTAssertTrue(CosyVoice3ChineseNormalizer.isOnlyPunctuation("。,!?")) + XCTAssertTrue(CosyVoice3ChineseNormalizer.isOnlyPunctuation(".,!?")) + XCTAssertFalse(CosyVoice3ChineseNormalizer.isOnlyPunctuation("你好")) + XCTAssertFalse(CosyVoice3ChineseNormalizer.isOnlyPunctuation("abc")) + } +} diff --git a/Tests/FluidAudioTests/TTS/CosyVoice3PromptMelTests.swift b/Tests/FluidAudioTests/TTS/CosyVoice3PromptMelTests.swift new file mode 100644 index 000000000..e904d64ff --- /dev/null +++ b/Tests/FluidAudioTests/TTS/CosyVoice3PromptMelTests.swift @@ -0,0 +1,101 @@ +import XCTest + +@testable import FluidAudio + +final class CosyVoice3PromptMelTests: XCTestCase { + + func testFrameCountMatchesMatchaFormula() throws { + // matcha/cosyvoice3: pad by 720 each side (reflect), center=False. + // For 48000 samples: padded = 48000 + 1440 = 49440. + // frames = (49440 - 1920) / 480 + 1 = 99 + 1 = 100. + let mel = CosyVoice3PromptMel() + let audio = [Float](repeating: 0.01, count: 48_000) + let out = try mel.compute(audio: audio) + XCTAssertEqual(out.frames, 100) + XCTAssertEqual(out.mel.count, 100 * 80) + } + + func testZeroAudioClampsToLogFloor() throws { + // With audio of all zeros, mel values are 0 → clamped to 1e-5 → log = -11.5129... + let mel = CosyVoice3PromptMel() + let audio = [Float](repeating: 0, count: 24_000) + let out = try mel.compute(audio: audio) + let expected: Float = log(Float(1e-5)) + for v in out.mel { + XCTAssertEqual(v, expected, accuracy: 1e-5) + } + } + + func testSinePeakInLowMelBins() throws { + // 200 Hz sine at 24 kHz should light up one of the lowest mel bins + // (fmin=0, the first few triangles cover 0..~200 Hz). + let mel = CosyVoice3PromptMel() + let sr: Float = 24_000 + let f: Float = 200 + let n = 12_000 // 0.5 s + var audio = [Float](repeating: 0, count: n) + for i in 0..0. 
+ let numFreqBins = 1920 / 2 + 1 + for m in 0..<80 { + var sum: Float = 0 + for f in 0.. Date: Tue, 21 Apr 2026 17:32:24 -0400 Subject: [PATCH 02/17] feat(cli): add cosyvoice3-download CLI for eager HF asset fetch Wraps CosyVoice3ResourceDownloader.{ensureCoreModels,ensureTextFrontendAssets,ensureVoice} under --backend cosyvoice3-download. Pre-downloads all ~3.2 GB of HF assets (4 mlmodelcs, speech+runtime embeddings, tokenizer, default voice bundle) into ~/.cache/fluidaudio/Models/cosyvoice3/ so subsequent --backend cosyvoice3-text runs are cache hits. Verified fresh cold-start download from FluidInference/CosyVoice3-0.5B-coreml in 194s: 17/17 model files + 4/4 tokenizer files + sidecar embeddings + voice bundle all land correctly, peak mem 46 MB (streaming to disk). Co-Authored-By: Claude --- .../Commands/CosyVoice3DownloadCommand.swift | 48 +++++++++++++++++++ .../FluidAudioCLI/Commands/TTSCommand.swift | 10 ++++ 2 files changed, 58 insertions(+) create mode 100644 Sources/FluidAudioCLI/Commands/CosyVoice3DownloadCommand.swift diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3DownloadCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3DownloadCommand.swift new file mode 100644 index 000000000..6e3970986 --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3DownloadCommand.swift @@ -0,0 +1,48 @@ +import FluidAudio +import Foundation + +/// Eagerly downloads every CosyVoice3 asset from HuggingFace +/// (`FluidInference/CosyVoice3-0.5B-coreml`) into `~/.cache/fluidaudio` so +/// subsequent `--backend cosyvoice3-text` runs are cache hits. 
+/// +/// Downloads (~5.8 GB total): +/// - 4 `.mlmodelc` bundles (LLM-Prefill, LLM-Decode, Flow, HiFT) +/// - `embeddings/speech_embedding-fp16.safetensors` +/// - `embeddings/embeddings-runtime-fp32.safetensors` (~542 MB) +/// - Tokenizer files (vocab.json, merges.txt, tokenizer_config.json, special_tokens.json) +/// - Default voice bundle (`voices/cosyvoice3-default-zh.safetensors` + `.json`) +/// +/// Usage: +/// ``` +/// fluidaudiocli tts --backend cosyvoice3-download +/// ``` +enum CosyVoice3DownloadCLI { + + private static let logger = AppLogger(category: "CosyVoice3DownloadCLI") + + static func run() async { + let tStart = Date() + logger.info("Starting CosyVoice3 asset download from HuggingFace…") + + do { + let repoDir = try await CosyVoice3ResourceDownloader.ensureCoreModels() + logger.info("Core models + speech embedding cached at: \(repoDir.path)") + + let frontend = try await CosyVoice3ResourceDownloader.ensureTextFrontendAssets( + repoDirectory: repoDir) + logger.info("Tokenizer: \(frontend.tokenizerDirectory.path)") + logger.info("Runtime embeddings: \(frontend.runtimeEmbeddingsFile.path)") + logger.info("Special tokens: \(frontend.specialTokensFile.path)") + + let voiceURL = try await CosyVoice3ResourceDownloader.ensureVoice( + repoDirectory: repoDir) + logger.info("Default voice bundle: \(voiceURL.path)") + + let elapsed = Date().timeIntervalSince(tStart) + logger.info("CosyVoice3 download complete in \(String(format: "%.1fs", elapsed))") + } catch { + logger.error("CosyVoice3 download failed: \(error)") + exit(2) + } + } +} diff --git a/Sources/FluidAudioCLI/Commands/TTSCommand.swift b/Sources/FluidAudioCLI/Commands/TTSCommand.swift index 06d291f06..af7d9d59b 100644 --- a/Sources/FluidAudioCLI/Commands/TTSCommand.swift +++ b/Sources/FluidAudioCLI/Commands/TTSCommand.swift @@ -165,6 +165,8 @@ public struct TTS { var cv3SpecialTokensFile: String? = nil var cv3PromptAssetsPath: String? = nil var cv3MaxNewTokens: Int? 
= nil + // CosyVoice3 eager HF-download mode. + var cv3DownloadMode: Bool = false var i = 0 while i < arguments.count { @@ -230,6 +232,9 @@ public struct TTS { case "cosyvoice3-text", "cv3-text": backend = .cosyvoice3 cv3TextMode = true + case "cosyvoice3-download", "cv3-download": + backend = .cosyvoice3 + cv3DownloadMode = true default: logger.warning("Unknown backend '\(arguments[i + 1])'; using kokoro") } @@ -338,6 +343,11 @@ public struct TTS { return } + if backend == .cosyvoice3 && cv3DownloadMode { + await CosyVoice3DownloadCLI.run() + return + } + if backend == .cosyvoice3 && cv3TokenizerParityMode { guard let tokDir = cv3TokenizerDir, let fixture = cv3FixturePath else { logger.error( From b8b8c9e32bb11f9d397d18217b33a51ce6f5684f Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Tue, 21 Apr 2026 21:39:31 -0400 Subject: [PATCH 03/17] =?UTF-8?q?perf(cosyvoice3):=20swap=20Flow=20fp32/cp?= =?UTF-8?q?uOnly=20=E2=86=92=20fp16/cpuAndGPU=20(3=C3=97=20faster)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Benchmarked Flow across all MLComputeUnits and found the prior fp32/cpuOnly shipping config was both the slowest and heaviest: config p50 NaN fp32 cpuOnly 58,862 ms 0/3 fp32 cpuAndGPU 113,564 ms 0/3 (prior default: 2× slower than cpuOnly) fp16 cpuOnly 16,203 ms 5/5 (LayerNorm overflow) fp16 cpuAndGPU 17,261 ms 0/10 (GPU uses fp32 accumulators) fp16 cpuAndNE/all hang: MILCompilerForANE ANECCompile() FAILED Ship Flow-N250-fp16 forced to .cpuAndGPU: - 3× faster end-to-end (full e2e: 39.8s vs ~125s before on a 4.6s utterance) - mlpackage shrinks 1.2 GB → 638 MB (disk + download cut ~600 MB) - Whisper ASR roundtrip on Swift output: 13/14 chars correct on "希望你以后能够做的比我还好用" (Python fp16 e2e was 14/14 in parallel validation) ModelStore now ignores the user-supplied computeUnits for Flow and always applies .cpuAndGPU (the only viable path — cpuOnly NaNs, ANE refuses to compile). 
Co-Authored-By: Claude --- Sources/FluidAudio/ModelNames.swift | 2 +- .../Assets/CosyVoice3ModelStore.swift | 23 +++++++++++-------- .../Assets/CosyVoice3ResourceDownloader.swift | 2 +- .../TTS/CosyVoice3/CosyVoice3Constants.swift | 15 +++++++----- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index b689801b2..aa8a2fbf6 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -571,7 +571,7 @@ public enum ModelNames { public enum CosyVoice3 { public static let llmPrefill = "LLM-Prefill-T256-M768-fp16" public static let llmDecode = "LLM-Decode-M768-fp16" - public static let flow = "Flow-N250-fp32" + public static let flow = "Flow-N250-fp16" public static let hift = "HiFT-T500-fp16" public static let speechEmbeddings = "speech_embedding-fp16.safetensors" diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift index 75c6d7b69..4e3687d90 100644 --- a/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift +++ b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift @@ -11,7 +11,7 @@ import Foundation /// /// 2. **Local mobius build dir**: `//.mlpackage` as /// emitted by `models/tts/cosyvoice3/coreml/convert-coreml.py` (with -/// `llm-fp16/`, `flow-fp32-n250/`, `hift-fp16-t500/` subdirs). +/// `llm-fp16/`, `flow-fp16-n250/`, `hift-fp16-t500/` subdirs). /// /// The store probes layout (1) first, then falls back to (2). CoreML /// auto-compiles `.mlpackage` on first load and caches the compiled bundle on @@ -28,9 +28,10 @@ public actor CosyVoice3ModelStore { /// - Parameters: /// - directory: Base build directory that contains - /// `llm-fp16/`, `flow-fp32-n250/`, `hift-fp16-t500/`, `embeddings/`. - /// - computeUnits: Defaults to `.cpuAndNeuralEngine`. 
Tests force - /// `.cpuOnly` for tight tolerance parity against the Python reference. + /// `llm-fp16/`, `flow-fp16-n250/`, `hift-fp16-t500/`, `embeddings/`. + /// - computeUnits: Defaults to `.cpuAndNeuralEngine`. Applied to LLM + + /// HiFT models. Flow always runs on `.cpuAndGPU` regardless (see + /// `loadIfNeeded()` for why). public init(directory: URL, computeUnits: MLComputeUnits = .cpuAndNeuralEngine) { self.directory = directory self.computeUnits = computeUnits @@ -68,12 +69,16 @@ public actor CosyVoice3ModelStore { let decode = try await compileAndLoad(decodeURL, configuration: config) logger.info("Loaded \(CosyVoice3Constants.Files.llmDecode)") - // Flow is fp32; ANE cannot run the full graph. If the caller asked for - // CPU-only (parity harness), honor it so results match the Python - // reference byte-for-byte. Otherwise use CPU+GPU to avoid silent ANE - // fallback warnings. + // Flow is fp16 and MUST run on `.cpuAndGPU`: + // - pure CPU overflows the fused LayerNorm and emits all-NaN mel + // (empirically 5/5 NaN across random inputs) + // - ANE refuses to compile the graph (MILCompilerForANE + // `ANECCompile() FAILED`), so `.cpuAndNE` / `.all` deadlock load + // - GPU path uses fp32 accumulators internally and is stable + // Ignore the user-supplied `computeUnits` for Flow; apply it to the + // LLM + HiFT models only. let flowConfig = MLModelConfiguration() - flowConfig.computeUnits = (computeUnits == .cpuOnly) ? 
.cpuOnly : .cpuAndGPU + flowConfig.computeUnits = .cpuAndGPU let flow = try await compileAndLoad(flowURL, configuration: flowConfig) logger.info("Loaded \(CosyVoice3Constants.Files.flow)") diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift index 0776b2b86..7359ddd43 100644 --- a/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift +++ b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift @@ -9,7 +9,7 @@ import Foundation /// / /// ├── LLM-Prefill-T256-M768-fp16.mlmodelc/ /// ├── LLM-Decode-M768-fp16.mlmodelc/ -/// ├── Flow-N250-fp32.mlmodelc/ +/// ├── Flow-N250-fp16.mlmodelc/ /// ├── HiFT-T500-fp16.mlmodelc/ /// ├── embeddings/ /// │ ├── speech_embedding-fp16.safetensors diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift index a7d03a450..e10353925 100644 --- a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift +++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift @@ -3,10 +3,13 @@ import Foundation /// Central constants for the CosyVoice3 (Mandarin) CoreML pipeline. 
/// /// Shipping config (frozen): -/// - LLM-Prefill-T256-M768-fp16 -/// - LLM-Decode-M768-fp16 -/// - Flow-N250-fp32 (fp16 causes NaN; fused `layer_norm` cannot be pinned) -/// - HiFT-T500-fp16 +/// - LLM-Prefill-T256-M768-fp16 (cpuAndNeuralEngine) +/// - LLM-Decode-M768-fp16 (cpuAndNeuralEngine) +/// - Flow-N250-fp16 (cpuAndGPU — pure CPU overflows fused +/// LayerNorm → NaN; ANE refuses to compile; GPU path uses fp32 accumulators +/// internally and is stable + ~3× faster than the previous fp32/cpuOnly +/// shipping config) +/// - HiFT-T500-fp16 (cpuAndNeuralEngine) public enum CosyVoice3Constants { // MARK: - LLM shapes @@ -52,8 +55,8 @@ public enum CosyVoice3Constants { public static let llmPrefillSubdir = "llm-fp16" public static let llmDecode = "LLM-Decode-M768-fp16.mlpackage" public static let llmDecodeSubdir = "llm-fp16" - public static let flow = "Flow-N250-fp32.mlpackage" - public static let flowSubdir = "flow-fp32-n250" + public static let flow = "Flow-N250-fp16.mlpackage" + public static let flowSubdir = "flow-fp16-n250" public static let hift = "HiFT-T500-fp16.mlpackage" public static let hiftSubdir = "hift-fp16-t500" public static let speechEmbeddings = "speech_embedding-fp16.safetensors" From fa2112915d909b1e85135189fc544b6a795a35a5 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Wed, 22 Apr 2026 01:29:32 -0400 Subject: [PATCH 04/17] perf(tts/cosyvoice3): adopt stateful LLM-Decode via MLState Swap the CosyVoice3 decode path to the new stateful mlpackage shipped from the mobius repo. The 24-layer KV cache (48 per-layer buffers, [1, 2, 768, 64] fp16 each) is now held inside a CoreML `MLState` and mutated in place across decode steps via `withMultiArray(for:)`, so the synthesizer no longer passes kv_k / kv_v MLMultiArrays in and out every step. - Package.swift: bump platforms to macOS 15 / iOS 18 (required for CoreML StateType). - ModelNames.swift: rename llmDecode to `LLM-Decode-M768-fp16-stateful`. 
- CosyVoice3Constants.swift: update filename + subdir, document the cpuAndGPU constraint (ANE refuses stateful graph compile, same as Flow). - CosyVoice3ModelStore.swift: load decode with `.cpuAndGPU` explicitly. - CosyVoice3Synthesizer.swift: seed state from prefill kv_k_out / kv_v_out (fp32 at CoreML I/O, cast to fp16 at the state boundary); reusable per-step `inputs_embeds` + `cur_len` MLMultiArrays; call `prediction(from:using:)` with the MLState. - CosyVoice3SpeechEmbeddings.swift: add `copyEmbedding(tokenId:into:)` so the hot decode loop can reuse a single scratch MLMultiArray instead of allocating per step. Parity: end-to-end WAV output identical in length (83520 samples) to both the Python reference and the prior pass-through Swift output; no regression in sample-level metrics. Co-Authored-By: Claude --- Package.swift | 4 +- Sources/FluidAudio/ModelNames.swift | 2 +- .../Assets/CosyVoice3ModelStore.swift | 19 +- .../TTS/CosyVoice3/CosyVoice3Constants.swift | 19 +- .../CosyVoice3SpeechEmbeddings.swift | 15 +- .../Synthesize/CosyVoice3Synthesizer.swift | 169 +++++++++++++++--- 6 files changed, 181 insertions(+), 47 deletions(-) diff --git a/Package.swift b/Package.swift index 38737110f..3126f2533 100644 --- a/Package.swift +++ b/Package.swift @@ -4,8 +4,8 @@ import PackageDescription let package = Package( name: "FluidAudio", platforms: [ - .macOS(.v14), - .iOS(.v17), + .macOS(.v15), + .iOS(.v18), ], products: [ .library( diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index aa8a2fbf6..b8caa0c73 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -570,7 +570,7 @@ public enum ModelNames { /// expected local directory layout is encoded in `CosyVoice3Constants.Files`. 
public enum CosyVoice3 { public static let llmPrefill = "LLM-Prefill-T256-M768-fp16" - public static let llmDecode = "LLM-Decode-M768-fp16" + public static let llmDecode = "LLM-Decode-M768-fp16-stateful" public static let flow = "Flow-N250-fp16" public static let hift = "HiFT-T500-fp16" public static let speechEmbeddings = "speech_embedding-fp16.safetensors" diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift index 4e3687d90..ed641c496 100644 --- a/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift +++ b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift @@ -28,10 +28,11 @@ public actor CosyVoice3ModelStore { /// - Parameters: /// - directory: Base build directory that contains - /// `llm-fp16/`, `flow-fp16-n250/`, `hift-fp16-t500/`, `embeddings/`. - /// - computeUnits: Defaults to `.cpuAndNeuralEngine`. Applied to LLM + - /// HiFT models. Flow always runs on `.cpuAndGPU` regardless (see - /// `loadIfNeeded()` for why). + /// `llm-fp16/`, `llm-fp16-stateful/`, `flow-fp16-n250/`, + /// `hift-fp16-t500/`, `embeddings/`. + /// - computeUnits: Defaults to `.cpuAndNeuralEngine`. Applied to + /// LLM-Prefill + HiFT models only. LLM-Decode (stateful) and Flow + /// both force `.cpuAndGPU` regardless (see `loadIfNeeded()`). 
public init(directory: URL, computeUnits: MLComputeUnits = .cpuAndNeuralEngine) { self.directory = directory self.computeUnits = computeUnits @@ -66,7 +67,15 @@ public actor CosyVoice3ModelStore { let prefill = try await compileAndLoad(prefillURL, configuration: config) logger.info("Loaded \(CosyVoice3Constants.Files.llmPrefill)") - let decode = try await compileAndLoad(decodeURL, configuration: config) + // Stateful decode MUST run on `.cpuAndGPU`: + // - ANE refuses to compile the stateful graph (same failure mode + // as Flow: `MILCompilerForANE ANECCompile() FAILED`), so + // `.cpuAndNE` / `.all` deadlock load + // - CPU-only works but is ~2× slower than the GPU path + // Ignore the user-supplied `computeUnits` for decode. + let decodeConfig = MLModelConfiguration() + decodeConfig.computeUnits = .cpuAndGPU + let decode = try await compileAndLoad(decodeURL, configuration: decodeConfig) logger.info("Loaded \(CosyVoice3Constants.Files.llmDecode)") // Flow is fp16 and MUST run on `.cpuAndGPU`: diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift index e10353925..bd987bbdd 100644 --- a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift +++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift @@ -3,13 +3,20 @@ import Foundation /// Central constants for the CosyVoice3 (Mandarin) CoreML pipeline. 
/// /// Shipping config (frozen): -/// - LLM-Prefill-T256-M768-fp16 (cpuAndNeuralEngine) -/// - LLM-Decode-M768-fp16 (cpuAndNeuralEngine) -/// - Flow-N250-fp16 (cpuAndGPU — pure CPU overflows fused +/// - LLM-Prefill-T256-M768-fp16 (cpuAndNeuralEngine) +/// - LLM-Decode-M768-fp16-stateful (cpuAndGPU — see note) +/// - Flow-N250-fp16 (cpuAndGPU — pure CPU overflows fused /// LayerNorm → NaN; ANE refuses to compile; GPU path uses fp32 accumulators /// internally and is stable + ~3× faster than the previous fp32/cpuOnly /// shipping config) -/// - HiFT-T500-fp16 (cpuAndNeuralEngine) +/// - HiFT-T500-fp16 (cpuAndNeuralEngine) +/// +/// The stateful decode model uses per-layer `MLState` buffers for the +/// KV cache (48 tensors, `[1, 2, 768, 64]` fp16 each) instead of +/// round-tripping 18 MB of kv_k / kv_v MLMultiArrays every step. ANE +/// refuses to compile the stateful graph (`MILCompilerForANE +/// ANECCompile() FAILED`), mirroring Flow — decode therefore runs on +/// `.cpuAndGPU`. Requires macOS 15 / iOS 18. 
public enum CosyVoice3Constants { // MARK: - LLM shapes @@ -53,8 +60,8 @@ public enum CosyVoice3Constants { public enum Files { public static let llmPrefill = "LLM-Prefill-T256-M768-fp16.mlpackage" public static let llmPrefillSubdir = "llm-fp16" - public static let llmDecode = "LLM-Decode-M768-fp16.mlpackage" - public static let llmDecodeSubdir = "llm-fp16" + public static let llmDecode = "LLM-Decode-M768-fp16-stateful.mlpackage" + public static let llmDecodeSubdir = "llm-fp16-stateful" public static let flow = "Flow-N250-fp16.mlpackage" public static let flowSubdir = "flow-fp16-n250" public static let hift = "HiFT-T500-fp16.mlpackage" diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3SpeechEmbeddings.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3SpeechEmbeddings.swift index 110ec02aa..4de16fbe4 100644 --- a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3SpeechEmbeddings.swift +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3SpeechEmbeddings.swift @@ -42,13 +42,21 @@ public final class CosyVoice3SpeechEmbeddings: @unchecked Sendable { /// for `tokenId`, converted from fp16. Allocates fresh each call — the /// LLM decode step owns the tensor for exactly one prediction. public func embedding(tokenId: Int32) throws -> MLMultiArray { + let array = try MLMultiArray( + shape: [1, 1, NSNumber(value: embedDim)], + dataType: .float32) + try copyEmbedding(tokenId: tokenId, into: array) + return array + } + + /// Copy the fp16 embedding row for `tokenId` into an existing + /// `[1, 1, embedDim]` fp32 MLMultiArray. Avoids the per-step allocation + /// of `embedding(tokenId:)` in the hot decode loop. 
+ public func copyEmbedding(tokenId: Int32, into array: MLMultiArray) throws { guard tokenId >= 0 && Int(tokenId) < numTokens else { throw CosyVoice3Error.invalidShape( "speech token id \(tokenId) out of range [0, \(numTokens))") } - let array = try MLMultiArray( - shape: [1, 1, NSNumber(value: embedDim)], - dataType: .float32) let rowStart = Int(tokenId) * rowByteSize let dim = embedDim let lastStride = array.strides.last?.intValue ?? 1 @@ -60,6 +68,5 @@ public final class CosyVoice3SpeechEmbeddings: @unchecked Sendable { dstPtr[i * lastStride] = Float(fp16Ptr[i]) } } - return array } } diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3Synthesizer.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3Synthesizer.swift index a1043adf1..e4077c975 100644 --- a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3Synthesizer.swift +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3Synthesizer.swift @@ -6,6 +6,11 @@ import Foundation /// Mirrors `verify/test_coreml_e2e_fp16.py::main()` in Python. Each stage is /// implemented as a method on this type, keeping the state (KV cache, running /// decoded list) local to a single synthesis call. +/// +/// Decode uses CoreML `MLState` (macOS 15 / iOS 18): 48 per-layer buffers +/// (`kv_k_0..kv_k_23`, `kv_v_0..kv_v_23`) replace the 18 MB kv_k / kv_v +/// round-trip per step. Prefill remains non-stateful and its `kv_k` / `kv_v` +/// outputs seed the decode state once after prefill. 
public final class CosyVoice3Synthesizer: @unchecked Sendable { private let logger = AppLogger(subsystem: "com.fluidaudio.tts", category: "CosyVoice3Synthesizer") @@ -40,10 +45,19 @@ public final class CosyVoice3Synthesizer: @unchecked Sendable { sampler.seedTokens(fixture.decodedTokens) } - // 1) Prefill + // 1) Prefill (non-stateful: returns kv_k / kv_v as outputs) let (prefillLogits, initialKvK, initialKvV) = try await runPrefill(fixture: fixture) - var kvK = initialKvK - var kvV = initialKvV + + // Seed decode MLState from prefill kv_k / kv_v. + let state = models.decode.makeState() + try seedDecodeState(state: state, kvK: initialKvK, kvV: initialKvV) + + // Reusable per-step inputs for decode. `curLenArr` is mutated in place + // each step; `inputsEmbedsArr` is overwritten by memcpy per step. + let curLenArr = try MLMultiArray(shape: [1], dataType: .int32) + let inputsEmbedsArr = try MLMultiArray( + shape: [1, 1, NSNumber(value: CosyVoice3Constants.embedDim)], + dataType: .float32) // First token from prefill tail logits. var decoded: [Int32] = [] @@ -61,14 +75,12 @@ public final class CosyVoice3Synthesizer: @unchecked Sendable { // 2) Decode loop var curLen = fixture.tPre for step in 1.. 
(logits: [Float], kvK: MLMultiArray, kvV: MLMultiArray) { - let curLenArr = try MLMultiArray(shape: [1], dataType: .int32) - curLenArr[0] = NSNumber(value: curLen) - + curLen: MLMultiArray, + state: MLState + ) throws -> [Float] { let features: [String: Any] = [ "inputs_embeds": inputsEmbeds, - "kv_k": kvK, - "kv_v": kvV, - "cur_len": curLenArr, + "cur_len": curLen, ] let provider = try MLDictionaryFeatureProvider(dictionary: features) - let output = try await models.decode.compatPrediction( - from: provider, options: MLPredictionOptions()) + let output = try models.decode.prediction(from: provider, using: state) guard - let logitsArr = output.featureValue(for: "speech_logits")?.multiArrayValue, - let newKvK = output.featureValue(for: "kv_k_out")?.multiArrayValue, - let newKvV = output.featureValue(for: "kv_v_out")?.multiArrayValue + let logitsArr = output.featureValue(for: "speech_logits")?.multiArrayValue else { - throw CosyVoice3Error.predictionFailed("decode: missing outputs") + throw CosyVoice3Error.predictionFailed("decode: missing speech_logits") } // logits shape = [1, 1, 6761] fp32; strides may be non-compact. let count = CosyVoice3Constants.speechVocab @@ -189,7 +195,112 @@ public final class CosyVoice3Synthesizer: @unchecked Sendable { let vocabStride = strides.last ?? 1 let base = logitsArr.dataPointer.bindMemory(to: Float.self, capacity: logitsArr.count) for i in 0.., + srcLayerBase: Int, + srcHStride: Int, srcMStride: Int, srcDStride: Int, + dst: UnsafeMutablePointer, + dstHStride: Int, dstMStride: Int, dstDStride: Int, + H: Int, M: Int, D: Int + ) { + for h in 0.. Date: Sun, 26 Apr 2026 12:34:06 -0400 Subject: [PATCH 05/17] feat(tts/pocket): add multi-language pack support Wire PocketTTS up to the language packs Kyutai now publishes under `languages//` (english, french_24l, german[_24l], italian[_24l], portuguese[_24l], spanish[_24l]). 
English keeps the legacy root layout for zero-breaking-change to existing users; new languages download only the requested `languages//` subtree from the HF repo. - PocketTtsLanguage enum + ModelNames.requiredModels(for:) thread the language root through the downloader, model store, and session. - 6L vs 24L variants differ only in transformer layer count; layer keys are now discovered at runtime via PocketTtsLayerKeys instead of being hard-coded to 6. - PocketTtsMimiSchema captures Mimi decoder I/O (legacy English uses mimi_decoder_v2.mlmodelc, other languages use mimi_decoder.mlmodelc). - Constants loader scopes to a per-language `constants_bin/` so each pack carries its own tokenizer + text embed table + voice prompts. - CLI: --language flag (validates against PocketTtsLanguage.allCases), default english. - Cross-language voice cloning still works: mimi_encoder is shared, cloned embeddings can be paired with any language pack. Out of scope for v1: runtime language switching on a live manager (instantiate a new manager instead), French 6L (upstream only ships 24L), automatic language detection from text. 
--- Documentation/TTS/PocketTTS.md | 74 ++++++ README.md | 21 +- Sources/FluidAudio/ModelNames.swift | 69 +++++- .../Assets/PocketTtsResourceDownloader.swift | 220 ++++++++++++++++-- .../Pipeline/PocketTtsLayerKeys.swift | 187 +++++++++++++++ .../Pipeline/PocketTtsMimiSchema.swift | 182 +++++++++++++++ .../Pipeline/PocketTtsModelStore.swift | 185 ++++++++++++--- .../PocketTTS/Pipeline/PocketTtsSession.swift | 18 +- .../PocketTtsSynthesizer+KVCache.swift | 76 +++--- .../Pipeline/PocketTtsSynthesizer+Mimi.swift | 144 +++++++----- .../Pipeline/PocketTtsSynthesizer+Types.swift | 87 +------ .../Pipeline/PocketTtsSynthesizer.swift | 80 +++++-- .../TTS/PocketTTS/PocketTtsConstants.swift | 116 ++++++++- .../TTS/PocketTTS/PocketTtsManager.swift | 24 +- .../FluidAudioCLI/Commands/TTSCommand.swift | 115 ++++++++- .../PocketTTS/PocketTtsLanguageTests.swift | 145 ++++++++++++ .../TTS/PocketTTS/PocketTtsSessionTests.swift | 6 +- 17 files changed, 1485 insertions(+), 264 deletions(-) create mode 100644 Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsLayerKeys.swift create mode 100644 Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsMimiSchema.swift create mode 100644 Tests/FluidAudioTests/TTS/PocketTTS/PocketTtsLanguageTests.swift diff --git a/Documentation/TTS/PocketTTS.md b/Documentation/TTS/PocketTTS.md index 27659910c..bfd7dad32 100644 --- a/Documentation/TTS/PocketTTS.md +++ b/Documentation/TTS/PocketTTS.md @@ -145,6 +145,29 @@ fluidaudio tts "Hello world" --backend pocket --voice-file my_voice.bin - The `mimi_encoder.mlmodelc` model is downloaded automatically on first use - Supports any audio format that AVFoundation can read +### Cloning Across Languages + +The Mimi encoder is language-agnostic — voice cloning produces a generic +acoustic embedding that any language pack's `cond_step` model can consume. +You can: + +- Clone a voice once and reuse the same `PocketTtsVoiceData` across managers + configured with different languages. 
+- Clone a voice with a Spanish-only manager without pulling in the English + language pack — only the encoder subtree is downloaded. + +```swift +// Clone with a Spanish manager +let esManager = PocketTtsManager(language: .spanish) +try await esManager.initialize() +let voiceData = try await esManager.cloneVoice(from: speakerAudioURL) + +// Use the same cloned voice with a French manager +let frManager = PocketTtsManager(language: .french24L) +try await frManager.initialize() +let frAudio = try await frManager.synthesize(text: "Bonjour", voiceData: voiceData) +``` + ## Pipeline and Pronunciation Control ``` @@ -214,6 +237,57 @@ for try await frame in session.frames { | Streaming playback | `synthesizeStreaming()` | | Streaming text or custom chunking | `makeSession()` | +## Languages + +PocketTTS ships with multiple language packs converted from +[kyutai/pocket-tts](https://huggingface.co/kyutai/pocket-tts). Pick the one +that matches your input text — there is no automatic language detection. + +| ID | Layers | HF Path | +|----|--------|---------| +| `english` | 6 | repo root (legacy layout) | +| `german` | 6 | `v2/german/` | +| `german_24l` | 24 | `v2/german_24l/` | +| `italian` | 6 | `v2/italian/` | +| `italian_24l` | 24 | `v2/italian_24l/` | +| `portuguese` | 6 | `v2/portuguese/` | +| `portuguese_24l` | 24 | `v2/portuguese_24l/` | +| `spanish` | 6 | `v2/spanish/` | +| `spanish_24l` | 24 | `v2/spanish_24l/` | +| `french_24l` | 24 | `v2/french_24l/` | + +Notes: +- French only ships a 24-layer pack upstream (no 6-layer variant). +- 24-layer packs are higher quality but slower and larger. +- The 21 voice names (alba, anna, eve, michael, …) are shared across + languages, but the underlying acoustic embeddings are per-language. +- Mimi encoder weights (used for voice cloning) are language-agnostic and + always live at the repo root. 
+ +### Swift API + +```swift +let manager = PocketTtsManager(language: .spanish) +try await manager.initialize() +let audio = try await manager.synthesize(text: "Hola mundo") +``` + +`PocketTtsManager.language` is immutable per instance. To support multiple +languages in one app, instantiate one manager per language. + +### CLI Usage + +```bash +# Default (English) +fluidaudio tts "Hello world" --backend pocket --output en.wav + +# Spanish (6L) +fluidaudio tts "Hola mundo" --backend pocket --language spanish --output es.wav + +# French (24L only) +fluidaudio tts "Bonjour" --backend pocket --language french_24l --output fr.wav +``` + ## Usage PocketTTS is part of core `FluidAudio` - no GPL dependencies required. diff --git a/README.md b/README.md index d49573303..002c046c5 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Want to convert your own model? Check [möbius](https://github.com/FluidInferenc - **Automatic Speech Recognition (ASR)**: [Parakeet TDT v3](Documentation/Models.md#batch-transcription-near-real-time) (0.6b) and other TDT/CTC models for batch transcription supporting 25 European languages, Japanese, and Chinese; [Parakeet EOU](Documentation/Models.md#streaming-transcription-true-real-time) (120m) for streaming ASR with end-of-utterance detection (English only). See all [ASR models](Documentation/Models.md#asr-models). - **Inverse Text Normalization (ITN)**: Post-process ASR output to convert spoken-form to written-form ("two hundred" → "200"). 
See [text-processing-rs](https://github.com/FluidInference/text-processing-rs) -- **Text-to-Speech (TTS)**: Kokoro (82m) for parallel synthesis with SSML and pronunciation control across 9 languages (EN, ES, FR, HI, IT, JA, PT, ZH); PocketTTS for streaming TTS with voice cloning support (English only) +- **Text-to-Speech (TTS)**: Kokoro (82m) for parallel synthesis with SSML and pronunciation control across 9 languages (EN, ES, FR, HI, IT, JA, PT, ZH); PocketTTS for streaming TTS with voice cloning support (EN, DE, ES, FR, IT, PT — 6L and 24L variants) - **Speaker Diarization (Online + Offline)**: Speaker separation and identification across audio streams. Streaming pipeline for real-time processing and offline batch pipeline with advanced clustering. - **Speaker Embedding Extraction**: Generate speaker embeddings for voice comparison and clustering, you can use this for speaker identification - **Voice Activity Detection (VAD)**: Voice activity detection with Silero models @@ -556,25 +556,36 @@ FluidAudio ships two TTS backends: ### PocketTTS Streaming-friendly TTS with voice cloning support from short audio samples. +Available language packs: `english` (default), `german`, `german_24l`, +`italian`, `italian_24l`, `portuguese`, `portuguese_24l`, `spanish`, +`spanish_24l`, `french_24l` (24-layer only — no 6-layer French upstream). ```swift import FluidAudio Task { - let manager = try await PocketTtsManager() - let audioData = try await manager.synthesize("Hello from FluidAudio.") + let manager = PocketTtsManager(language: .spanish) + try await manager.initialize() + let audioData = try await manager.synthesize(text: "Hola, mundo.") try audioData.write(to: URL(fileURLWithPath: "out.wav")) } ``` ```bash -# Synthesize with default voice +# English (default) swift run fluidaudiocli tts "Hello from FluidAudio." 
--output out.wav --backend pocket -# Clone a voice from an audio sample +# Other languages +swift run fluidaudiocli tts "Hola mundo" --backend pocket --language spanish --output es.wav +swift run fluidaudiocli tts "Bonjour" --backend pocket --language french_24l --output fr.wav + +# Clone a voice from an audio sample (works with any language pack) swift run fluidaudiocli tts "Hello world." --output out.wav --backend pocket --clone-voice speaker.wav ``` +See [Documentation/TTS/PocketTTS.md](Documentation/TTS/PocketTTS.md#languages) +for the full language table. + ### Kokoro High-quality parallel TTS with SSML and phoneme-level pronunciation control. Uses a CoreML G2P (grapheme-to-phoneme) model for out-of-vocabulary words — no external dependencies required. diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index b8caa0c73..9994ba7a8 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -539,30 +539,81 @@ public enum ModelNames { public static let condStep = "cond_step" public static let flowlmStep = "flowlm_step" public static let flowDecoder = "flow_decoder" - public static let mimiDecoder = "mimi_decoder_v2" + /// Legacy English (root-of-repo) Mimi decoder file basename. + public static let mimiDecoderLegacy = "mimi_decoder_v2" + /// New per-language `v2//` Mimi decoder file basename. 
+ public static let mimiDecoderV2 = "mimi_decoder" public static let mimiEncoder = "mimi_encoder" public static let condStepFile = condStep + ".mlmodelc" public static let flowlmStepFile = flowlmStep + ".mlmodelc" public static let flowDecoderFile = flowDecoder + ".mlmodelc" - public static let mimiDecoderFile = mimiDecoder + ".mlmodelc" + public static let mimiDecoderLegacyFile = mimiDecoderLegacy + ".mlmodelc" + public static let mimiDecoderV2File = mimiDecoderV2 + ".mlmodelc" public static let mimiEncoderFile = mimiEncoder + ".mlmodelc" + /// Backward-compatible alias used by callers expecting the English + /// (legacy root) Mimi decoder filename. + public static let mimiDecoder = mimiDecoderLegacy + public static let mimiDecoderFile = mimiDecoderLegacyFile + /// Directory containing binary constants, tokenizer, and voice data. public static let constantsBinDir = "constants_bin" - public static let requiredModels: Set = [ - condStepFile, - flowlmStepFile, - flowDecoderFile, - mimiDecoderFile, - constantsBinDir, - ] + /// Returns the Mimi decoder filename used inside this language's pack. + public static func mimiDecoderFile(for language: PocketTtsLanguage) -> String { + language == .english ? mimiDecoderLegacyFile : mimiDecoderV2File + } + + /// Required models inside the language root for the given language. + /// + /// English (legacy root) and other languages use different Mimi + /// decoder filenames, but all four model directories plus the + /// `constants_bin/` directory must be present. + public static func requiredModels(for language: PocketTtsLanguage) -> Set { + [ + condStepFile, + flowlmStepFile, + flowDecoderFile, + mimiDecoderFile(for: language), + constantsBinDir, + ] + } + + /// Backward-compatible English-only set (legacy root layout). + public static let requiredModels: Set = requiredModels(for: .english) /// Models required for voice cloning (optional feature). 
public static let voiceCloningModels: Set = [ mimiEncoderFile ] + + // MARK: - Int8 (weight-only quantized) variants + + /// Filename for the int8 variant of `cond_step` (always + /// `cond_step.mlmodelc`, regardless of language). + public static let condStepInt8File = condStepFile + + /// Filename for the int8 variant of `flowlm_step`. + public static let flowlmStepInt8File = flowlmStepFile + + /// Filename for the int8 variant of `flow_decoder`. + public static let flowDecoderInt8File = flowDecoderFile + + /// Filename for the int8 variant of `mimi_decoder`. + /// + /// Note: int8 packs always use the simple `mimi_decoder.mlmodelc` + /// basename (no `_v2` suffix), even for English — they were exported + /// from the unified v2 mlpackage. + public static let mimiDecoderInt8File = mimiDecoderV2File + + /// Returns the basename of an int8 submodel `.mlmodelc` for the given + /// fp16 basename. The mapping is identity for cond_step / flowlm_step + /// / flow_decoder, and rewrites `mimi_decoder_v2` → `mimi_decoder` for + /// English. + public static func int8File(forFp16File fp16File: String) -> String { + fp16File == mimiDecoderLegacyFile ? mimiDecoderV2File : fp16File + } } /// CosyVoice3 (Mandarin) model names. Files live on HuggingFace at diff --git a/Sources/FluidAudio/TTS/PocketTTS/Assets/PocketTtsResourceDownloader.swift b/Sources/FluidAudio/TTS/PocketTTS/Assets/PocketTtsResourceDownloader.swift index db44911e4..75a1029dc 100644 --- a/Sources/FluidAudio/TTS/PocketTTS/Assets/PocketTtsResourceDownloader.swift +++ b/Sources/FluidAudio/TTS/PocketTTS/Assets/PocketTtsResourceDownloader.swift @@ -1,52 +1,208 @@ import Foundation import OSLog +/// Resolved on-disk locations for PocketTTS submodels, honoring the requested +/// per-submodel quantization configuration. +public struct PocketTtsResolvedModels: Sendable { + /// Local cache directory mirroring the HuggingFace repo root. 
+ public let repoDir: URL + /// Language root: legacy repo root for English, `/v2//` otherwise. + /// Always contains `constants_bin/` regardless of quantization choices. + public let languageRoot: URL + /// Resolved `.mlmodelc` URL for `cond_step` (fp16 or int8). + public let condStepURL: URL + /// Resolved `.mlmodelc` URL for `flowlm_step` (fp16 or int8). + public let flowlmStepURL: URL + /// Resolved `.mlmodelc` URL for `flow_decoder` (fp16 or int8). + public let flowDecoderURL: URL + /// Resolved `.mlmodelc` URL for `mimi_decoder` (fp16 or int8). + public let mimiDecoderURL: URL + /// Quantization that was applied to resolve the URLs above. + public let quantization: PocketTtsQuantization +} + /// Downloads PocketTTS models and constants from HuggingFace. public enum PocketTtsResourceDownloader { private static let logger = AppLogger(category: "PocketTtsResourceDownloader") - /// Ensure all PocketTTS models are downloaded and return the cache directory. + /// Ensure all PocketTTS models for the given language are downloaded and + /// return the **language root** directory. + /// + /// Backwards-compatible overload: defaults to all-fp16 quantization and + /// returns just the language root. New callers should prefer + /// ``ensureResolvedModels(language:quantization:directory:progressHandler:)``. /// /// - Parameters: + /// - language: Which upstream language pack to fetch. /// - directory: Optional override for the base cache directory. /// When `nil`, uses the default platform cache location. /// - progressHandler: Optional callback for download progress updates. + /// - Returns: The directory that contains the four `.mlmodelc` packages + /// plus `constants_bin/` for the requested language. For English this + /// is the legacy repo root; for other languages it's + /// `/v2//`. public static func ensureModels( + language: PocketTtsLanguage = .english, directory: URL? = nil, progressHandler: DownloadUtils.ProgressHandler? 
= nil ) async throws -> URL { + let resolved = try await ensureResolvedModels( + language: language, + quantization: .allFp16, + directory: directory, + progressHandler: progressHandler + ) + return resolved.languageRoot + } + + /// Ensure all PocketTTS submodels and constants are downloaded for the + /// given language and quantization configuration, and return resolved + /// per-submodel URLs. + /// + /// The fp16 language pack (and its `constants_bin/`) is always ensured, + /// since constants and tokenizer live there. For each submodel marked + /// `int8`, the corresponding subtree under + /// `languages//int8/.mlmodelc` is fetched on top. + public static func ensureResolvedModels( + language: PocketTtsLanguage = .english, + quantization: PocketTtsQuantization = .allFp16, + directory: URL? = nil, + progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> PocketTtsResolvedModels { let targetDir = try directory ?? cacheDirectory() let modelsDirectory = targetDir.appendingPathComponent( PocketTtsConstants.defaultModelsSubdirectory) let repoDir = modelsDirectory.appendingPathComponent(Repo.pocketTts.folderName) - // Check that all required directories exist (models + constants_bin) - let requiredModels = ModelNames.PocketTTS.requiredModels + let languageRoot: URL + if let subdir = language.repoSubdirectory { + languageRoot = repoDir.appendingPathComponent(subdir) + } else { + languageRoot = repoDir + } + + // Always ensure the fp16 language pack — constants_bin/ lives there + // and is needed regardless of which submodels are int8. 
+ let requiredModels = ModelNames.PocketTTS.requiredModels(for: language) let allPresent = requiredModels.allSatisfy { model in FileManager.default.fileExists( - atPath: repoDir.appendingPathComponent(model).path) + atPath: languageRoot.appendingPathComponent(model).path) } if !allPresent { - logger.info("Downloading PocketTTS models from HuggingFace...") - try await DownloadUtils.downloadRepo(.pocketTts, to: modelsDirectory, progressHandler: progressHandler) + if let subdir = language.repoSubdirectory { + logger.info( + "Downloading PocketTTS \(language.rawValue) language pack from HuggingFace (\(subdir))..." + ) + try await DownloadUtils.downloadSubdirectory( + .pocketTts, + subdirectory: subdir, + to: repoDir + ) + } else { + logger.info("Downloading PocketTTS English models from HuggingFace...") + try await DownloadUtils.downloadRepo( + .pocketTts, to: modelsDirectory, progressHandler: progressHandler) + } } else { - logger.info("PocketTTS models found in cache") + logger.info( + "PocketTTS \(language.rawValue) models found in cache") + } + + // Fetch any int8 variants requested. Each int8 submodel's mlmodelc is + // a self-contained subtree at `languages//int8/.mlmodelc`. 
+ if quantization.hasAnyInt8 { + let int8RemoteRoot = language.int8RepoSubdirectory + let int8LocalRoot = repoDir.appendingPathComponent(int8RemoteRoot) + + var int8Files: [String] = [] + if quantization.condStep == .int8 { + int8Files.append(ModelNames.PocketTTS.condStepInt8File) + } + if quantization.flowlmStep == .int8 { + int8Files.append(ModelNames.PocketTTS.flowlmStepInt8File) + } + if quantization.flowDecoder == .int8 { + int8Files.append(ModelNames.PocketTTS.flowDecoderInt8File) + } + if quantization.mimiDecoder == .int8 { + int8Files.append(ModelNames.PocketTTS.mimiDecoderInt8File) + } + + for file in int8Files { + let localPath = int8LocalRoot.appendingPathComponent(file) + if FileManager.default.fileExists(atPath: localPath.path) { + continue + } + let remoteSub = "\(int8RemoteRoot)/\(file)" + logger.info( + "Downloading PocketTTS \(language.rawValue) int8 submodel: \(remoteSub)..." + ) + try await DownloadUtils.downloadSubdirectory( + .pocketTts, + subdirectory: remoteSub, + to: repoDir + ) + } + } + + // Resolve per-submodel URLs. 
+ let int8Root = repoDir.appendingPathComponent(language.int8RepoSubdirectory) + + func resolved(_ precision: PocketTtsModelPrecision, fp16File: String, int8File: String) -> URL { + switch precision { + case .fp16: + return languageRoot.appendingPathComponent(fp16File) + case .int8: + return int8Root.appendingPathComponent(int8File) + } } - return repoDir + let condURL = resolved( + quantization.condStep, + fp16File: ModelNames.PocketTTS.condStepFile, + int8File: ModelNames.PocketTTS.condStepInt8File) + let flowlmURL = resolved( + quantization.flowlmStep, + fp16File: ModelNames.PocketTTS.flowlmStepFile, + int8File: ModelNames.PocketTTS.flowlmStepInt8File) + let flowURL = resolved( + quantization.flowDecoder, + fp16File: ModelNames.PocketTTS.flowDecoderFile, + int8File: ModelNames.PocketTTS.flowDecoderInt8File) + let mimiURL = resolved( + quantization.mimiDecoder, + fp16File: ModelNames.PocketTTS.mimiDecoderFile(for: language), + int8File: ModelNames.PocketTTS.mimiDecoderInt8File) + + return PocketTtsResolvedModels( + repoDir: repoDir, + languageRoot: languageRoot, + condStepURL: condURL, + flowlmStepURL: flowlmURL, + flowDecoderURL: flowURL, + mimiDecoderURL: mimiURL, + quantization: quantization + ) } /// Ensure the Mimi encoder model is downloaded for voice cloning. /// /// This is an optional model that's only needed for voice cloning functionality. /// It's downloaded separately from the main models to reduce initial download size. + /// The encoder is shared across all language packs and lives at the legacy + /// repo root regardless of which language is currently loaded — so a Spanish + /// (or any non-English) user can clone a voice without pulling in the + /// English language pack. /// - Parameter directory: Optional override for the base cache directory. /// When `nil`, uses the default platform cache location. public static func ensureMimiEncoder(directory: URL? 
= nil) async throws -> URL { - let repoDir = try await ensureModels(directory: directory) + let targetDir = try directory ?? cacheDirectory() + let modelsDirectory = targetDir.appendingPathComponent( + PocketTtsConstants.defaultModelsSubdirectory) + let repoDir = modelsDirectory.appendingPathComponent(Repo.pocketTts.folderName) let encoderPath = repoDir.appendingPathComponent(ModelNames.PocketTTS.mimiEncoderFile) if FileManager.default.fileExists(atPath: encoderPath.path) { @@ -54,6 +210,11 @@ public enum PocketTtsResourceDownloader { return encoderPath } + // Make sure the parent directory exists — the user may not have + // downloaded any language pack yet. + try FileManager.default.createDirectory( + at: repoDir, withIntermediateDirectories: true) + logger.info("Downloading Mimi encoder for voice cloning...") try await downloadMimiEncoder(to: repoDir) @@ -74,36 +235,59 @@ public enum PocketTtsResourceDownloader { } /// Ensure constants (binary blobs + tokenizer) are available. - public static func ensureConstants(repoDirectory: URL) throws -> PocketTtsConstantsBundle { - try PocketTtsConstantsLoader.load(from: repoDirectory) + /// + /// - Parameter languageRoot: The directory returned by `ensureModels(...)`, + /// which contains the language-specific `constants_bin/`. + public static func ensureConstants(languageRoot: URL) throws -> PocketTtsConstantsBundle { + try PocketTtsConstantsLoader.load(from: languageRoot) } - /// Ensure voice conditioning data is available, downloading from HuggingFace if missing. + /// Ensure voice conditioning data for the given language is available, + /// downloading from HuggingFace if missing. + /// + /// - Parameters: + /// - voice: Voice name (e.g. `"alba"`, `"michael"`). + /// - language: Language pack the voice belongs to. Voice files are + /// per-language (same names, different acoustic embeddings). + /// - languageRoot: The directory returned by `ensureModels(language:)`. 
public static func ensureVoice( - _ voice: String, repoDirectory: URL + _ voice: String, + language: PocketTtsLanguage = .english, + languageRoot: URL ) async throws -> PocketTtsVoiceData { let sanitized = voice.filter { $0.isLetter || $0.isNumber || $0 == "_" } guard !sanitized.isEmpty else { throw PocketTTSError.processingFailed("Invalid voice name: \(voice)") } - let constantsDir = repoDirectory.appendingPathComponent(ModelNames.PocketTTS.constantsBinDir) + let constantsDir = languageRoot.appendingPathComponent(ModelNames.PocketTTS.constantsBinDir) let voiceFile = "\(sanitized)_audio_prompt.bin" let voiceURL = constantsDir.appendingPathComponent(voiceFile) if !FileManager.default.fileExists(atPath: voiceURL.path) { - logger.info("Downloading voice '\(sanitized)' from HuggingFace...") - let remotePath = "constants_bin/\(voiceFile)" + logger.info( + "Downloading voice '\(sanitized)' for \(language.rawValue) from HuggingFace...") + let remotePrefix: String + if let subdir = language.repoSubdirectory { + remotePrefix = "\(subdir)/" + } else { + remotePrefix = "" + } + let remotePath = "\(remotePrefix)constants_bin/\(voiceFile)" let remoteURL = try ModelRegistry.resolveModel(Repo.pocketTts.remotePath, remotePath) let data = try await AssetDownloader.fetchData( from: remoteURL, - description: "\(sanitized) voice prompt", + description: "\(sanitized) voice prompt (\(language.rawValue))", logger: logger ) + // Make sure the parent directory exists in case this is a fresh + // language pack that hasn't materialized constants_bin/ yet. 
+ try FileManager.default.createDirectory( + at: constantsDir, withIntermediateDirectories: true) try data.write(to: voiceURL, options: [.atomic]) logger.info("Downloaded voice '\(sanitized)' (\(data.count / 1024) KB)") } - return try PocketTtsConstantsLoader.loadVoice(voice, from: repoDirectory) + return try PocketTtsConstantsLoader.loadVoice(voice, from: languageRoot) } // MARK: - Private diff --git a/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsLayerKeys.swift b/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsLayerKeys.swift new file mode 100644 index 000000000..dc37702a7 --- /dev/null +++ b/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsLayerKeys.swift @@ -0,0 +1,187 @@ +@preconcurrency import CoreML +import Foundation + +/// Discovered CoreML output names for one transformer model (cond_step or +/// flowlm_step). +/// +/// CoreML auto-generates output names during tracing (`new_cache_N_internal_tensor_assign_2`, +/// `var_NNN`) and the exact numeric suffixes differ between 6L and 24L packs. +/// Rather than hardcoding the names per pack, we scan the model's output +/// description at load time and group outputs by tensor shape: +/// +/// - `[2, 1, kvCacheMaxLen, 16, 64]` → KV cache (one per layer) +/// - `[1]` → position scalar (one per layer) +/// - `[1, 1, transformerDim]` → transformer hidden state (flowlm_step only) +/// - `[1, 1, 1]` → EOS logit (flowlm_step only) +/// +/// Within each group we order by the numeric suffix in the name. Cache names +/// follow the closed form `new_cache_{2*i+1}_internal_tensor_assign_2` for +/// layers 0..N-2 with the last layer being `new_cache_internal_tensor_assign_2` +/// (no number — sorted last). Position names use `var_NNN` with irregular +/// strides that nevertheless increase monotonically per layer. +struct PocketTtsLayerKeys: Sendable { + /// One cache output name per transformer layer, ordered by layer index. 
+ let cacheKeys: [String] + /// One position output name per transformer layer, ordered by layer index. + let positionKeys: [String] + /// Hidden-state output name (flowlm_step only). `nil` for cond_step. + let transformerOut: String? + /// EOS logit output name (flowlm_step only). `nil` for cond_step. + let eosLogit: String? + + var layerCount: Int { cacheKeys.count } + + enum DiscoveryError: Error, LocalizedError { + case shapeMismatch(modelName: String, expectedLayers: Int, actualCaches: Int) + case missingFlowLMOutputs(modelName: String, hasTransformer: Bool, hasEos: Bool) + + var errorDescription: String? { + switch self { + case .shapeMismatch(let modelName, let expected, let actual): + return + "PocketTTS layer-key discovery on \(modelName): expected \(expected) cache outputs, found \(actual)" + case .missingFlowLMOutputs(let modelName, let hasTransformer, let hasEos): + return + "PocketTTS \(modelName) missing flowlm outputs (transformer=\(hasTransformer), eos=\(hasEos))" + } + } + } + + /// Discover the output keys for a `cond_step` or `flowlm_step` CoreML model. + /// + /// - Parameters: + /// - model: The compiled CoreML model. + /// - kind: Which model this is — affects whether transformer/eos + /// outputs are required. + /// - expectedLayers: Optional sanity check for the layer count. + static func discover( + from model: MLModel, + kind: ModelKind, + expectedLayers: Int? = nil, + modelName: String + ) throws -> PocketTtsLayerKeys { + let outputs = model.modelDescription.outputDescriptionsByName + + // Bucket outputs by shape. + var cacheCandidates: [String] = [] + var positionCandidates: [String] = [] + var transformerCandidate: String? + var eosCandidate: String? 
+ + let cacheShape = [ + 2, 1, PocketTtsConstants.kvCacheMaxLen, 16, 64, + ] + let transformerShape = [1, 1, PocketTtsConstants.transformerDim] + let eosShape = [1, 1, 1] + let positionShape = [1] + + for (name, desc) in outputs { + guard let constraint = desc.multiArrayConstraint else { continue } + let shape = constraint.shape.map { $0.intValue } + + if shape == cacheShape { + cacheCandidates.append(name) + } else if shape == positionShape { + positionCandidates.append(name) + } else if shape == transformerShape { + transformerCandidate = name + } else if shape == eosShape { + eosCandidate = name + } + } + + // Sort caches by extracted numeric suffix; "new_cache_internal_..." + // (no number) sorts as "last" (largest layer index). + cacheCandidates.sort { lhs, rhs in + let li = cacheLayerIndex(from: lhs) ?? Int.max + let ri = cacheLayerIndex(from: rhs) ?? Int.max + if li != ri { return li < ri } + return lhs < rhs + } + + // Sort positions by trailing numeric suffix. + positionCandidates.sort { lhs, rhs in + let li = trailingNumber(in: lhs) ?? Int.max + let ri = trailingNumber(in: rhs) ?? 
Int.max + if li != ri { return li < ri } + return lhs < rhs + } + + if let expected = expectedLayers, cacheCandidates.count != expected { + throw DiscoveryError.shapeMismatch( + modelName: modelName, + expectedLayers: expected, + actualCaches: cacheCandidates.count + ) + } + + if positionCandidates.count != cacheCandidates.count { + throw DiscoveryError.shapeMismatch( + modelName: modelName, + expectedLayers: cacheCandidates.count, + actualCaches: positionCandidates.count + ) + } + + switch kind { + case .condStep: + return PocketTtsLayerKeys( + cacheKeys: cacheCandidates, + positionKeys: positionCandidates, + transformerOut: nil, + eosLogit: nil + ) + case .flowlmStep: + guard let transformerOut = transformerCandidate, let eosLogit = eosCandidate else { + throw DiscoveryError.missingFlowLMOutputs( + modelName: modelName, + hasTransformer: transformerCandidate != nil, + hasEos: eosCandidate != nil + ) + } + return PocketTtsLayerKeys( + cacheKeys: cacheCandidates, + positionKeys: positionCandidates, + transformerOut: transformerOut, + eosLogit: eosLogit + ) + } + } + + enum ModelKind { + case condStep + case flowlmStep + } + + // MARK: - Name parsing + + /// Extract the layer index from a cache output name. + /// + /// Pattern: + /// - `new_cache__internal_tensor_assign_2` → returns `(N - 1) / 2` + /// - `new_cache_internal_tensor_assign_2` → returns `nil` (sorts last) + private static func cacheLayerIndex(from name: String) -> Int? { + // Strip the "new_cache_" prefix, then take everything up to the next "_". + guard name.hasPrefix("new_cache_") else { return nil } + let after = name.dropFirst("new_cache_".count) + guard let underscore = after.firstIndex(of: "_") else { return nil } + let head = after[.. Int? 
{ + var digits = "" + for char in name.reversed() { + if char.isNumber { + digits.append(char) + } else { + break + } + } + guard !digits.isEmpty else { return nil } + return Int(String(digits.reversed())) + } +} diff --git a/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsMimiSchema.swift b/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsMimiSchema.swift new file mode 100644 index 000000000..236c205ff --- /dev/null +++ b/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsMimiSchema.swift @@ -0,0 +1,182 @@ +@preconcurrency import CoreML +import Foundation + +/// Discovered Mimi decoder I/O schema. +/// +/// The Mimi decoder takes a `latent` input + N state tensors that carry the +/// streaming context across frames, and returns an audio waveform + N updated +/// state tensors. +/// +/// Two schema variants exist in the wild: +/// +/// - **v2 (semantic, mobius post-rename):** outputs are renamed to +/// `audio` + `_out` at conversion time, so discovery is a +/// trivial pairing. +/// +/// - **v1 (legacy, current HF FluidInference/pocket-tts-coreml English):** +/// outputs use CoreML's auto-generated `var_NNN` / `cast_NN` names. Inputs +/// are still semantic, so we fall back to a hand-curated input → output +/// table baked at conversion time. +/// +/// Discovery checks for the v2 convention first (zero hardcoded names) and +/// falls back to v1 only when the model lacks an `audio` output. +struct PocketTtsMimiSchema: Sendable { + + /// Output name carrying the audio waveform (`[1, 1, samplesPerFrame]`). + let audioOutputName: String + + /// Pairing of state input name → corresponding state output name. + /// Order is preserved from the model's input description so callers can + /// iterate deterministically when copying state forward. + let stateMapping: [(input: String, output: String)] + + /// Expected MLMultiArray shape for each state input (keyed by input name). 
+ /// Used to zero-init the streaming state directly from the model's + /// description without depending on a sidecar manifest file. + let stateInputShapes: [String: [Int]] + + /// Expected MLMultiArray dtype for state inputs (uniform across the + /// model — all FP32 for v1, all FP16 for v2). + let stateInputDataType: MLMultiArrayDataType + + /// Expected MLMultiArray dtype for the `latent` input (matches state + /// dtype in practice, but kept separate for safety). + let latentDataType: MLMultiArrayDataType + + /// Names of all state inputs (ordered). + var stateInputNames: [String] { stateMapping.map { $0.input } } + + enum DiscoveryError: Error, LocalizedError { + case missingAudioOutput(modelName: String, candidates: [String]) + case missingStateOutput(inputName: String, expectedOutput: String) + + var errorDescription: String? { + switch self { + case .missingAudioOutput(let modelName, let candidates): + return + "PocketTTS \(modelName): could not find audio output (no `audio` output and no v1 fallback hit). Outputs seen: \(candidates)" + case .missingStateOutput(let input, let expected): + return + "PocketTTS mimi schema: state input `\(input)` has no matching output `\(expected)`" + } + } + } + + /// Discover the Mimi schema from a loaded MLModel. + static func discover(from model: MLModel) throws -> PocketTtsMimiSchema { + let inputs = model.modelDescription.inputDescriptionsByName + let outputs = model.modelDescription.outputDescriptionsByName + + // State inputs = every input except `latent`. 
+ let stateInputNames = inputs.keys.filter { $0 != "latent" } + + // ── Path A: v2 semantic schema ────────────────────────────────── + if outputs["audio"] != nil { + var mapping: [(String, String)] = [] + var shapes: [String: [Int]] = [:] + var stateDtype: MLMultiArrayDataType = .float32 + for inputName in stateInputNames { + // Pass-through state outputs (the `*_first` scalars and + // zero-length `res*_conv1_prev` tensors) share an SSA value + // with their input parameter and cannot be safely renamed at + // conversion time — they keep the bare input name. All other + // state outputs follow the `_out` convention. + let suffixed = "\(inputName)_out" + let outputName: String + if outputs[suffixed] != nil { + outputName = suffixed + } else if outputs[inputName] != nil { + outputName = inputName // pass-through alias + } else { + throw DiscoveryError.missingStateOutput( + inputName: inputName, expectedOutput: suffixed) + } + mapping.append((inputName, outputName)) + if let constraint = inputs[inputName]?.multiArrayConstraint { + shapes[inputName] = constraint.shape.map { $0.intValue } + stateDtype = constraint.dataType + } + } + let latentDtype = inputs["latent"]?.multiArrayConstraint?.dataType ?? .float32 + // Stable order: by input name (deterministic across runs). + mapping.sort { $0.0 < $1.0 } + return PocketTtsMimiSchema( + audioOutputName: "audio", stateMapping: mapping, stateInputShapes: shapes, + stateInputDataType: stateDtype, latentDataType: latentDtype) + } + + // ── Path B: v1 legacy schema (HF FluidInference current English) ─ + // Pre-shipped models use auto-generated var_NNN / cast_NN names with + // a known mapping baked at conversion time. + if let v1 = legacyV1Schema, outputs[v1.audioOutputName] != nil { + // Verify all expected state outputs exist; if any are missing, + // fall through to error so we surface schema drift loudly. 
+ var verified: [(String, String)] = [] + var shapes: [String: [Int]] = [:] + var stateDtype: MLMultiArrayDataType = .float32 + for (inp, out) in v1.stateMapping { + guard inputs[inp] != nil else { continue } // input dropped + guard outputs[out] != nil else { + throw DiscoveryError.missingStateOutput(inputName: inp, expectedOutput: out) + } + verified.append((inp, out)) + if let constraint = inputs[inp]?.multiArrayConstraint { + shapes[inp] = constraint.shape.map { $0.intValue } + stateDtype = constraint.dataType + } + } + let latentDtype = inputs["latent"]?.multiArrayConstraint?.dataType ?? .float32 + return PocketTtsMimiSchema( + audioOutputName: v1.audioOutputName, stateMapping: verified, + stateInputShapes: shapes, stateInputDataType: stateDtype, + latentDataType: latentDtype) + } + + throw DiscoveryError.missingAudioOutput( + modelName: "mimi_decoder", candidates: Array(outputs.keys).sorted()) + } + + // MARK: - Legacy v1 fallback (current HF English pack) + + /// Legacy v1 mapping kept for backward compatibility with the + /// FluidInference/pocket-tts-coreml English pack on HF (which still uses + /// `var_NNN`/`cast_NN` output names from before semantic renaming was + /// added to the converter). + /// + /// Once the v2 (FP16, semantic) pack is uploaded and the cache invalidated, + /// this fallback can be removed. + private static let legacyV1Schema: PocketTtsMimiSchema? 
= PocketTtsMimiSchema( + audioOutputName: "var_821", + stateMapping: [ + ("upsample_partial", "var_82"), + ("attn0_cache", "var_262"), + ("attn0_offset", "var_840"), + ("attn0_end_offset", "new_end_offset_1"), + ("attn1_cache", "var_479"), + ("attn1_offset", "var_843"), + ("attn1_end_offset", "new_end_offset"), + ("conv0_prev", "var_607"), + ("conv0_first", "conv0_first"), + ("convtr0_partial", "var_634"), + ("res0_conv0_prev", "var_660"), + ("res0_conv0_first", "res0_conv0_first"), + ("res0_conv1_prev", "res0_conv1_prev"), + ("res0_conv1_first", "res0_conv1_first"), + ("convtr1_partial", "var_700"), + ("res1_conv0_prev", "var_726"), + ("res1_conv0_first", "res1_conv0_first"), + ("res1_conv1_prev", "res1_conv1_prev"), + ("res1_conv1_first", "res1_conv1_first"), + ("convtr2_partial", "var_766"), + ("res2_conv0_prev", "var_792"), + ("res2_conv0_first", "res2_conv0_first"), + ("res2_conv1_prev", "res2_conv1_prev"), + ("res2_conv1_first", "res2_conv1_first"), + ("conv_final_prev", "var_824"), + ("conv_final_first", "conv_final_first"), + ], + stateInputShapes: [:], // populated from model description in `discover` + stateInputDataType: .float32, + latentDataType: .float32 + ) +} diff --git a/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsModelStore.swift b/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsModelStore.swift index 3f93f10a3..af48d3458 100644 --- a/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsModelStore.swift +++ b/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsModelStore.swift @@ -7,6 +7,9 @@ import OSLog /// Manages loading and storing of the four CoreML models /// (cond_step, flowlm_step, flow_decoder, mimi_decoder), /// the binary constants bundle, and voice conditioning data. +/// +/// A store is bound to a single `PocketTtsLanguage` for its lifetime; switch +/// languages by creating a new store/manager. 
public actor PocketTtsModelStore { private let logger = AppLogger(subsystem: "com.fluidaudio.tts", category: "PocketTtsModelStore") @@ -18,12 +21,29 @@ public actor PocketTtsModelStore { private var mimiEncoderModel: MLModel? private var constantsBundle: PocketTtsConstantsBundle? private var voiceCache: [String: PocketTtsVoiceData] = [:] - private var repoDirectory: URL? + private var languageRootDirectory: URL? + private var condLayerKeys: PocketTtsLayerKeys? + private var flowlmLayerKeys: PocketTtsLayerKeys? + private var mimiSchema: PocketTtsMimiSchema? private let directory: URL? + public let language: PocketTtsLanguage + public let quantization: PocketTtsQuantization - /// - Parameter directory: Optional override for the base cache directory. - /// When `nil`, uses the default platform cache location. - public init(directory: URL? = nil) { + /// - Parameters: + /// - language: Which upstream language pack to load. Defaults to + /// `.english` for backward compatibility. + /// - quantization: Per-submodel precision (fp16/int8). Defaults to + /// all-fp16. Each submodel can be independently swapped — see + /// ``PocketTtsQuantization`` for presets and quality tradeoffs. + /// - directory: Optional override for the base cache directory. When + /// `nil`, uses the default platform cache location. + public init( + language: PocketTtsLanguage = .english, + quantization: PocketTtsQuantization = .allFp16, + directory: URL? 
= nil + ) { + self.language = language + self.quantization = quantization self.directory = directory } @@ -31,34 +51,66 @@ public actor PocketTtsModelStore { public func loadIfNeeded() async throws { guard condStepModel == nil else { return } - let repoDir = try await PocketTtsResourceDownloader.ensureModels(directory: directory) - self.repoDirectory = repoDir + let resolved = try await PocketTtsResourceDownloader.ensureResolvedModels( + language: language, + quantization: quantization, + directory: directory + ) + self.languageRootDirectory = resolved.languageRoot - logger.info("Loading PocketTTS CoreML models...") + logger.info( + "Loading PocketTTS CoreML models (language=\(self.language.rawValue), quant=cond:\(self.quantization.condStep.rawValue),flowlm:\(self.quantization.flowlmStep.rawValue),flow:\(self.quantization.flowDecoder.rawValue),mimi:\(self.quantization.mimiDecoder.rawValue))..." + ) - // Use CPU+GPU for all models to avoid ANE float16 precision loss. - // The ANE processes in native float16, which causes audible artifacts - // in the Mimi decoder's streaming state feedback loop and may degrade - // quality in the other models. CPU/GPU compute in float32 matches the - // Python reference implementation. - let config = MLModelConfiguration() - config.computeUnits = .cpuAndGPU + // Per-model compute units — profiled on Apple Silicon, FP16 mlpackages: + // + // model units why + // ──────────── ─────────────── ───────────────────────────────────────── + // cond_step .cpuAndGPU ANE ≈ GPU (no benefit), and ANE prefill + // occasionally hits MPSGraph rank-5/zero- + // shape assert. GPU path is robust. + // flowlm_step .all 1.97× faster on ANE than GPU; this is + // the autoregressive bottleneck (called + // once per output frame). + // flow_decoder .all Tiny model called 8× per frame; + // CPU+NE/ALL are both fast. + // mimi_decoder .cpuOnly GPU dispatch overhead exceeds GPU gain + // on this small streaming-conv model + // (~1.74× faster on CPU). 
Cannot use ANE: + // segfaults on 64-byte stride misalign in + // some state tensors. + // + // FP32 mlpackages: CoreML still chooses ANE for ANE-eligible ops at + // FP16 internally; precision loss is bounded by the ops dispatched + // (autoregressive softmax/lm-head still run at FP32 on CPU/GPU). + let condConfig = MLModelConfiguration() + condConfig.computeUnits = .cpuAndGPU + let flowlmConfig = MLModelConfiguration() + flowlmConfig.computeUnits = .all + let flowConfig = MLModelConfiguration() + flowConfig.computeUnits = .all + let mimiConfig = MLModelConfiguration() + mimiConfig.computeUnits = .cpuOnly let loadStart = Date() - let modelFiles = [ - ModelNames.PocketTTS.condStepFile, - ModelNames.PocketTTS.flowlmStepFile, - ModelNames.PocketTTS.flowDecoderFile, - ModelNames.PocketTTS.mimiDecoderFile, + // Each submodel may live at the language root (fp16) or under + // `languages//int8/` (int8). The resolver above gives us + // pre-validated URLs. + let modelLoads: [(URL, MLModelConfiguration, PocketTtsModelPrecision)] = [ + (resolved.condStepURL, condConfig, quantization.condStep), + (resolved.flowlmStepURL, flowlmConfig, quantization.flowlmStep), + (resolved.flowDecoderURL, flowConfig, quantization.flowDecoder), + (resolved.mimiDecoderURL, mimiConfig, quantization.mimiDecoder), ] var loadedModels: [MLModel] = [] - for file in modelFiles { - let modelURL = repoDir.appendingPathComponent(file) + for (modelURL, config, precision) in modelLoads { let model = try MLModel(contentsOf: modelURL, configuration: config) loadedModels.append(model) - logger.info("Loaded \(file)") + logger.info( + "Loaded \(modelURL.lastPathComponent) [\(precision.rawValue)] (units=\(config.computeUnits.rawValue))" + ) } condStepModel = loadedModels[0] @@ -66,12 +118,36 @@ public actor PocketTtsModelStore { flowDecoderModel = loadedModels[2] mimiDecoderModel = loadedModels[3] + // Discover per-model output names. 
Names differ between 6L and 24L + // packs because CoreML auto-generates them during tracing. + let expectedLayers = language.transformerLayers + condLayerKeys = try PocketTtsLayerKeys.discover( + from: loadedModels[0], + kind: .condStep, + expectedLayers: expectedLayers, + modelName: "cond_step" + ) + flowlmLayerKeys = try PocketTtsLayerKeys.discover( + from: loadedModels[1], + kind: .flowlmStep, + expectedLayers: expectedLayers, + modelName: "flowlm_step" + ) + + // Discover Mimi I/O schema (semantic v2 names preferred, falls back to + // hardcoded v1 names for the legacy English pack). + mimiSchema = try PocketTtsMimiSchema.discover(from: loadedModels[3]) + logger.info( + "Mimi schema: audio=\(self.mimiSchema?.audioOutputName ?? "?"), states=\(self.mimiSchema?.stateMapping.count ?? 0)" + ) + let elapsed = Date().timeIntervalSince(loadStart) logger.info("All PocketTTS models loaded in \(String(format: "%.2f", elapsed))s") - // Load constants + // Load constants from the fp16 language root (constants_bin/ is not + // duplicated in the int8 tree). constantsBundle = try PocketTtsResourceDownloader.ensureConstants( - repoDirectory: repoDir) + languageRoot: resolved.languageRoot) logger.info("PocketTTS constants loaded") } @@ -115,9 +191,35 @@ public actor PocketTtsModelStore { return bundle } - /// The repository directory containing models and constants. + /// Discovered output names for the cond_step transformer model. + func condStepLayerKeys() throws -> PocketTtsLayerKeys { + guard let keys = condLayerKeys else { + throw PocketTTSError.modelNotFound("PocketTTS cond_step layer keys not discovered") + } + return keys + } + + /// Discovered output names for the flowlm_step transformer model. + func flowLMStepLayerKeys() throws -> PocketTtsLayerKeys { + guard let keys = flowlmLayerKeys else { + throw PocketTTSError.modelNotFound("PocketTTS flowlm_step layer keys not discovered") + } + return keys + } + + /// Discovered I/O schema for the Mimi decoder model. 
+ func mimiSchemaKeys() throws -> PocketTtsMimiSchema { + guard let schema = mimiSchema else { + throw PocketTTSError.modelNotFound("PocketTTS mimi_decoder schema not discovered") + } + return schema + } + + /// The language root directory (legacy repo root for English, or + /// `/v2/` otherwise) — contains the four model files, + /// `constants_bin/`, and is the right base for `loadMimiInitialState`. public func repoDir() throws -> URL { - guard let dir = repoDirectory else { + guard let dir = languageRootDirectory else { throw PocketTTSError.modelNotFound("PocketTTS repository not loaded") } return dir @@ -128,10 +230,14 @@ public actor PocketTtsModelStore { if let cached = voiceCache[voice] { return cached } - guard let repoDir = repoDirectory else { + guard let languageRoot = languageRootDirectory else { throw PocketTTSError.modelNotFound("PocketTTS repository not loaded") } - let data = try await PocketTtsResourceDownloader.ensureVoice(voice, repoDirectory: repoDir) + let data = try await PocketTtsResourceDownloader.ensureVoice( + voice, + language: language, + languageRoot: languageRoot + ) voiceCache[voice] = data return data } @@ -140,18 +246,15 @@ public actor PocketTtsModelStore { /// Load the Mimi encoder model for voice cloning (lazy, on-demand). /// - /// Downloads the model from HuggingFace if not already cached. + /// Downloads the model from HuggingFace if not already cached. The Mimi + /// encoder is shared across all language packs and lives at the legacy + /// repo root. 
public func loadMimiEncoderIfNeeded() async throws { guard mimiEncoderModel == nil else { return } // Ensure the mimi_encoder is downloaded (downloads if needed) let modelURL = try await PocketTtsResourceDownloader.ensureMimiEncoder(directory: directory) - // Update repoDirectory if not set - if repoDirectory == nil { - repoDirectory = modelURL.deletingLastPathComponent() - } - let config = MLModelConfiguration() config.computeUnits = .cpuAndGPU @@ -174,8 +277,18 @@ public actor PocketTtsModelStore { /// Check if the Mimi encoder model is available. public func isMimiEncoderAvailable() -> Bool { - guard let repoDir = repoDirectory else { return false } - let modelURL = repoDir.appendingPathComponent(ModelNames.PocketTTS.mimiEncoderFile) + // The Mimi encoder always lives at the repo root regardless of the + // currently selected language pack. + let repoRoot: URL + if let langRoot = languageRootDirectory { + repoRoot = + (language.repoSubdirectory == nil) + ? langRoot + : langRoot.deletingLastPathComponent().deletingLastPathComponent() + } else { + return false + } + let modelURL = repoRoot.appendingPathComponent(ModelNames.PocketTTS.mimiEncoderFile) return FileManager.default.fileExists(atPath: modelURL.path) } diff --git a/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsSession.swift b/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsSession.swift index b96f12dc7..e054be469 100644 --- a/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsSession.swift +++ b/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsSession.swift @@ -58,6 +58,9 @@ public actor PocketTtsSession { private let stepModel: MLModel private let flowModel: MLModel private let mimiModel: MLModel + private let mimiSchema: PocketTtsMimiSchema + private let condLayerKeys: PocketTtsLayerKeys + private let flowlmLayerKeys: PocketTtsLayerKeys // Persistent state private let voiceKVSnapshot: PocketTtsSynthesizer.KVCacheState @@ -80,6 +83,9 @@ public actor PocketTtsSession { stepModel: MLModel, 
flowModel: MLModel, mimiModel: MLModel, + mimiSchema: PocketTtsMimiSchema, + condLayerKeys: PocketTtsLayerKeys, + flowlmLayerKeys: PocketTtsLayerKeys, bosEmb: MLMultiArray, temperature: Float, seed: UInt64 @@ -91,6 +97,9 @@ public actor PocketTtsSession { self.stepModel = stepModel self.flowModel = flowModel self.mimiModel = mimiModel + self.mimiSchema = mimiSchema + self.condLayerKeys = condLayerKeys + self.flowlmLayerKeys = flowlmLayerKeys self.bosEmb = bosEmb self.temperature = temperature self.rng = SeededRNG(seed: seed) @@ -172,7 +181,8 @@ public actor PocketTtsSession { // Clone voice KV snapshot and prefill text tokens only var kvState = try PocketTtsSynthesizer.cloneKVCacheState(voiceKVSnapshot) kvState = try await PocketTtsSynthesizer.prefillKVCacheText( - state: kvState, textEmbeddings: textEmbeddings, model: condModel + state: kvState, textEmbeddings: textEmbeddings, model: condModel, + layerKeys: condLayerKeys ) // Generation loop @@ -190,7 +200,8 @@ public actor PocketTtsSession { sequence: sequence, bosEmb: bosEmb, state: &localKV, - model: stepModel + model: stepModel, + layerKeys: flowlmLayerKeys ) kvState = localKV @@ -219,7 +230,8 @@ public actor PocketTtsSession { let frameSamples = try await PocketTtsSynthesizer.runMimiDecoder( latent: latent, state: &localMimi, - model: mimiModel + model: mimiModel, + schema: mimiSchema ) mimiState = localMimi diff --git a/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsSynthesizer+KVCache.swift b/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsSynthesizer+KVCache.swift index f41d55347..b9716f394 100644 --- a/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsSynthesizer+KVCache.swift +++ b/Sources/FluidAudio/TTS/PocketTTS/Pipeline/PocketTtsSynthesizer+KVCache.swift @@ -9,20 +9,24 @@ extension PocketTtsSynthesizer { /// for every processed token. This avoids recomputing K/V for past tokens — /// each new step only computes its own K/V, then reads all cached K/V via attention. 
struct KVCacheState { - /// 6 KV cache arrays, each shaped `[2, 1, kvCacheMaxLen, 16, 64]`: + /// `N` KV cache arrays (one per transformer layer), each shaped + /// `[2, 1, kvCacheMaxLen, 16, 64]`: /// - `2`: K and V tensors (index 0 = keys, index 1 = values) /// - `1`: batch size /// - `kvCacheMaxLen` (512): pre-allocated position slots /// - `16`: attention heads /// - `64`: dims per head (16 × 64 = 1024 total) + /// + /// `N` is 6 for the legacy English / 6L packs, and 24 for `*_24l` + /// packs. var caches: [MLMultiArray] - /// 6 position counters (one per layer) tracking the next write slot in the cache. + /// `N` position counters (one per layer) tracking the next write slot + /// in each cache. var positions: [MLMultiArray] } /// Create an empty KV cache state (all zeros, positions at 0). - static func emptyKVCacheState() throws -> KVCacheState { - let layers = PocketTtsConstants.kvCacheLayers + static func emptyKVCacheState(layers: Int) throws -> KVCacheState { let shape: [NSNumber] = [ 2, 1, NSNumber(value: PocketTtsConstants.kvCacheMaxLen), 16, 64, ] @@ -96,13 +100,15 @@ extension PocketTtsSynthesizer { static func runCondStep( conditioning: MLMultiArray, state: inout KVCacheState, - model: MLModel + model: MLModel, + layerKeys: PocketTtsLayerKeys ) async throws { + let layers = layerKeys.layerCount var inputDict: [String: Any] = [ "conditioning": conditioning ] - for i in 0.. 
KVCacheState { var state = state let dim = PocketTtsConstants.embeddingDim @@ -145,7 +152,8 @@ extension PocketTtsSynthesizer { offset: tokenIdx * dim, dim: dim ) - try await runCondStep(conditioning: token, state: &state, model: model) + try await runCondStep( + conditioning: token, state: &state, model: model, layerKeys: layerKeys) } return state @@ -158,14 +166,16 @@ extension PocketTtsSynthesizer { static func prefillKVCacheText( state: KVCacheState, textEmbeddings: [[Float]], - model: MLModel + model: MLModel, + layerKeys: PocketTtsLayerKeys ) async throws -> KVCacheState { var state = state let dim = PocketTtsConstants.embeddingDim for embedding in textEmbeddings { let token = try createConditioningToken(from: embedding, offset: 0, dim: dim) - try await runCondStep(conditioning: token, state: &state, model: model) + try await runCondStep( + conditioning: token, state: &state, model: model, layerKeys: layerKeys) } return state @@ -180,14 +190,15 @@ extension PocketTtsSynthesizer { static func prefillKVCache( voiceData: PocketTtsVoiceData, textEmbeddings: [[Float]], - model: MLModel + model: MLModel, + layerKeys: PocketTtsLayerKeys ) async throws -> KVCacheState { - let emptyState = try emptyKVCacheState() + let emptyState = try emptyKVCacheState(layers: layerKeys.layerCount) var state = try await prefillKVCacheVoice( - state: emptyState, voiceData: voiceData, model: model + state: emptyState, voiceData: voiceData, model: model, layerKeys: layerKeys ) state = try await prefillKVCacheText( - state: state, textEmbeddings: textEmbeddings, model: model + state: state, textEmbeddings: textEmbeddings, model: model, layerKeys: layerKeys ) let finalPos = state.positions[0][0].floatValue @@ -223,14 +234,23 @@ extension PocketTtsSynthesizer { sequence: MLMultiArray, bosEmb: MLMultiArray, state: inout KVCacheState, - model: MLModel + model: MLModel, + layerKeys: PocketTtsLayerKeys ) async throws -> (transformerOut: MLMultiArray, eosLogit: Float) { + guard let 
transformerKey = layerKeys.transformerOut, let eosKey = layerKeys.eosLogit + else { + throw PocketTTSError.processingFailed( + "flowlm_step layer keys missing transformer/eos outputs") + } + + let layers = layerKeys.layerCount + var inputDict: [String: Any] = [ "sequence": sequence, "bos_emb": bosEmb, ] - for i in 0.. MimiState { - let constantsDir = repoDirectory.appendingPathComponent(ModelNames.PocketTTS.constantsBinDir) - let stateDir = constantsDir.appendingPathComponent("mimi_init_state") - let manifestURL = constantsDir.appendingPathComponent("manifest.json") - - // Parse manifest for mimi_init_state shapes - let manifestData = try Data(contentsOf: manifestURL) - guard let manifest = try JSONSerialization.jsonObject(with: manifestData) as? [String: Any], - let mimiManifest = manifest["mimi_init_state"] as? [String: Any] - else { - throw PocketTTSError.processingFailed("Failed to parse mimi_init_state from manifest.json") - } - + /// Mimi's streaming state is "all-zero past" with two semantic exceptions: + /// the `*_first` boolean scalars must be 1.0 on the very first frame so + /// the decoder takes the cold-start convolution path. All other state + /// tensors (caches, partials, offsets) are zero-initialized. + /// + /// This replaces the previous manifest-driven `.bin` loader. The shapes + /// come from the CoreML model description itself, so v1 and v2 packs + /// (which differ in mimi attention cache layout) work uniformly. + static func loadMimiInitialState(schema: PocketTtsMimiSchema) throws -> MimiState { var tensors: [String: MLMultiArray] = [:] + let dtype = schema.stateInputDataType + let elementSize = mimiElementSize(for: dtype) - for (name, info) in mimiManifest { - guard let infoDict = info as? [String: Any], - let shapeArray = infoDict["shape"] as? [Int], - let byteCount = infoDict["bytes"] as? 
Int - else { - continue + for (inputName, _) in schema.stateMapping { + guard let shape = schema.stateInputShapes[inputName], !shape.isEmpty else { + throw PocketTTSError.processingFailed( + "Mimi state input `\(inputName)` has no shape in schema") } - let shape = shapeArray.map { NSNumber(value: $0) } - let array = try MLMultiArray(shape: shape, dataType: .float32) - - // Some tensors (e.g., res{0,1,2}_conv1_prev) have zero-length shapes - // and are empty pass-throughs — skip loading binary data for those. - if byteCount > 0 && !shapeArray.contains(0) { - let binURL = stateDir.appendingPathComponent("\(name).bin") - let data = try Data(contentsOf: binURL) - let floatCount = byteCount / MemoryLayout.size - let dstPtr = array.dataPointer.bindMemory(to: Float.self, capacity: floatCount) - data.withUnsafeBytes { rawBuffer in - let srcPtr = rawBuffer.bindMemory(to: Float.self) - dstPtr.update(from: srcPtr.baseAddress!, count: floatCount) + let totalCount = shape.reduce(1, *) + let nsShape = shape.map { NSNumber(value: $0) } + let array: MLMultiArray + if totalCount == 0 { + // CoreML's MLE5 input binder rejects buffers with NULL data + // pointers (which is what MLMultiArray returns for zero-element + // shapes like `[1, 128, 0]`). Allocate a 1-byte sentinel buffer + // and hand it to MLMultiArray via the dataPointer initializer + // so the model gets a valid non-NULL pointer. + let sentinel = UnsafeMutableRawPointer.allocate( + byteCount: max(elementSize, 1), alignment: 64) + memset(sentinel, 0, max(elementSize, 1)) + let strides = mimiContiguousStrides(for: shape).map { NSNumber(value: $0) } + array = try MLMultiArray( + dataPointer: sentinel, + shape: nsShape, + dataType: dtype, + strides: strides, + deallocator: { ptr in ptr.deallocate() }) + } else { + array = try MLMultiArray(shape: nsShape, dataType: dtype) + memset(array.dataPointer, 0, totalCount * elementSize) + // `*_first` scalars signal the first-frame cold-start path. 
+ if inputName.hasSuffix("_first") { + array[0] = NSNumber(value: Float(1)) } } - tensors[name] = array + tensors[inputName] = array } - // Ensure offset scalars exist - for key in ["attn0_offset", "attn0_end_offset", "attn1_offset", "attn1_end_offset"] { - if tensors[key] == nil { - let scalar = try MLMultiArray(shape: [1], dataType: .float32) - scalar[0] = NSNumber(value: Float(0)) - tensors[key] = scalar - } + return MimiState(tensors: tensors) + } + + /// Byte size of one element for the given MLMultiArray dtype. + private static func mimiElementSize(for dtype: MLMultiArrayDataType) -> Int { + switch dtype { + case .float16: return MemoryLayout.size + case .float32: return MemoryLayout.size + case .double: return MemoryLayout.size + case .int32: return MemoryLayout.size + @unknown default: return MemoryLayout.size } + } - return MimiState(tensors: tensors) + /// Row-major (C-contiguous) strides for the given shape, in elements. + private static func mimiContiguousStrides(for shape: [Int]) -> [Int] { + var strides = Array(repeating: 1, count: shape.count) + for i in stride(from: shape.count - 2, through: 0, by: -1) { + strides[i] = strides[i + 1] * max(shape[i + 1], 1) + } + return strides } /// Clone a Mimi state for independent use. @@ -98,22 +116,36 @@ extension PocketTtsSynthesizer { /// /// - Parameters: /// - latent: The raw latent vector, shape [32]. - /// - state: The streaming state (26 tensors), modified in place. + /// - state: The streaming state (24 or 26 tensors depending on + /// conversion vintage), modified in place. /// - model: The Mimi CoreML model. + /// - schema: I/O schema discovered at model-load time (audio output + /// name + state input ↔ output mapping). /// - Returns: Audio samples for this frame (1920 samples = 80ms at 24kHz). 
static func runMimiDecoder( latent: [Float], state: inout MimiState, - model: MLModel + model: MLModel, + schema: PocketTtsMimiSchema ) async throws -> [Float] { - // Create latent input: [1, 32] + // Create latent input: [1, 32] at the dtype the model expects. let latentDim = PocketTtsConstants.latentDim let latentArray = try MLMultiArray( - shape: [1, NSNumber(value: latentDim)], dataType: .float32) - let latentPtr = latentArray.dataPointer.bindMemory(to: Float.self, capacity: latentDim) - latent.withUnsafeBufferPointer { buffer in - guard let base = buffer.baseAddress else { return } - latentPtr.update(from: base, count: latentDim) + shape: [1, NSNumber(value: latentDim)], dataType: schema.latentDataType) + switch schema.latentDataType { + case .float32: + let dst = latentArray.dataPointer.bindMemory(to: Float.self, capacity: latentDim) + latent.withUnsafeBufferPointer { buf in + guard let base = buf.baseAddress else { return } + dst.update(from: base, count: latentDim) + } + case .float16: + let dst = latentArray.dataPointer.bindMemory(to: Float16.self, capacity: latentDim) + for i in 0..