diff --git a/Documentation/README.md b/Documentation/README.md index 540e33a06..135c833e1 100644 --- a/Documentation/README.md +++ b/Documentation/README.md @@ -40,6 +40,7 @@ - [Kokoro](TTS/Kokoro.md) - [Kokoro ANE (7-stage)](TTS/KokoroAne.md) - [PocketTTS](TTS/PocketTTS.md) +- [CosyVoice3 (Mandarin, beta)](TTS/CosyVoice3.md) - [SSML](TTS/SSML.md) - [Voice Quality Comparison](TTS/voice-quality.md) diff --git a/Documentation/TTS/CosyVoice3.md b/Documentation/TTS/CosyVoice3.md new file mode 100644 index 000000000..7308e2084 --- /dev/null +++ b/Documentation/TTS/CosyVoice3.md @@ -0,0 +1,246 @@ +# CosyVoice3 Swift Inference + +Mandarin zero-shot voice cloning via Qwen2 LM + CFM Flow + HiFT vocoder, +running on CoreML. + +> ⚠️ **Beta / experimental.** End-to-end synthesis is currently slow on +> Apple Silicon — RTFx < 1.0 typical, several seconds of latency for +> short Mandarin utterances. The slowdown is partly the Flow CFM stage +> (fp32, CPU-or-GPU only because fp16 + ANE produces NaNs through the +> fused `layer_norm` — CoreMLTools limitation, tracked upstream) and +> partly HiFT sinegen / windowing ops that fall back to CPU. May be a +> model issue, may be recoverable through better conversion. Treat +> performance numbers as preliminary; the Swift API, model layout, and +> prompt-asset format may change in subsequent releases without +> deprecation aliases. 
+ +## Files + +| File | Role | +|------|------| +| `CosyVoice3TtsManager.swift` | Public actor — `initialize()`, `synthesize()`, `synthesizeFromFixture()`, `loadVoice()`, `downloadAndCreate()` | +| `CosyVoice3Models.swift` | The 4 CoreML model handles (prefill, decode, flow, hift) | +| `Assets/CosyVoice3ModelStore.swift` | Loads + compiles the four mlpackages, probes flat / nested layouts | +| `Assets/CosyVoice3ResourceDownloader.swift` | HuggingFace pull for `FluidInference/CosyVoice3-0.5B-coreml` | +| `Pipeline/Synthesize/CosyVoice3Synthesizer.swift` | Actor — prefill → decode loop → Flow → HiFT | +| `Pipeline/Synthesize/CosyVoice3RasSampler.swift` | top-p / top-k / repetition mask, seed-tokens bypass | +| `Pipeline/Synthesize/CosyVoice3SpeechEmbeddings.swift` | mmap of 6761×896 fp16 speech-embedding table (12 MB) | +| `Pipeline/Synthesize/CosyVoice3Types.swift` | `CosyVoice3SynthesisOptions`, `CosyVoice3SynthesisResult`, `CosyVoice3ParityOptions` | +| `Pipeline/Preprocess/CosyVoice3TextFrontend.swift` | Special-token splitting + `lm_input_embeds` assembly | +| `Pipeline/Preprocess/Qwen2BpeTokenizer.swift` | tiktoken-compatible byte-level BPE, 151 936 vocab (incl. fileprivate `ByteEncoder` 188-symbol byte→unicode shim) | +| `Pipeline/Preprocess/CosyVoice3TextEmbeddings.swift` | mmap of 151 936×896 fp16 text embedding table | +| `Pipeline/Preprocess/CosyVoice3ChineseNormalizer.swift` | Minimal regex-free port of `frontend_utils.py` | +| `Pipeline/Preprocess/CosyVoice3PromptMel.swift` | 24 kHz 80-bin log-mel matching `matcha audio.py` | +| `Pipeline/Preprocess/CosyVoice3PromptAssets.swift` | Voice-prompt bundle DTO (precomputed IDs / mel / spk-emb) | +| `Pipeline/Preprocess/CosyVoice3FrontendFixture.swift` | Phase 1 parity-fixture loader | +| `CosyVoice3Constants.swift` | Stop-token range, hidden dim, frame counts, etc. 
| +| `Shared/SafetensorsReader.swift` | ~170 LoC pure-Swift mmap + fp16/fp32/i32 accessors | + +## Call Flow + +``` +CosyVoice3TtsManager.synthesize(text:promptAssets:options:) + | + v +CosyVoice3TextFrontend.assembleLmInput(text:promptAssets:) + | + |-- normalizeText() split on <|endofprompt|>, replace_blank, etc. + |-- Qwen2BpeTokenizer.encode byte-level BPE → token IDs + |-- text_embedding lookup 151 936×896 fp16 mmap → [N_text, 896] + |-- speech_embedding lookup 6761×896 fp16 mmap → [N_speech, 896] + |-- concat([SOS, text, TASK, prompt_speech_ids]) → lm_input_embeds + | + v +CosyVoice3Synthesizer.synthesize(lm_input_embeds:promptAssets:) + | + |-- runPrefill() Qwen2 24L prefill, T <= 256 + | |-- in: lm_input_embeds, attn_mask + | |-- out: logits[1,T,6761], kv_cache[24,1,2,768,64] fp16 + | + |-- DECODE LOOP (until stop-range hit or maxNewTokens): + | | + | |-- runDecodeStep() takes prev token + cached KV + | | |-- in: token_id, kv_cache (in-place state) + | | |-- out: logits[1,1,6761] + | | + | |-- RasSampler.sample() top-p/top-k/repetition + seed-tokens bypass + | |-- if topId in stopRange (6561...6760): break + | |-- decoded.append(topId) + | + |-- runFlow() CFM 10-step ODE, conditional on prompt mel + spk_emb + | |-- in: decoded[N], prompt_mel, spk_embedding + | |-- out: full_mel[1, 80, M] fp32 + | + |-- runHiFT() vocoder, chunk-packed (T<=500 frames) + | |-- in: full_mel slice from newMelStart..newMelStart+newMelFrames + | |-- out: audio samples [N*hop_len] @ 24 kHz + | + |-- concatenate chunks → CosyVoice3SynthesisResult.samples +``` + +## Public API + +```swift +import FluidAudio + +// One-shot creation that downloads everything to ~/.cache/fluidaudio/ +let manager = try await CosyVoice3TtsManager.downloadAndCreate( + computeUnits: .cpuAndNeuralEngine +) +try await manager.initialize() + +// Load a voice prompt bundle (precomputed by mobius/.../bootstrap_aishell3_voices.py) +let voice = try CosyVoice3PromptAssets.load(from: voiceBundleURL) + +let result = 
try await manager.synthesize( + text: "希望你以后能够做的比我还好用", + promptAssets: voice, + options: CosyVoice3SynthesisOptions(maxNewTokens: 1024, seed: 42) +) +// result.samples : [Float] (mono fp32, 24 kHz) +// result.sampleRate : 24000 +``` + +`CosyVoice3SynthesisOptions`: + +| Field | Default | Notes | +|---|---|---| +| `maxNewTokens` | `nil` (cap = 1024) | Hard ceiling on speech-token count | +| `seed` | 42 | Drives the RAS sampler RNG; reproducible runs | + +`CosyVoice3SynthesisResult`: + +| Field | Type | Notes | +|---|---|---| +| `samples` | `[Float]` | mono, fp32, range ~[-1.0, 1.0] | +| `sampleRate` | `Int` | always 24000 | +| `generatedTokenCount` | `Int` | tokens before EOS | +| `decodedTokens` | `[Int32]` | full speech token sequence (debug) | + +## Key State + +### KV cache (`kv_cache[24, 1, 2, 768, 64]` fp16) +- 24 transformer layers × batch × KV-heads (2) × sequence (768) × head-dim (64), packed into one `MLState`-style + `MLMultiArray` that the prefill produces and the decode loop both reads + and overwrites in-place. +- Reset per `synthesize()` call. + +### Prompt assets (`CosyVoice3PromptAssets`) +- `promptText` — Mandarin reference text (must contain `<|endofprompt|>`). +- `promptSpeechIds: [Int32]` — pre-tokenized speech IDs from the + SpeechTokenizerV3 mlpackage (computed offline, reused across calls). +- `promptMel: [Float]`, `promptMelFrames` — 80-bin log-mel of the reference + audio at 24 kHz. +- `spkEmbedding: [Float]` — 192-dim speaker embedding from CAMPPlus. + +Bundles are produced by +`mobius/models/tts/cosyvoice3/coreml/verify/bootstrap_aishell3_voices.py` +or `extract_voice_prompt.py` for arbitrary speakers. + +## CoreML details + +- **Compute units:** caller-chosen units apply to the + prefill + HiFT models only. LLM-Decode (stateful) and Flow are both + forced to `.cpuAndGPU` regardless — ANE refuses to compile the stateful + decode graph, and the Flow ANE port was reverted as numerically broken + (see `CosyVoice3ModelStore.loadIfNeeded()`). +- All four mlpackages are compiled `.mlpackage → .mlmodelc` on first load and + cached on disk under `~/.cache/fluidaudio/Models/cosyvoice3/`. 
+- `CosyVoice3ModelStore` is an actor; `CosyVoice3Synthesizer` is an + actor. `CosyVoice3Models` (the four-tuple) conforms to `Sendable` via + `@preconcurrency import CoreML`, matching the existing `TtsModels` + pattern. + +## Stop-token handling + +- Speech vocab is `0..<6761`; tokens `6561..<6761` are the EOS range. +- `CosyVoice3Constants.stopRange = 6561...6760` (closed range). The decode + loop breaks when `topId` falls in that range. +- If the prefill emits a stop token at step 0 the synthesizer throws + `CosyVoice3Error.predictionFailed` instead of falling through — + feeding the stop-token embedding into the decode loop would + accumulate semantically meaningless tokens. + +## CLI + +``` +fluidaudio tts --backend cosyvoice3 \ + --text "希望你以后能够做的比我还好用" \ + --models-dir ~/.cache/fluidaudio/Models/cosyvoice3 \ + --tokenizer-dir … --embeddings-file … --special-tokens-file … \ + --prompt-assets path/to/voice.safetensors \ + --output out.wav +``` + +`--backend cosyvoice3` (and the `cv3` alias) runs the production +text-driven synthesis path. `--backend` help text flags it as +`[BETA — slow, RTFx < 1.0]` and the dispatcher emits a runtime +`logger.warning` so the beta status shows up without reading docs. + +### Dev sub-backends (for debugging the Python ↔ Swift contract) + +These are the harnesses future contributors use to bisect divergence +between the Swift port and the upstream Python reference. 
Each isolates +a distinct stage of the pipeline: + +``` +fluidaudio tts --backend cosyvoice3-tokenizer-parity \ + --tokenizer-dir … --fixture tokenizer_fixture.json +# Qwen2 BPE encode/decode parity vs tiktoken reference + +fluidaudio tts --backend cosyvoice3-frontend-parity \ + --tokenizer-dir … --embeddings-file … \ + --fixture shipping.safetensors --tok-fixture … +# lm_input_embeds assembly parity (text+speech embed lookup, SOS/TASK splice) + +fluidaudio tts --backend cosyvoice3-parity \ + --fixture shipping.safetensors --models-dir build/ +# Phase 1 fixture parity (Synthesizer: prefill → decode → Flow → HiFT) +``` + +Recommended bisection order when end-to-end output diverges from +Python: tokenizer-parity → frontend-parity → fixture parity. + +The production backend auto-downloads its CoreML mlpackages, tokenizer, +embeddings, and default voice from HuggingFace on first synthesis (cached +under `~/.cache/fluidaudio/Models/cosyvoice3/`) — there is no separate +download CLI mode, matching how Kokoro and PocketTTS work. + +## Models + +| Component | mlpackage | Precision | Notes | +|---|---|---|---| +| Qwen2 LLM — Prefill (T=256, M=768) | `LLM-Prefill-T256-M768-fp16` | fp16 | KV-cache out | +| Qwen2 LLM — Decode (M=768) | `LLM-Decode-M768-fp16-stateful` | fp16 | stateful KV cache, in-place | +| CFM Flow (N=250 → M=500 mel) | `Flow-N250-fp16` | fp16 | CPU/GPU only | +| HiFT vocoder (T=500 → 10 s @ 24 kHz) | `HiFT-T500-fp16` | fp16 | sinegen on CPU | +| Qwen2 + speech embedding tables | `speech_embedding-fp16.safetensors` + `embeddings-runtime-fp32.safetensors` | fp16 / fp32 | mmap'd at runtime | + +All shipped at +[`FluidInference/CosyVoice3-0.5B-coreml`](https://huggingface.co/FluidInference/CosyVoice3-0.5B-coreml). +The conversion pipeline that produced them lives in +[FluidInference/mobius#42](https://github.com/FluidInference/mobius/pull/42). + +## Non-goals / known limits + +- **No on-device prompt-asset preparation.** SpeechTokenizerV3 and + CAMPPlus have CoreML mlpackages but the surrounding DSP isn't ported + to Swift yet. 
Callers either use the bundled + `cosyvoice3-default-zh` voice or run the Python `extract_voice_prompt.py` + offline. +- **No production-grade Mandarin TN.** `CosyVoice3ChineseNormalizer` + only mirrors the simple cleanups in upstream `frontend_utils.py`. + For year / currency / decimal / unit normalization, run + `wetext.ZhNormalizer` server-side and pass `prenormalized: true` on + `synthesize()`. +- **Flow stays fp32 (~1.2 GB).** Until CoreMLTools pins fused-`layer_norm` + fp16 the model NaNs on ANE. Loaded once, kept resident. +- **Streaming API not yet exposed.** The synthesizer runs Phase 1 + (prefill) and Phase 2 (Flow + HiFT) sequentially against the full + token sequence. Token streaming is internal but not surfaced through + an `AsyncStream`. + +## License + +- **CosyVoice3 model weights:** Apache 2.0, inherited from + [FunAudioLLM/CosyVoice](https://github.com/FunAudioLLM/CosyVoice) + upstream (`speech_300m`, `Fun-CosyVoice3-0.5B-2512`). +- **FluidAudio SDK:** Apache 2.0. diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 68d058731..437b0422f 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -29,6 +29,7 @@ public enum Repo: String, CaseIterable, Sendable { case qwen3AsrInt8 = "FluidInference/qwen3-asr-0.6b-coreml/int8" case multilingualG2p = "FluidInference/charsiu-g2p-byt5-coreml" case parakeetTdtCtc110m = "FluidInference/parakeet-tdt-ctc-110m-coreml" + case cosyvoice3 = "FluidInference/CosyVoice3-0.5B-coreml" case cohereTranscribeCoreml = "FluidInference/cohere-transcribe-03-2026-coreml/q8" /// Repository slug (without owner) @@ -82,6 +83,8 @@ public enum Repo: String, CaseIterable, Sendable { return "charsiu-g2p-byt5-coreml" case .parakeetTdtCtc110m: return "parakeet-tdt-ctc-110m-coreml" + case .cosyvoice3: + return "CosyVoice3-0.5B-coreml" case .cohereTranscribeCoreml: return "cohere-transcribe-03-2026-coreml/q8" } @@ -178,6 +181,8 @@ public enum Repo: String, 
CaseIterable, Sendable { return "parakeet-ja" case .parakeetTdtCtc110m: return "parakeet-tdt-ctc-110m" + case .cosyvoice3: + return "cosyvoice3" case .cohereTranscribeCoreml: return "cohere-transcribe/q8" default: @@ -596,6 +601,47 @@ public enum ModelNames { ] } + /// CosyVoice3 (Mandarin) model names. Files live on HuggingFace at + /// `FluidInference/CosyVoice3-0.5B-coreml` (see `Repo.cosyvoice3`). The + /// expected local directory layout is encoded in `CosyVoice3Constants.Files`. + public enum CosyVoice3 { + public static let llmPrefill = "LLM-Prefill-T256-M768-fp16" + public static let llmDecode = "LLM-Decode-M768-fp16-stateful" + public static let flow = "Flow-N250-fp16" + public static let hift = "HiFT-T500-fp16" + public static let speechEmbeddings = "speech_embedding-fp16.safetensors" + + public static let llmPrefillFile = llmPrefill + ".mlmodelc" + public static let llmDecodeFile = llmDecode + ".mlmodelc" + public static let flowFile = flow + ".mlmodelc" + public static let hiftFile = hift + ".mlmodelc" + + public static let requiredModels: Set = [ + llmPrefillFile, + llmDecodeFile, + flowFile, + hiftFile, + ] + + /// Sidecar assets living under subdirectories of the HF repo (not part + /// of `requiredModels`; pulled via `downloadSubdirectory` / direct file + /// fetch by `CosyVoice3ResourceDownloader`). 
+ public enum Sidecar { + public static let embeddingsDir = "embeddings" + public static let tokenizerDir = "tokenizer" + public static let voicesDir = "voices" + + public static let speechEmbeddings = "speech_embedding-fp16.safetensors" + public static let runtimeEmbeddings = "embeddings-runtime-fp32.safetensors" + public static let specialTokens = "special_tokens.json" + public static let vocab = "vocab.json" + public static let merges = "merges.txt" + public static let tokenizerConfig = "tokenizer_config.json" + + public static let defaultVoiceId = "cosyvoice3-default-zh" + } + } + /// Multilingual G2P (CharsiuG2P ByT5) model names public enum MultilingualG2P { public static let encoder = "MultilingualG2PEncoder" @@ -798,6 +844,8 @@ public enum ModelNames { return ModelNames.Qwen3ASR.requiredModelsFull case .multilingualG2p: return ModelNames.MultilingualG2P.requiredModels + case .cosyvoice3: + return ModelNames.CosyVoice3.requiredModels case .cohereTranscribeCoreml: return ModelNames.CohereTranscribe.requiredModels } diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift new file mode 100644 index 000000000..7051143f7 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift @@ -0,0 +1,186 @@ +@preconcurrency import CoreML +import Foundation + +/// Actor-based store for the four CosyVoice3 CoreML models. +/// +/// Two on-disk layouts are accepted: +/// +/// 1. **HuggingFace cache** (flat): `/.mlmodelc` (or +/// `.mlpackage`) at repo root, with `/embeddings/speech_embedding-fp16.safetensors`. +/// This is what `CosyVoice3ResourceDownloader` produces. +/// +/// 2. **Local mobius build dir**: `//.mlpackage` as +/// emitted by `models/tts/cosyvoice3/coreml/convert-coreml.py` (with +/// `llm-fp16/`, `flow-fp16-n250/`, `hift-fp16-t500/` subdirs). +/// +/// The store probes layout (1) first, then falls back to (2). 
CoreML +/// auto-compiles `.mlpackage` on first load and caches the compiled bundle on +/// disk. +public actor CosyVoice3ModelStore { + + private let logger = AppLogger(subsystem: "com.fluidaudio.tts", category: "CosyVoice3ModelStore") + + public nonisolated let directory: URL + private let computeUnits: MLComputeUnits + + private var loadedModels: CosyVoice3Models? + private var speechEmbeddingsURL: URL? + + /// - Parameters: + /// - directory: Base build directory that contains + /// `llm-fp16/`, `llm-fp16-stateful/`, `flow-fp16-n250/`, + /// `hift-fp16-t500/`, `embeddings/`. + /// - computeUnits: Defaults to `.cpuAndNeuralEngine`. Applied to + /// LLM-Prefill + HiFT models only. LLM-Decode (stateful) and Flow + /// both force `.cpuAndGPU` regardless (see `loadIfNeeded()`). + public init(directory: URL, computeUnits: MLComputeUnits = .cpuAndNeuralEngine) { + self.directory = directory + self.computeUnits = computeUnits + } + + /// Load all four CoreML models. Idempotent. + public func loadIfNeeded() async throws { + guard loadedModels == nil else { return } + + let config = MLModelConfiguration() + config.computeUnits = computeUnits + + let loadStart = Date() + logger.info("Loading CosyVoice3 CoreML models from \(directory.path)...") + + let prefillURL = try resolveModel( + subdir: CosyVoice3Constants.Files.llmPrefillSubdir, + baseName: ModelNames.CosyVoice3.llmPrefill) + let decodeURL = try resolveModel( + subdir: CosyVoice3Constants.Files.llmDecodeSubdir, + baseName: ModelNames.CosyVoice3.llmDecode) + let flowURL = try resolveModel( + subdir: CosyVoice3Constants.Files.flowSubdir, + baseName: ModelNames.CosyVoice3.flow) + let hiftURL = try resolveModel( + subdir: CosyVoice3Constants.Files.hiftSubdir, + baseName: ModelNames.CosyVoice3.hift) + let embeddingsURL = try resolveAsset( + subdir: CosyVoice3Constants.Files.speechEmbeddingsSubdir, + file: CosyVoice3Constants.Files.speechEmbeddings) + + let prefill = try await compileAndLoad(prefillURL, configuration: 
config) + logger.info("Loaded \(CosyVoice3Constants.Files.llmPrefill)") + + // Stateful decode MUST run on `.cpuAndGPU`: + // - ANE refuses to compile the stateful graph (same failure mode + // as Flow: `MILCompilerForANE ANECCompile() FAILED`), so + // `.cpuAndNE` / `.all` deadlock load + // - CPU-only works but is ~2× slower than the GPU path + // Ignore the user-supplied `computeUnits` for decode. + let decodeConfig = MLModelConfiguration() + decodeConfig.computeUnits = .cpuAndGPU + let decode = try await compileAndLoad(decodeURL, configuration: decodeConfig) + logger.info("Loaded \(CosyVoice3Constants.Files.llmDecode)") + + // Flow runs on `.cpuAndGPU` (fp16). An ANE-port attempt (BC1S + // rewrite: Linear→Conv2d(1×1), LayerNorm on axis=1, manual SDPA, + // pre-baked rotary sin/cos) produced a Flow that *compiled* and + // ran ~3× faster, but numerically broken: on the parity + // fixture the ANE graph collapses the mel dynamic range from + // [-12.5, +5.2] to [-10.1, -0.8] (MAE 2.58 vs PyTorch fp32; + // plan required <1e-3), yielding HiFT audio at ~40× lower peak + // amplitude — unintelligible to both CTC-ZH and Qwen3 ASR. + // Reverted to the cpuAndGPU fp16 baseline. See + // `coreml/TRIALS_AND_ERRORS.md` "Flow ANE port" for the full + // journey including the residual 77-op `conv_pos_embed` CPU + // island that may have been masking the dynamic-range + // compression introduced elsewhere in the BC1S rewrite. + // Ignore the user-supplied `computeUnits` for Flow; apply it to + // the LLM + HiFT models only. 
+ let flowConfig = MLModelConfiguration() + flowConfig.computeUnits = .cpuAndGPU + let flow = try await compileAndLoad(flowURL, configuration: flowConfig) + logger.info("Loaded \(CosyVoice3Constants.Files.flow)") + + let hift = try await compileAndLoad(hiftURL, configuration: config) + logger.info("Loaded \(CosyVoice3Constants.Files.hift)") + + loadedModels = CosyVoice3Models(prefill: prefill, decode: decode, flow: flow, hift: hift) + speechEmbeddingsURL = embeddingsURL + + let elapsed = Date().timeIntervalSince(loadStart) + logger.info("All CosyVoice3 models loaded in \(String(format: "%.2f", elapsed))s") + } + + public func models() throws -> CosyVoice3Models { + guard let models = loadedModels else { + throw CosyVoice3Error.notInitialized + } + return models + } + + public func speechEmbeddingsFileURL() throws -> URL { + guard let url = speechEmbeddingsURL else { + throw CosyVoice3Error.notInitialized + } + return url + } + + // MARK: - Helpers + + /// Resolve a CoreML model accepting either `.mlmodelc` or `.mlpackage` + /// extensions and both layouts: flat (HF) or subdir (local build). + private func resolveModel(subdir: String, baseName: String) throws -> URL { + let candidates: [URL] = [ + // HF flat layout prefers the precompiled .mlmodelc. + directory.appendingPathComponent("\(baseName).mlmodelc"), + directory.appendingPathComponent("\(baseName).mlpackage"), + // Local build layout (mobius convert-coreml.py output). + directory.appendingPathComponent(subdir).appendingPathComponent("\(baseName).mlmodelc"), + directory.appendingPathComponent(subdir).appendingPathComponent("\(baseName).mlpackage"), + ] + for url in candidates where FileManager.default.fileExists(atPath: url.path) { + return url + } + let probed = candidates.map { $0.path }.joined(separator: ", ") + throw CosyVoice3Error.modelFileNotFound(probed) + } + + /// Resolve a plain sidecar file (e.g. `speech_embedding-fp16.safetensors`). + /// Probes `//` then `/`. 
+ private func resolveAsset(subdir: String, file: String) throws -> URL { + let candidates: [URL] = [ + directory.appendingPathComponent(subdir).appendingPathComponent(file), + directory.appendingPathComponent(file), + ] + for url in candidates where FileManager.default.fileExists(atPath: url.path) { + return url + } + let probed = candidates.map { $0.path }.joined(separator: ", ") + throw CosyVoice3Error.modelFileNotFound(probed) + } + + /// Compile an .mlpackage to .mlmodelc (cached in a persistent temp dir + /// next to the original package) and load it. Skips compilation if an + /// already-compiled .mlmodelc exists next to the package. + private func compileAndLoad( + _ url: URL, + configuration: MLModelConfiguration + ) async throws -> MLModel { + if url.pathExtension == "mlmodelc" { + return try MLModel(contentsOf: url, configuration: configuration) + } + let base = url.deletingPathExtension().lastPathComponent + let compiledName = base + ".mlmodelc" + let cached = url.deletingLastPathComponent().appendingPathComponent(compiledName) + if FileManager.default.fileExists(atPath: cached.path) { + return try MLModel(contentsOf: cached, configuration: configuration) + } + let compiledURL = try await MLModel.compileModel(at: url) + // Move into place next to the package so subsequent loads are fast. + try? FileManager.default.removeItem(at: cached) + do { + try FileManager.default.moveItem(at: compiledURL, to: cached) + return try MLModel(contentsOf: cached, configuration: configuration) + } catch { + // If the move fails (e.g. cross-device), load from the temp URL. 
+ return try MLModel(contentsOf: compiledURL, configuration: configuration) + } + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift new file mode 100644 index 000000000..7359ddd43 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift @@ -0,0 +1,218 @@ +import Foundation + +/// Pulls CosyVoice3 CoreML models + runtime assets from the +/// `FluidInference/CosyVoice3-0.5B-coreml` HuggingFace repo. +/// +/// Layout produced on disk (relative to `ensureCoreModels(...)`'s return URL): +/// +/// ``` +/// / +/// ├── LLM-Prefill-T256-M768-fp16.mlmodelc/ +/// ├── LLM-Decode-M768-fp16.mlmodelc/ +/// ├── Flow-N250-fp16.mlmodelc/ +/// ├── HiFT-T500-fp16.mlmodelc/ +/// ├── embeddings/ +/// │ ├── speech_embedding-fp16.safetensors +/// │ └── embeddings-runtime-fp32.safetensors (text-mode only) +/// ├── tokenizer/ +/// │ ├── vocab.json, merges.txt, tokenizer_config.json, special_tokens.json +/// └── voices/ +/// ├── cosyvoice3-default-zh.safetensors + .json (default voice, eager) +/// └── .safetensors + .json (optional, on-demand) +/// ``` +public enum CosyVoice3ResourceDownloader { + + private static let logger = AppLogger( + subsystem: "com.fluidaudio.tts", category: "CosyVoice3ResourceDownloader") + + /// Path bundle produced by `ensureTextFrontendAssets`. + public struct TextFrontendPaths: Sendable { + public let tokenizerDirectory: URL + public let runtimeEmbeddingsFile: URL + public let specialTokensFile: URL + } + + // MARK: - Core models + speech embedding table + + /// Ensure the four `.mlmodelc` bundles and `speech_embedding-fp16.safetensors` + /// are cached locally. Returns the repository root directory. + /// + /// - Parameters: + /// - directory: Optional base cache dir. When `nil`, defaults to + /// `~/.cache/fluidaudio` (macOS) or `Caches/fluidaudio` (iOS). 
+ /// - progressHandler: Forwarded to `DownloadUtils.downloadRepo`. + @discardableResult + public static func ensureCoreModels( + directory: URL? = nil, + progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> URL { + let targetDir = try directory ?? cacheDirectory() + let modelsDirectory = targetDir.appendingPathComponent( + CosyVoice3Constants.defaultModelsSubdirectory) + let repoDir = modelsDirectory.appendingPathComponent(Repo.cosyvoice3.folderName) + + // 1. Fetch the four .mlmodelc bundles via the standard repo downloader. + let modelsPresent = ModelNames.CosyVoice3.requiredModels.allSatisfy { name in + FileManager.default.fileExists( + atPath: repoDir.appendingPathComponent(name).path) + } + if !modelsPresent { + logger.info("Downloading CosyVoice3 .mlmodelc bundles from HuggingFace...") + try await DownloadUtils.downloadRepo( + .cosyvoice3, + to: modelsDirectory, + progressHandler: progressHandler) + } else { + logger.info("CosyVoice3 .mlmodelc bundles found in cache") + } + + // 2. Fetch the small speech-embedding table (sidecar, not a model). + _ = try await ensureSidecarFile( + subdir: ModelNames.CosyVoice3.Sidecar.embeddingsDir, + name: ModelNames.CosyVoice3.Sidecar.speechEmbeddings, + repoDirectory: repoDir, + description: "CosyVoice3 speech embedding table") + + return repoDir + } + + // MARK: - Text-mode assets (tokenizer + 542 MB runtime embeddings) + + /// Ensure tokenizer assets + `embeddings-runtime-fp32.safetensors` are on + /// disk. Only required when using `CosyVoice3TtsManager.synthesize(text:…)`; + /// fixture-mode callers may skip this. + public static func ensureTextFrontendAssets( + repoDirectory: URL + ) async throws -> TextFrontendPaths { + // Tokenizer subdirectory: vocab.json + merges.txt + special_tokens.json + // + tokenizer_config.json. `downloadSubdirectory` walks the tree and + // skips files already on disk. 
+ let tokenizerDir = repoDirectory.appendingPathComponent( + ModelNames.CosyVoice3.Sidecar.tokenizerDir) + let tokenizerRequired = [ + ModelNames.CosyVoice3.Sidecar.vocab, + ModelNames.CosyVoice3.Sidecar.merges, + ModelNames.CosyVoice3.Sidecar.specialTokens, + ] + let tokenizerPresent = tokenizerRequired.allSatisfy { name in + FileManager.default.fileExists( + atPath: tokenizerDir.appendingPathComponent(name).path) + } + if !tokenizerPresent { + logger.info("Downloading CosyVoice3 tokenizer assets…") + try await DownloadUtils.downloadSubdirectory( + .cosyvoice3, + subdirectory: ModelNames.CosyVoice3.Sidecar.tokenizerDir, + to: repoDirectory) + } + + // Runtime text-embedding table (542 MB). Pulled as a file download so + // it never has to sit in RAM during transfer. + let runtimeEmbeddings = try await ensureSidecarFile( + subdir: ModelNames.CosyVoice3.Sidecar.embeddingsDir, + name: ModelNames.CosyVoice3.Sidecar.runtimeEmbeddings, + repoDirectory: repoDirectory, + description: "CosyVoice3 runtime text embedding table (542 MB)") + + return TextFrontendPaths( + tokenizerDirectory: tokenizerDir, + runtimeEmbeddingsFile: runtimeEmbeddings, + specialTokensFile: tokenizerDir.appendingPathComponent( + ModelNames.CosyVoice3.Sidecar.specialTokens)) + } + + // MARK: - Voice bundles + + /// Ensure the requested zero-shot voice bundle (`.safetensors` + + /// `.json`) is cached. Returns the `.safetensors` URL that + /// `CosyVoice3PromptAssets.load(from:)` expects — the loader derives the + /// `.json` sidecar path from it. 
+ @discardableResult + public static func ensureVoice( + voiceId: String = ModelNames.CosyVoice3.Sidecar.defaultVoiceId, + repoDirectory: URL + ) async throws -> URL { + let sanitized = voiceId.filter { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" } + guard !sanitized.isEmpty, sanitized == voiceId else { + throw CosyVoice3Error.invalidShape("invalid voice id: \(voiceId)") + } + + let voicesDir = repoDirectory.appendingPathComponent( + ModelNames.CosyVoice3.Sidecar.voicesDir) + try FileManager.default.createDirectory( + at: voicesDir, withIntermediateDirectories: true) + + let tensorsURL = voicesDir.appendingPathComponent("\(voiceId).safetensors") + let metadataURL = voicesDir.appendingPathComponent("\(voiceId).json") + + for (local, remoteName, desc) in [ + (tensorsURL, "\(voiceId).safetensors", "voice tensors"), + (metadataURL, "\(voiceId).json", "voice metadata"), + ] { + if FileManager.default.fileExists(atPath: local.path) { continue } + let remotePath = "\(ModelNames.CosyVoice3.Sidecar.voicesDir)/\(remoteName)" + let remoteURL = try ModelRegistry.resolveModel( + Repo.cosyvoice3.remotePath, remotePath) + let descriptor = AssetDownloader.Descriptor( + description: "\(voiceId) \(desc)", + remoteURL: remoteURL, + destinationURL: local, + transferMode: .file()) + _ = try await AssetDownloader.ensure(descriptor, logger: logger) + } + + return tensorsURL + } + + // MARK: - Helpers + + private static func ensureSidecarFile( + subdir: String, + name: String, + repoDirectory: URL, + description: String + ) async throws -> URL { + let localDir = repoDirectory.appendingPathComponent(subdir) + try FileManager.default.createDirectory( + at: localDir, withIntermediateDirectories: true) + let localURL = localDir.appendingPathComponent(name) + if FileManager.default.fileExists(atPath: localURL.path) { + return localURL + } + let remotePath = "\(subdir)/\(name)" + let remoteURL = try ModelRegistry.resolveModel( + Repo.cosyvoice3.remotePath, remotePath) + let descriptor 
= AssetDownloader.Descriptor( + description: description, + remoteURL: remoteURL, + destinationURL: localURL, + transferMode: .file()) + return try await AssetDownloader.ensure(descriptor, logger: logger) + } + + /// `~/.cache/fluidaudio` (macOS) / `Caches/fluidaudio` (iOS) — matches the + /// convention used by `TtsResourceDownloader` and `PocketTtsResourceDownloader`. + private static func cacheDirectory() throws -> URL { + let baseDirectory: URL + #if os(macOS) + baseDirectory = FileManager.default.homeDirectoryForCurrentUser + .appendingPathComponent(".cache") + #else + guard + let first = FileManager.default.urls( + for: .cachesDirectory, in: .userDomainMask + ).first + else { + throw CosyVoice3Error.invalidShape("failed to locate caches directory") + } + baseDirectory = first + #endif + + let cacheDirectory = baseDirectory.appendingPathComponent("fluidaudio") + if !FileManager.default.fileExists(atPath: cacheDirectory.path) { + try FileManager.default.createDirectory( + at: cacheDirectory, withIntermediateDirectories: true) + } + return cacheDirectory + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift new file mode 100644 index 000000000..b0a46f935 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift @@ -0,0 +1,78 @@ +import Foundation + +/// Central constants for the CosyVoice3 (Mandarin) CoreML pipeline. +/// +/// Shipping config (frozen): +/// - LLM-Prefill-T256-M768-fp16 (cpuAndNeuralEngine) +/// - LLM-Decode-M768-fp16-stateful (cpuAndGPU — see note) +/// - Flow-N250-fp16 (cpuAndGPU — an ANE-port +/// BC1S rewrite was attempted and reverted: the converted graph ran +/// ~3× faster but numerically broken (mel dynamic range collapsed +/// from [-12.5, +5.2] to [-10.1, -0.8], MAE 2.58 vs fp32 reference, +/// yielding HiFT audio at ~40× lower peak amplitude → unintelligible +/// to ASR). 
See `coreml/TRIALS_AND_ERRORS.md` "Flow ANE port" for +/// the full journey, including the residual 77-op CPU island in +/// `input_embed.conv_pos_embed` (`Conv1d(1024,1024,k=31)+Mish`) +/// that three rewrite attempts couldn't move — ANEF rejects the +/// conv footprint regardless of group count.) +/// - HiFT-T500-fp16 (cpuAndNeuralEngine) +/// +/// The stateful decode model uses per-layer `MLState` buffers for the +/// KV cache (48 tensors, `[1, 2, 768, 64]` fp16 each) instead of +/// round-tripping 18 MB of kv_k / kv_v MLMultiArrays every step. ANE +/// refuses to compile the stateful graph (`MILCompilerForANE +/// ANECCompile() FAILED`); decode therefore runs on `.cpuAndGPU`. +/// Requires macOS 15 / iOS 18. +public enum CosyVoice3Constants { + + // MARK: - LLM shapes + public static let prefillLength = 256 + public static let kvMaxLength = 768 + public static let embedDim = 896 + public static let numLayers = 24 + public static let kvHeads = 2 + public static let headDim = 64 + + // MARK: - Flow / HiFT shapes + public static let flowTotalTokens = 250 + public static let tokenMelRatio = 2 + public static let hiftMaxFrames = 500 + public static let hiftSamplesPerFrame = 480 + public static let sampleRate = 24_000 + public static let melBins = 80 + public static let speakerEmbeddingDim = 192 + + // MARK: - Speech token vocab + public static let speechVocab = 6_761 + public static let speechTokenSize = 6_561 + public static let sosId: Int32 = 6_561 + public static let eosId: Int32 = 6_562 + public static let taskId: Int32 = 6_563 + /// Any token id in this range is treated as a stop signal. 
+    // NOTE(review): the generic parameter was lost to extraction damage
+    // (`ClosedRange` with no `<Bound>` does not compile against a literal range
+    // annotation here). `Int32` matches `sosId`/`eosId`/`taskId` above and the
+    // declared speech-token id type — confirm against the sampler's token type.
+    public static let stopRange: ClosedRange<Int32> = 6_561...6_760
+
+    // MARK: - Sampler
+    public static let topP: Float = 0.8
+    public static let topK: Int = 25
+    public static let rasWindow: Int = 10
+    public static let rasTauR: Float = 0.1
+
+    // MARK: - Cache layout
+    /// Subdirectory under the shared `~/.cache/fluidaudio/` (or iOS Caches) dir
+    /// where every TTS backend stores its HF-mirrored models.
+    public static let defaultModelsSubdirectory = "Models"
+
+    // MARK: - Files (local build dir layout)
+    public enum Files {
+        public static let llmPrefill = "LLM-Prefill-T256-M768-fp16.mlpackage"
+        public static let llmPrefillSubdir = "llm-fp16"
+        public static let llmDecode = "LLM-Decode-M768-fp16-stateful.mlpackage"
+        public static let llmDecodeSubdir = "llm-fp16-stateful"
+        public static let flow = "Flow-N250-fp16.mlpackage"
+        public static let flowSubdir = "flow-fp16-n250"
+        public static let hift = "HiFT-T500-fp16.mlpackage"
+        public static let hiftSubdir = "hift-fp16-t500"
+        public static let speechEmbeddings = "speech_embedding-fp16.safetensors"
+        public static let speechEmbeddingsSubdir = "embeddings"
+    }
+}
diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Error.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Error.swift
new file mode 100644
index 000000000..0ebe782f5
--- /dev/null
+++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Error.swift
@@ -0,0 +1,37 @@
+import Foundation
+
+/// Errors surfaced by the CosyVoice3 Swift pipeline.
+public enum CosyVoice3Error: LocalizedError, Sendable {
+    case notInitialized
+    case modelFileNotFound(String)
+    case invalidFixture(String)
+    case invalidSafetensors(String)
+    case prefillTooLong(Int)
+    case sequenceTooLong(Int)
+    case predictionFailed(String)
+    case embeddingTableMissing(String)
+    case invalidShape(String)
+
+    public var errorDescription: String? {
+        switch self {
+        case .notInitialized:
+            return "CosyVoice3 pipeline not initialized — call loadIfNeeded() first."
+ case .modelFileNotFound(let path): + return "CosyVoice3 model file not found at: \(path)" + case .invalidFixture(let reason): + return "Invalid CosyVoice3 fixture: \(reason)" + case .invalidSafetensors(let reason): + return "Invalid safetensors file: \(reason)" + case .prefillTooLong(let length): + return "Prefill sequence length \(length) exceeds max \(CosyVoice3Constants.prefillLength)" + case .sequenceTooLong(let length): + return "KV cache length \(length) exceeds max \(CosyVoice3Constants.kvMaxLength)" + case .predictionFailed(let stage): + return "CosyVoice3 prediction failed at stage: \(stage)" + case .embeddingTableMissing(let name): + return "CosyVoice3 embedding table missing: \(name)" + case .invalidShape(let detail): + return "CosyVoice3 shape mismatch: \(detail)" + } + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Models.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Models.swift new file mode 100644 index 000000000..b608bdbfc --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Models.swift @@ -0,0 +1,23 @@ +@preconcurrency import CoreML +import Foundation + +/// Four CoreML models for the CosyVoice3 inference pipeline. +/// +/// `Sendable` conformance leans on `@preconcurrency import CoreML` (same +/// pattern as `TtsModels`). `MLModel` is reference-typed but its predict +/// surface is internally synchronized, and these instances are only handed +/// to actors that own them for their lifetime, so crossing actor isolation +/// is safe in practice. 
+public struct CosyVoice3Models: Sendable { + public let prefill: MLModel + public let decode: MLModel + public let flow: MLModel + public let hift: MLModel + + public init(prefill: MLModel, decode: MLModel, flow: MLModel, hift: MLModel) { + self.prefill = prefill + self.decode = decode + self.flow = flow + self.hift = hift + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3TtsManager.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3TtsManager.swift new file mode 100644 index 000000000..d71f3ea67 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3TtsManager.swift @@ -0,0 +1,314 @@ +@preconcurrency import CoreML +import Foundation + +/// Public entry point for the CosyVoice3 (Mandarin) TTS pipeline. +/// +/// > Important: **Experimental / beta.** This backend ships as an early port +/// > and end-to-end synthesis is currently **slow** on Apple Silicon — +/// > expect well below real-time (RTFx < 1.0) on M-series GPUs and several +/// > seconds of latency for short Mandarin utterances. The slowdown is +/// > primarily in the Flow CFM stage, which is fp32/CPU-or-GPU only because +/// > fp16 + ANE produces NaNs through the fused `layer_norm` (CoreMLTools +/// > limitation; tracked upstream). The HiFT vocoder also has ~12 sinegen / +/// > windowing ops that fall back to CPU. We do not yet know whether the +/// > residual cost is fundamental to the model or recoverable through better +/// > conversion — treat performance numbers as preliminary. The Swift API, +/// > model layout, and prompt-asset format may change in subsequent +/// > releases without deprecation aliases. +/// +/// Two synthesis paths are exposed: +/// +/// 1. `synthesizeFromFixture` — Phase 1 parity harness that replays a +/// Python-generated fixture against the Swift CoreML pipeline. +/// +/// 2. `synthesize(text:promptAssets:)` — Phase 2 text-driven synthesis. 
The +/// user supplies a Mandarin `text` plus a `CosyVoice3PromptAssets` bundle +/// (precomputed `llm_prompt_speech_ids`, `prompt_mel`, `spk_embedding`, +/// plus the prompt text containing `<|endofprompt|>`). The manager +/// tokenizes with the on-device Qwen2 BPE tokenizer, assembles +/// `lm_input_embeds` from the mmap'd runtime embedding tables, and runs +/// prefill → decode → Flow → HiFT exactly like the fixture path. +/// +/// Text-mode requires three extra resources that must be provided at init: +/// - `tokenizerDirectory`: HuggingFace Qwen2 assets (`vocab.json` + `merges.txt`). +/// - `textEmbeddingsFile`: `embeddings-runtime-fp32.safetensors` produced by +/// `mobius/.../verify/export_runtime_embeddings.py`. Contains Qwen2 +/// `text_embedding` and CosyVoice3 `speech_embedding` rows at runtime dtype. +/// - `specialTokensFile`: JSON map `{"<|endofprompt|>": 151646, ...}` covering +/// the 281 runtime-added special tokens (CosyVoice3Tokenizer). Same format +/// that `tokenizer_fixture.json` dumps under its `special_tokens` key. +/// +/// > Note: Gated to macOS 15 / iOS 18 because the underlying +/// > `CosyVoice3Synthesizer` uses CoreML `MLState` for the decode KV cache. +/// > Other FluidAudio modules (ASR, Diarization, VAD, Kokoro, PocketTTS) +/// > remain available on macOS 14 / iOS 17. +@available(macOS 15, iOS 18, *) +public actor CosyVoice3TtsManager { + + private let logger = AppLogger(subsystem: "com.fluidaudio.tts", category: "CosyVoice3TtsManager") + + private let store: CosyVoice3ModelStore + private let tokenizerDirectory: URL? + private let textEmbeddingsFile: URL? + private let specialTokensFile: URL? + + private var synthesizer: CosyVoice3Synthesizer? + private var textFrontend: CosyVoice3TextFrontend? + + /// Fixture-only (Phase 1) constructor. 
+    public init(directory: URL, computeUnits: MLComputeUnits = .cpuAndNeuralEngine) {
+        self.store = CosyVoice3ModelStore(directory: directory, computeUnits: computeUnits)
+        // Frontend resources deliberately stay nil: `initialize()` then skips
+        // building the text frontend, so text-mode `synthesize(text:...)` will
+        // throw `.notInitialized`; only `synthesizeFromFixture` is usable.
+        self.tokenizerDirectory = nil
+        self.textEmbeddingsFile = nil
+        self.specialTokensFile = nil
+    }
+
+    /// Text-mode (Phase 2) constructor. Pass `modelsDirectory` plus the three
+    /// tokenizer-frontend resources. `synthesizeFromFixture` still works
+    /// without initializing the frontend.
+    public init(
+        modelsDirectory: URL,
+        tokenizerDirectory: URL,
+        textEmbeddingsFile: URL,
+        specialTokensFile: URL,
+        computeUnits: MLComputeUnits = .cpuAndNeuralEngine
+    ) {
+        // Same store as the fixture-only init; the three URLs are consumed
+        // lazily by `initialize()` when it builds the text frontend.
+        self.store = CosyVoice3ModelStore(directory: modelsDirectory, computeUnits: computeUnits)
+        self.tokenizerDirectory = tokenizerDirectory
+        self.textEmbeddingsFile = textEmbeddingsFile
+        self.specialTokensFile = specialTokensFile
+    }
+
+    /// Convenience factory that downloads all required assets from HuggingFace
+    /// (`FluidInference/CosyVoice3-0.5B-coreml`) into the shared FluidAudio
+    /// cache, then returns a text-mode–ready manager.
+    ///
+    /// - Parameters:
+    ///   - cacheDirectory: Optional override for the base cache root. When
+    ///     `nil`, uses `~/.cache/fluidaudio` (macOS) or the app Caches dir
+    ///     (iOS) — the same location every other FluidAudio TTS backend uses.
+    ///   - includeDefaultVoice: When `true` (default), also fetches the
+    ///     upstream `cosyvoice3-default-zh` voice bundle so the first
+    ///     `synthesize(...)` call works without any additional downloads.
+    ///   - computeUnits: CoreML compute units for LLM + HiFT. Flow is forced
+    ///     to CPU+GPU regardless (fp32 graph, ANE would NaN on fused LN).
+    ///   - progressHandler: Forwarded to the HF downloader for UI updates.
+    /// - Returns: An uninitialized manager; the caller must still invoke
+    ///   `initialize()` to compile + load models. A download of ~5.8 GB occurs
+    ///   on first run; subsequent runs are cache hits.
+ public static func downloadAndCreate( + cacheDirectory: URL? = nil, + includeDefaultVoice: Bool = true, + computeUnits: MLComputeUnits = .cpuAndNeuralEngine, + progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> CosyVoice3TtsManager { + let repoDir = try await CosyVoice3ResourceDownloader.ensureCoreModels( + directory: cacheDirectory, progressHandler: progressHandler) + let frontend = try await CosyVoice3ResourceDownloader.ensureTextFrontendAssets( + repoDirectory: repoDir) + if includeDefaultVoice { + _ = try await CosyVoice3ResourceDownloader.ensureVoice( + repoDirectory: repoDir) + } + return CosyVoice3TtsManager( + modelsDirectory: repoDir, + tokenizerDirectory: frontend.tokenizerDirectory, + textEmbeddingsFile: frontend.runtimeEmbeddingsFile, + specialTokensFile: frontend.specialTokensFile, + computeUnits: computeUnits) + } + + /// Ensure the given voice id (e.g. `"cosyvoice3-default-zh"` or an + /// `aishell3-zh-SSB####-{female|male}` id) is cached locally, and return + /// the loaded prompt bundle ready to pass into `synthesize(text:promptAssets:)`. + public func loadVoice( + _ voiceId: String = ModelNames.CosyVoice3.Sidecar.defaultVoiceId + ) async throws -> CosyVoice3PromptAssets { + let tensorsURL = try await CosyVoice3ResourceDownloader.ensureVoice( + voiceId: voiceId, + repoDirectory: modelsDirectory) + return try CosyVoice3PromptAssets.load(from: tensorsURL) + } + + /// Repo root directory (cache location after `downloadAndCreate(...)`). + /// Pass this to `CosyVoice3ResourceDownloader.ensureVoice(voiceId:repoDirectory:)` + /// when fetching additional voice bundles on demand. + public nonisolated var modelsDirectory: URL { + store.directory + } + + /// Load all four CoreML models + (if configured) the text frontend. + /// Idempotent. + public func initialize() async throws { + if synthesizer == nil { + logger.warning( + "CosyVoice3 is experimental / beta. 
Synthesis is currently slow " + + "(RTFx < 1.0 typical) — see CosyVoice3TtsManager docs.") + try await store.loadIfNeeded() + let models = try await store.models() + let embeddingsURL = try await store.speechEmbeddingsFileURL() + let embeddings = try CosyVoice3SpeechEmbeddings(url: embeddingsURL) + self.synthesizer = CosyVoice3Synthesizer(models: models, embeddings: embeddings) + logger.info("CosyVoice3 synthesizer ready") + } + if textFrontend == nil, + let tokDir = tokenizerDirectory, + let embURL = textEmbeddingsFile, + let specURL = specialTokensFile + { + let tokStart = Date() + let specialTokens = try Self.loadSpecialTokens(url: specURL) + let tokenizer = try Qwen2BpeTokenizer.load( + directory: tokDir, specialTokens: specialTokens) + let textEmbeddings = try CosyVoice3TextEmbeddings(url: embURL) + self.textFrontend = CosyVoice3TextFrontend( + tokenizer: tokenizer, embeddings: textEmbeddings) + logger.info( + "CosyVoice3 text frontend ready in \(String(format: "%.2fs", Date().timeIntervalSince(tokStart)))" + ) + } + } + + /// Phase 1 parity entry point. + public func synthesizeFromFixture( + fixtureURL: URL, + options: CosyVoice3ParityOptions = CosyVoice3ParityOptions() + ) async throws -> CosyVoice3SynthesisResult { + guard let synthesizer = synthesizer else { + throw CosyVoice3Error.notInitialized + } + let fixture = try CosyVoice3FrontendFixture.load(from: fixtureURL) + return try await synthesizer.synthesize(fixture: fixture, options: options) + } + + /// Phase 2 text-driven synthesis. + /// + /// - Parameters: + /// - text: Mandarin (or mixed) input text. + /// - promptAssets: Bundle with prompt text + precomputed speech prompt + /// tokens + prompt mel + speaker embedding. + /// - options: Sampling / seed controls. `replayDecodedTokens` must be + /// `false` in text mode (the default here). + /// - prenormalized: When `true`, skip the built-in minimal Chinese + /// normalizer and feed `text` straight to the tokenizer. 
Set this if + /// you've already run wetext (or equivalent) server-side. + public func synthesize( + text: String, + promptAssets: CosyVoice3PromptAssets, + options: CosyVoice3SynthesisOptions = CosyVoice3SynthesisOptions(), + prenormalized: Bool = false + ) async throws -> CosyVoice3SynthesisResult { + guard let synthesizer = synthesizer else { + throw CosyVoice3Error.notInitialized + } + guard let frontend = textFrontend else { + throw CosyVoice3Error.notInitialized + } + + // Skip normalization if the caller set `prenormalized`, if the input + // contains SSML-ish markers (mirrors Python's `'<|' in text and '|>'` + // bypass), or if there are no CJK characters at all. + let ssmlLike = text.contains("<|") && text.contains("|>") + let normalized: String + if prenormalized || ssmlLike || !CosyVoice3ChineseNormalizer.containsChinese(text) { + normalized = text + } else { + normalized = CosyVoice3ChineseNormalizer.normalize(text) + } + + let assembled = try frontend.assemble( + promptText: promptAssets.promptText, + ttsText: normalized, + promptSpeechIds: promptAssets.promptSpeechIds) + + let lmInputEmbedsFlat = try Self.flattenLmEmbeds( + assembled.lmInputEmbeds, tPre: assembled.tPre) + + // Build an in-memory fixture adapter so we can reuse the Phase 1 + // synthesize(fixture:) path without a second code path. 
+        let fixture = CosyVoice3FrontendFixture(
+            lmInputEmbeds: lmInputEmbedsFlat,
+            tPre: assembled.tPre,
+            promptSpeechIds: promptAssets.promptSpeechIds,
+            promptMel: promptAssets.promptMel,
+            promptMelFrames: promptAssets.promptMelFrames,
+            spkEmbedding: promptAssets.spkEmbedding,
+            decodedTokens: [],
+            seed: Int32(truncatingIfNeeded: options.seed),
+            numPromptMel: 0,
+            audioLengthSamples: 0)
+
+        let parityOptions = CosyVoice3ParityOptions(
+            maxNewTokens: options.maxNewTokens,
+            seed: options.seed,
+            replayDecodedTokens: false)
+
+        return try await synthesizer.synthesize(fixture: fixture, options: parityOptions)
+    }
+
+    // MARK: - Helpers
+
+    /// Flatten `[1, tPre, 896]` MLMultiArray fp32 into `[tPre * 896]` Float,
+    /// honoring non-compact strides.
+    private static func flattenLmEmbeds(
+        _ array: MLMultiArray, tPre: Int
+    ) throws -> [Float] {
+        guard
+            array.dataType == .float32,
+            array.shape.count == 3,
+            array.shape[0].intValue == 1,
+            array.shape[1].intValue == tPre,
+            array.shape[2].intValue == CosyVoice3Constants.embedDim
+        else {
+            throw CosyVoice3Error.invalidShape(
+                "lmInputEmbeds expects [1, \(tPre), \(CosyVoice3Constants.embedDim)] fp32, got shape=\(array.shape) dtype=\(array.dataType.rawValue)"
+            )
+        }
+        let dim = CosyVoice3Constants.embedDim
+        let strides = array.strides.map { $0.intValue }
+        let src = array.dataPointer.bindMemory(to: Float.self, capacity: array.count)
+        var out = [Float](repeating: 0, count: tPre * dim)
+        out.withUnsafeMutableBufferPointer { dst in
+            // NOTE(review): loop interior reconstructed after extraction damage.
+            // Fast path: a compact innermost stride lets us memcpy whole rows;
+            // otherwise gather element-by-element honoring strides[2].
+            for t in 0..<tPre {
+                let rowBase = t * strides[1]
+                if strides[2] == 1 {
+                    memcpy(
+                        dst.baseAddress! + t * dim,
+                        src + rowBase,
+                        dim * MemoryLayout<Float>.size)
+                } else {
+                    for d in 0..<dim {
+                        dst[t * dim + d] = src[rowBase + d * strides[2]]
+                    }
+                }
+            }
+        }
+        return out
+    }
+
+    /// Parse the special-tokens JSON into `token string → id`.
+    private static func loadSpecialTokens(url: URL) throws -> [String: Int32] {
+        let data = try Data(contentsOf: url)
+        // Accept either the tokenizer_fixture.json shape
+        // ({"special_tokens": {...}, "cases": [...]}) or a flat map.
+        let json = try JSONSerialization.jsonObject(with: data)
+        let raw: [String: Any]
+        if let obj = json as? [String: Any], let nested = obj["special_tokens"] as? [String: Any] {
+            raw = nested
+        } else if let obj = json as? [String: Any] {
+            raw = obj
+        } else {
+            throw CosyVoice3Error.invalidShape(
+                "special tokens file must be a JSON object, got \(type(of: json))")
+        }
+        var out: [String: Int32] = [:]
+        out.reserveCapacity(raw.count)
+        for (k, v) in raw {
+            if let n = v as? Int {
+                out[k] = Int32(n)
+            } else if let n = v as? NSNumber {
+                out[k] = n.int32Value
+            }
+        }
+        guard !out.isEmpty else {
+            throw CosyVoice3Error.invalidShape(
+                "special tokens file parsed to an empty map at \(url.path)")
+        }
+        return out
+    }
+}
diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3ChineseNormalizer.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3ChineseNormalizer.swift
new file mode 100644
index 000000000..53457a8c1
--- /dev/null
+++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3ChineseNormalizer.swift
@@ -0,0 +1,145 @@
+import Foundation
+
+/// Minimal Mandarin text normalizer ported from CosyVoice's
+/// `cosyvoice/utils/frontend_utils.py` + the Chinese branch of
+/// `cosyvoice/cli/frontend.py:text_normalize`.
+///
+/// **Scope (intentional):** regex-free character-level rules plus digit
+/// spellout. The full `wetext.ZhNormalizer` (which rewrites years, phone
+/// numbers, decimals, units, chemistry, currency, dates…) is **not** ported.
+/// Callers that need production-quality TN should run wetext server-side and
+/// pass the result via `synthesize(text:prenormalized: true, ...)`.
+///
+/// Rules applied (in order):
+/// 1. strip newlines, leading/trailing whitespace
+/// 2. `replaceCornerMark` — `²` → `平方`, `³` → `立方`
+/// 3. ASCII digits → 零一二三四五六七八九 (per-digit fallback; lossy vs wetext
+///    but avoids raw Arabic numerals going into the BPE)
+/// 4. `.` → `。`, ` - ` → `,`
+/// 5. `replaceBlank` — remove spaces between CJK chars; keep spaces between
+///    ASCII tokens. Runs *after* the ASCII→CJK substitutions above so
+///    spaces that became CJK-interior are also cleaned up.
+/// 6. `removeBracket` — drop `()【】` and backticks, `——` → space
+/// 7. trailing `,` / `,` / `、` sequences → `。`
+public enum CosyVoice3ChineseNormalizer {
+
+    public static func normalize(_ text: String) -> String {
+        var s = text
+        s = s.replacingOccurrences(of: "\n", with: "")
+        s = s.trimmingCharacters(in: .whitespaces)
+        s = replaceCornerMark(s)
+        s = spellOutDigitsZh(s)
+        s = s.replacingOccurrences(of: ".", with: "。")
+        s = s.replacingOccurrences(of: " - ", with: ",")
+        s = replaceBlank(s)
+        s = removeBracket(s)
+        s = stripTrailingCommaLikes(s)
+        return s
+    }
+
+    /// True if `text` contains at least one CJK Unified Ideograph
+    /// (U+4E00..U+9FFF), matching `contains_chinese` in frontend_utils.py.
+    public static func containsChinese(_ text: String) -> Bool {
+        for scalar in text.unicodeScalars where (0x4E00...0x9FFF).contains(scalar.value) {
+            return true
+        }
+        return false
+    }
+
+    /// True if `text` is empty or consists only of Unicode punctuation /
+    /// symbol characters. Mirrors `is_only_punctuation`.
+    public static func isOnlyPunctuation(_ text: String) -> Bool {
+        if text.isEmpty { return true }
+        let allowed: CharacterSet = {
+            var s = CharacterSet.punctuationCharacters
+            s.formUnion(.symbols)
+            s.formUnion(.whitespaces)
+            return s
+        }()
+        for scalar in text.unicodeScalars where !allowed.contains(scalar) {
+            return false
+        }
+        return true
+    }
+
+    // MARK: - Individual rules
+
+    /// Drop spaces between non-ASCII chars; keep spaces that sit between two
+    /// ASCII tokens (e.g. "hello world" stays, "中 国" → "中国").
+    static func replaceBlank(_ text: String) -> String {
+        let chars = Array(text)
+        var out: [Character] = []
+        out.reserveCapacity(chars.count)
+        // NOTE(review): loop header/guard reconstructed after extraction damage;
+        // out-of-bounds neighbors are treated as spaces so edge spaces drop.
+        for i in 0..<chars.count {
+            let c = chars[i]
+            if c == " " {
+                let prev = i > 0 ? chars[i - 1] : Character(" ")
+                let next = i + 1 < chars.count ? chars[i + 1] : Character(" ")
+                let prevOk = prev.isASCII && prev != " "
+                let nextOk = next.isASCII && next != " "
+                if prevOk && nextOk {
+                    out.append(c)
+                }
+            } else {
+                out.append(c)
+            }
+        }
+        return String(out)
+    }
+
+    static func replaceCornerMark(_ text: String) -> String {
+        var s = text
+        s = s.replacingOccurrences(of: "²", with: "平方")
+        s = s.replacingOccurrences(of: "³", with: "立方")
+        return s
+    }
+
+    static func removeBracket(_ text: String) -> String {
+        var s = text
+        s = s.replacingOccurrences(of: "(", with: "")
+        s = s.replacingOccurrences(of: ")", with: "")
+        s = s.replacingOccurrences(of: "【", with: "")
+        s = s.replacingOccurrences(of: "】", with: "")
+        s = s.replacingOccurrences(of: "`", with: "")
+        s = s.replacingOccurrences(of: "——", with: " ")
+        return s
+    }
+
+    /// Replace each ASCII digit in `text` with its Chinese reading. Lossy
+    /// per-digit fallback (e.g. `2024` → `二零二四`); correct for years / IDs
+    /// but wrong for decimals or large cardinals. Acceptable as a placeholder
+    /// while wetext remains server-side.
+    static func spellOutDigitsZh(_ text: String) -> String {
+        let map: [Character: String] = [
+            "0": "零", "1": "一", "2": "二", "3": "三", "4": "四",
+            "5": "五", "6": "六", "7": "七", "8": "八", "9": "九",
+        ]
+        var out = ""
+        out.reserveCapacity(text.count)
+        for ch in text {
+            if let zh = map[ch] {
+                out += zh
+            } else {
+                out.append(ch)
+            }
+        }
+        return out
+    }
+
+    /// Collapse a run of trailing `,` / `,` / `、` into a single `。`.
+    /// Equivalent to the Python `re.sub(r'[,,、]+$', '。', text)` rule.
+    static func stripTrailingCommaLikes(_ text: String) -> String {
+        let commaLikes: Set<Character> = [",", ",", "、"]
+        var chars = Array(text)
+        var end = chars.count
+        while end > 0, commaLikes.contains(chars[end - 1]) {
+            end -= 1
+        }
+        if end == chars.count {
+            return text
+        }
+        // NOTE(review): tail reconstructed after extraction damage — the Python
+        // rule replaces the trailing comma run with a single `。`.
+        chars = Array(chars[0..<end])
+        chars.append("。")
+        return String(chars)
+    }
+}
diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3FrontendFixture.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3FrontendFixture.swift
new file mode 100644
index 000000000..000000000
--- /dev/null
+++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3FrontendFixture.swift
@@ -0,0 +1,118 @@
+import Foundation
+
+/// Phase 1 parity fixture: Python-exported intermediate tensors replayed
+/// through the Swift CoreML pipeline.
+///
+/// NOTE(review): this file header, the stored-property list, and the `load`
+/// signature were reconstructed after extraction damage. The field list is
+/// inferred from the two memberwise-initializer call sites (here and in
+/// `CosyVoice3TtsManager.synthesize`) — confirm against the original source.
+struct CosyVoice3FrontendFixture: Sendable {
+    let lmInputEmbeds: [Float]
+    let tPre: Int
+    let promptSpeechIds: [Int32]
+    let promptMel: [Float]
+    let promptMelFrames: Int
+    let spkEmbedding: [Float]
+    let decodedTokens: [Int32]
+    let seed: Int32
+    let numPromptMel: Int
+    let audioLengthSamples: Int
+
+    /// Load a fixture safetensors bundle exported by the Python verify tools.
+    static func load(from url: URL) throws -> CosyVoice3FrontendFixture {
+        let file = try SafetensorsFile(url: url)
+
+        let lmInfo = try file.info("lm_input_embeds")
+        guard
+            lmInfo.dtype == .f32,
+            lmInfo.shape.count == 3,
+            lmInfo.shape[0] == 1,
+            lmInfo.shape[2] == CosyVoice3Constants.embedDim
+        else {
+            throw CosyVoice3Error.invalidFixture(
+                "lm_input_embeds expects [1, t_pre, 896] fp32, got shape=\(lmInfo.shape) dtype=\(lmInfo.dtype.rawValue)"
+            )
+        }
+        let lmInputEmbeds = try file.asFloat32("lm_input_embeds")
+        let tPre = lmInfo.shape[1]
+        guard tPre > 0 && tPre <= CosyVoice3Constants.prefillLength else {
+            throw CosyVoice3Error.prefillTooLong(tPre)
+        }
+
+        let promptIdsInfo = try file.info("llm_prompt_speech_ids")
+        guard
+            promptIdsInfo.shape.count == 2,
+            promptIdsInfo.shape[0] == 1
+        else {
+            throw CosyVoice3Error.invalidFixture(
+                "llm_prompt_speech_ids expects [1, N], got \(promptIdsInfo.shape)")
+        }
+        let promptSpeechIds = try file.asInt32("llm_prompt_speech_ids")
+
+        let promptMelInfo = try file.info("prompt_mel")
+        guard
+            promptMelInfo.dtype == .f32,
+            promptMelInfo.shape.count == 3,
+            promptMelInfo.shape[0] == 1,
+            promptMelInfo.shape[2] == CosyVoice3Constants.melBins
+        else {
+            throw CosyVoice3Error.invalidFixture(
+                "prompt_mel expects [1, frames, 80] fp32, got \(promptMelInfo.shape)")
+        }
+        let promptMel = try file.asFloat32("prompt_mel")
+        let promptMelFrames = promptMelInfo.shape[1]
+
+        let spkInfo = try file.info("spk_embedding")
+        guard
+            spkInfo.dtype == .f32,
+            spkInfo.shape == [1, CosyVoice3Constants.speakerEmbeddingDim]
+        else {
+            throw CosyVoice3Error.invalidFixture(
+                "spk_embedding expects [1, 192] fp32, got \(spkInfo.shape)")
+        }
+        let spkEmbedding = try file.asFloat32("spk_embedding")
+
+        let decodedTokens = try file.asInt32("decoded_tokens")
+        let seedValue = try file.asInt32("seed").first ?? 0
+
+        let numPromptMel = try file.asInt("num_prompt_mel")
+        let audioLengthSamples = try file.asInt("audio_length_samples")
+
+        return CosyVoice3FrontendFixture(
+            lmInputEmbeds: lmInputEmbeds,
+            tPre: tPre,
+            promptSpeechIds: promptSpeechIds,
+            promptMel: promptMel,
+            promptMelFrames: promptMelFrames,
+            spkEmbedding: spkEmbedding,
+            decodedTokens: decodedTokens,
+            seed: seedValue,
+            numPromptMel: numPromptMel,
+            audioLengthSamples: audioLengthSamples)
+    }
+}
diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptAssets.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptAssets.swift
new file mode 100644
index 000000000..0c10cd203
--- /dev/null
+++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptAssets.swift
@@ -0,0 +1,115 @@
+import Foundation
+
+/// Zero-shot prompt assets bundled alongside CosyVoice3 inference.
+///
+/// Phase 2 keeps SpeechTokenizer and CAMPPlus Python-side: `llmPromptSpeechIds`
+/// and `spkEmbedding` are precomputed from a reference prompt WAV and shipped
+/// as a single safetensors file with a JSON sidecar carrying the prompt text.
+/// A later phase will regenerate these on-device once the SpeechTokenizer and
+/// CAMPPlus DSPs + CoreML bindings land.
+///
+/// The shipping layout mirrors what
+/// `verify/export_swift_fixture.py` produces, so the Phase 1 fixture doubles
+/// as a valid prompt-assets bundle:
+///
+/// ```
+/// <voiceId>.safetensors
+///     llm_prompt_speech_ids int32 [1, N_speech]
+///     prompt_mel float32 [1, 2*N_speech, 80]
+///     spk_embedding float32 [1, 192]
+///     (any other tensors are ignored)
+/// <voiceId>.json
+///     { "prompt_text": "...", "tts_text": "..." }
+/// ```
+public struct CosyVoice3PromptAssets: Sendable {
+
+    /// Prompt text seed. MUST contain `<|endofprompt|>` (id 151646).
+    public let promptText: String
+
+    /// Discrete speech token prefix fed to Flow (`token_total[:, :N_speech]`)
+    /// AND used to build the LLM prefill embed table.
+ public let promptSpeechIds: [Int32] + + /// Mel frames computed from the prompt WAV (`[1, 2*N_speech, 80]` fp32). + /// Flattened row-major `[frames * 80]`; `promptMelFrames` is the frame count. + public let promptMel: [Float] + public let promptMelFrames: Int + + /// CAMPPlus speaker embedding for the prompt voice (`[1, 192]` fp32). + public let spkEmbedding: [Float] + + public init( + promptText: String, + promptSpeechIds: [Int32], + promptMel: [Float], + promptMelFrames: Int, + spkEmbedding: [Float] + ) { + self.promptText = promptText + self.promptSpeechIds = promptSpeechIds + self.promptMel = promptMel + self.promptMelFrames = promptMelFrames + self.spkEmbedding = spkEmbedding + } + + /// Load from `.safetensors` + `.json` sidecar. + /// + /// - Parameter url: URL to the `.safetensors` file. The sidecar is expected + /// next to it with the same basename and `.json` extension. + public static func load(from url: URL) throws -> CosyVoice3PromptAssets { + let file = try SafetensorsFile(url: url) + + let idsInfo = try file.info("llm_prompt_speech_ids") + guard idsInfo.shape.count == 2, idsInfo.shape[0] == 1 else { + throw CosyVoice3Error.invalidFixture( + "llm_prompt_speech_ids expects [1, N], got \(idsInfo.shape)") + } + let promptSpeechIds = try file.asInt32("llm_prompt_speech_ids") + + let melInfo = try file.info("prompt_mel") + guard + melInfo.dtype == .f32, + melInfo.shape.count == 3, + melInfo.shape[0] == 1, + melInfo.shape[2] == CosyVoice3Constants.melBins + else { + throw CosyVoice3Error.invalidFixture( + "prompt_mel expects [1, frames, 80] fp32, got \(melInfo.shape)") + } + let promptMel = try file.asFloat32("prompt_mel") + let promptMelFrames = melInfo.shape[1] + + let spkInfo = try file.info("spk_embedding") + guard + spkInfo.dtype == .f32, + spkInfo.shape == [1, CosyVoice3Constants.speakerEmbeddingDim] + else { + throw CosyVoice3Error.invalidFixture( + "spk_embedding expects [1, 192] fp32, got \(spkInfo.shape)") + } + let spkEmbedding = try 
file.asFloat32("spk_embedding") + + let sidecarURL = url.deletingPathExtension().appendingPathExtension("json") + guard FileManager.default.fileExists(atPath: sidecarURL.path) else { + throw CosyVoice3Error.invalidFixture( + "prompt sidecar JSON not found next to \(url.lastPathComponent) — expected \(sidecarURL.lastPathComponent)" + ) + } + struct Sidecar: Decodable { let prompt_text: String } + let sidecar: Sidecar + do { + sidecar = try JSONDecoder().decode( + Sidecar.self, from: try Data(contentsOf: sidecarURL)) + } catch { + throw CosyVoice3Error.invalidFixture( + "failed to decode \(sidecarURL.lastPathComponent): \(error)") + } + + return CosyVoice3PromptAssets( + promptText: sidecar.prompt_text, + promptSpeechIds: promptSpeechIds, + promptMel: promptMel, + promptMelFrames: promptMelFrames, + spkEmbedding: spkEmbedding) + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptMel.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptMel.swift new file mode 100644 index 000000000..0e3f8a196 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptMel.swift @@ -0,0 +1,307 @@ +import Accelerate +import Foundation + +/// On-device mel spectrogram extractor for CosyVoice3 prompt audio. +/// +/// Matches `matcha.utils.audio.mel_spectrogram` invoked from +/// `cosyvoice/cli/frontend.py:_extract_speech_feat` with the CosyVoice3 config +/// (see `examples/libritts/cosyvoice3/conf/cosyvoice3.yaml`): +/// +/// ``` +/// n_fft: 1920 +/// num_mels: 80 +/// sampling_rate: 24000 +/// hop_size: 480 +/// win_size: 1920 +/// fmin: 0 +/// fmax: null (→ sampling_rate / 2 = 12000 per librosa default) +/// center: False +/// ``` +/// +/// Pipeline (verbatim from the Python reference): +/// 1. reflect-pad the waveform by `(n_fft - hop_size) / 2 = 720` on each side +/// 2. 
framed STFT with `n_fft=1920, hop=480, win=1920`, periodic Hann window +/// (`torch.hann_window` default), `center=False` +/// 3. magnitude = `sqrt(real² + imag² + 1e-9)` (Matcha convention) +/// 4. `mel = mel_basis @ magnitude` using Slaney-normalized mel filterbank +/// (librosa default: HTK=False, norm='slaney') +/// 5. `log_mel = log(clamp(mel, min=1e-5))` +/// +/// The output is flattened `[T, 80]` row-major fp32, which is the layout +/// `CosyVoice3PromptAssets.promptMel` stores and the Flow model consumes as +/// `[1, 2*N_speech, 80]` after slicing to match the prompt-speech id count. +/// +/// Use `trimToTokenRatio(...)` to enforce the `frames == 2 * N_speech` +/// invariant before passing to Flow (matches the +/// `speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len` +/// clamp in the Python frontend). +public final class CosyVoice3PromptMel { + + public static let sampleRate = 24_000 + public static let nFFT = 1_920 + public static let hopSize = 480 + public static let winSize = 1_920 + public static let numMels = 80 + public static let fMin: Float = 0 + public static let fMax: Float = 12_000 // sr / 2 + /// Reflect-pad each side by `(n_fft - hop_size) / 2`. + public static let padLength = (nFFT - hopSize) / 2 // 720 + /// Magnitude epsilon before sqrt (prevents NaN gradients in training; kept + /// here for bit parity with the reference). + private static let magEps: Float = 1e-9 + /// Log floor clamp applied inside `log(clamp(x, min=1e-5))`. + private static let logFloor: Float = 1e-5 + + // Precomputed resources + private let hannWindow: [Float] + private let melBasis: [Float] // flat [numMels * numFreqBins] + private let numFreqBins: Int + private var fftSetup: vDSP_DFT_Setup? + + // Reusable buffers (not thread-safe; wrap with a queue if shared). 
+ private var frameBuf: [Float] + private var realIn: [Float] + private var imagIn: [Float] + private var realOut: [Float] + private var imagOut: [Float] + private var magnitude: [Float] + private var imagSq: [Float] + + public init() { + self.numFreqBins = Self.nFFT / 2 + 1 + // torch.hann_window(N) defaults to periodic=True — sample i of length + // N is `0.5 * (1 - cos(2πi/N))`. This matches Matcha's code path via + // the torch.stft default. + self.hannWindow = Self.hannWindowPeriodic(length: Self.winSize) + self.melBasis = Self.buildSlaneyMelBasis( + sampleRate: Self.sampleRate, + nFFT: Self.nFFT, + numMels: Self.numMels, + fMin: Self.fMin, + fMax: Self.fMax) + self.fftSetup = vDSP_DFT_zop_CreateSetup(nil, vDSP_Length(Self.nFFT), .FORWARD) + self.frameBuf = [Float](repeating: 0, count: Self.nFFT) + self.realIn = [Float](repeating: 0, count: Self.nFFT) + self.imagIn = [Float](repeating: 0, count: Self.nFFT) + self.realOut = [Float](repeating: 0, count: Self.nFFT) + self.imagOut = [Float](repeating: 0, count: Self.nFFT) + self.magnitude = [Float](repeating: 0, count: numFreqBins) + self.imagSq = [Float](repeating: 0, count: numFreqBins) + } + + deinit { + if let setup = fftSetup { + vDSP_DFT_DestroySetup(setup) + } + } + + public struct Result: Sendable { + /// `[frames * numMels]` row-major, fp32. + public let mel: [Float] + public let frames: Int + } + + /// Compute the log-mel spectrogram for a 24 kHz mono waveform. + /// + /// - Parameter audio: fp32 PCM samples at 24 kHz, range ≈ [-1, 1]. + /// - Returns: `[T * 80]` row-major fp32 mel, where + /// `T = floor((len + 2·padLength - nFFT) / hopSize) + 1`. 
+ public func compute(audio: [Float]) throws -> Result { + guard let setup = fftSetup else { + throw CosyVoice3Error.invalidShape("vDSP_DFT setup failed") + } + guard audio.count > 0 else { + return Result(mel: [], frames: 0) + } + + let padded = Self.reflectPad(audio, pad: Self.padLength) + let paddedCount = padded.count + let frames = max(0, (paddedCount - Self.nFFT) / Self.hopSize + 1) + guard frames > 0 else { + return Result(mel: [], frames: 0) + } + + var mel = [Float](repeating: 0, count: frames * Self.numMels) + + for frameIdx in 0...size) + } + } + vDSP_vclr(&imagIn, 1, vDSP_Length(Self.nFFT)) + vDSP_DFT_Execute(setup, realIn, imagIn, &realOut, &imagOut) + + // magnitude = sqrt(real² + imag² + 1e-9) over one-sided bins. + vDSP_vsq(realOut, 1, &magnitude, 1, vDSP_Length(numFreqBins)) + vDSP_vsq(imagOut, 1, &imagSq, 1, vDSP_Length(numFreqBins)) + vDSP_vadd(magnitude, 1, imagSq, 1, &magnitude, 1, vDSP_Length(numFreqBins)) + var eps = Self.magEps + vDSP_vsadd(magnitude, 1, &eps, &magnitude, 1, vDSP_Length(numFreqBins)) + var n = Int32(numFreqBins) + vvsqrtf(&magnitude, magnitude, &n) + + // mel = melBasis[80, numFreqBins] @ magnitude[numFreqBins] + var melFrame = [Float](repeating: 0, count: Self.numMels) + melBasis.withUnsafeBufferPointer { basisPtr in + magnitude.withUnsafeBufferPointer { magPtr in + melFrame.withUnsafeMutableBufferPointer { outPtr in + vDSP_mmul( + basisPtr.baseAddress!, 1, + magPtr.baseAddress!, 1, + outPtr.baseAddress!, 1, + vDSP_Length(Self.numMels), + vDSP_Length(1), + vDSP_Length(numFreqBins)) + } + } + } + + // log(clamp(x, min=1e-5)) + for m in 0.. 
(mel: [Float], frames: Int) { + let targetFrames = 2 * tokenCount + guard frames >= targetFrames else { + throw CosyVoice3Error.invalidShape( + "prompt mel has \(frames) frames but tokenCount=\(tokenCount) requires \(targetFrames)" + ) + } + if frames == targetFrames { + return (mel, frames) + } + let trimmed = Array(mel.prefix(targetFrames * numMels)) + return (trimmed, targetFrames) + } + + // MARK: - Helpers + + /// PyTorch `F.pad(..., mode="reflect")` on a 1-D signal: + /// - left: [y[pad], y[pad-1], ..., y[1]] + /// - core: y[0.. [Float] { + let n = y.count + if pad <= 0 { return y } + // PyTorch requires pad < n for reflect. Guard loudly for a silently + // bad prompt (very short audio). + precondition(pad < n, "reflect pad=\(pad) requires signal length > \(pad), got \(n)") + var out = [Float](repeating: 0, count: n + 2 * pad) + for i in 0.. [Float] { + var w = [Float](repeating: 0, count: length) + let divisor = Float(length) + for i in 0.. [Float] { + let numFreqBins = nFFT / 2 + 1 + + let melMin = hzToMelSlaney(fMin) + let melMax = hzToMelSlaney(fMax) + + var melPoints = [Float](repeating: 0, count: numMels + 2) + for i in 0..<(numMels + 2) { + let mel = melMin + Float(i) * (melMax - melMin) / Float(numMels + 1) + melPoints[i] = melToHzSlaney(mel) + } + + var fftFreqs = [Float](repeating: 0, count: numFreqBins) + for i in 0..= fLeft && freq < fCenter { + w = norm * (freq - fLeft) / (fCenter - fLeft) + } else if freq >= fCenter && freq <= fRight { + w = norm * (fRight - freq) / (fRight - fCenter) + } + basis[m * numFreqBins + f] = w + } + } + return basis + } + + static func hzToMelSlaney(_ hz: Float) -> Float { + let fSp: Float = 200.0 / 3.0 + let minLogHz: Float = 1_000.0 + let minLogMel: Float = minLogHz / fSp + let logStep: Float = log(6.4) / 27.0 + return hz >= minLogHz + ? 
minLogMel + log(hz / minLogHz) / logStep + : hz / fSp + } + + static func melToHzSlaney(_ mel: Float) -> Float { + let fSp: Float = 200.0 / 3.0 + let minLogHz: Float = 1_000.0 + let minLogMel: Float = minLogHz / fSp + let logStep: Float = log(6.4) / 27.0 + return mel >= minLogMel + ? minLogHz * exp(logStep * (mel - minLogMel)) + : fSp * mel + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3TextEmbeddings.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3TextEmbeddings.swift new file mode 100644 index 000000000..54aeaefff --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3TextEmbeddings.swift @@ -0,0 +1,142 @@ +@preconcurrency import CoreML +import Foundation + +/// mmap'd reader for Qwen2 `text_embedding` [151936, 896] and CosyVoice3 +/// `speech_embedding` [6761, 896] tables (both fp32). Used by the Phase 2 +/// text frontend to assemble `lm_input_embeds` natively in Swift. +/// +/// The Phase 1 per-step decode embedding path still uses +/// `CosyVoice3SpeechEmbeddings` (fp16 table) to save memory during long +/// autoregressive loops; that code remains unchanged. 
+public final class CosyVoice3TextEmbeddings { + + private let file: SafetensorsFile + private let textBytes: Data + private let speechBytes: Data + public let textVocab: Int + public let speechVocab: Int + public let embedDim: Int + + public init(url: URL) throws { + let file = try SafetensorsFile(url: url) + guard let text = file.tensors["text_embedding"] else { + throw CosyVoice3Error.embeddingTableMissing("text_embedding") + } + guard let speech = file.tensors["speech_embedding"] else { + throw CosyVoice3Error.embeddingTableMissing("speech_embedding") + } + guard text.dtype == .f32, text.shape.count == 2 else { + throw CosyVoice3Error.invalidShape( + "text_embedding expects [vocab, 896] fp32, got shape=\(text.shape) dtype=\(text.dtype.rawValue)" + ) + } + guard speech.dtype == .f32, speech.shape.count == 2 else { + throw CosyVoice3Error.invalidShape( + "speech_embedding expects [vocab, 896] fp32, got shape=\(speech.shape) dtype=\(speech.dtype.rawValue)" + ) + } + guard text.shape[1] == speech.shape[1] else { + throw CosyVoice3Error.invalidShape( + "text_embedding dim=\(text.shape[1]) != speech_embedding dim=\(speech.shape[1])" + ) + } + self.file = file + self.textBytes = try file.rawBytes("text_embedding") + self.speechBytes = try file.rawBytes("speech_embedding") + self.textVocab = text.shape[0] + self.speechVocab = speech.shape[0] + self.embedDim = text.shape[1] + guard self.embedDim == CosyVoice3Constants.embedDim else { + throw CosyVoice3Error.invalidShape( + "embed_dim=\(embedDim) does not match CosyVoice3Constants.embedDim=\(CosyVoice3Constants.embedDim)" + ) + } + } + + /// Assemble LLM-Prefill input: + /// `lm_input = concat([sos, text_embedding[text_ids], task_id, speech_embedding[prompt_speech_ids]], dim=1)` + /// + /// Returns a `[1, T_pre, 896]` fp32 MLMultiArray and `T_pre = 1 + N_text + 1 + N_speech`. + /// The LLM-Prefill model expects T padded to 256; this method returns the + /// unpadded tensor — callers must pad or pass `T_pre` separately. 
+ public func assembleLmInput( + textTokenIds: [Int32], + promptSpeechIds: [Int32], + sos: Int32 = CosyVoice3Constants.sosId, + taskId: Int32 = CosyVoice3Constants.taskId + ) throws -> (embeds: MLMultiArray, tPre: Int) { + let nText = textTokenIds.count + let nSpeech = promptSpeechIds.count + let tPre = 1 + nText + 1 + nSpeech + let dim = embedDim + let array = try MLMultiArray( + shape: [1, NSNumber(value: tPre), NSNumber(value: dim)], + dataType: .float32) + let strides = array.strides.map { $0.intValue } + let dst = array.dataPointer.bindMemory(to: Float.self, capacity: array.count) + + // Row t (within the T_pre axis) → destination pointer. + func row(_ t: Int) -> UnsafeMutablePointer { + dst.advanced(by: t * strides[1]) + } + + // 1) sos + try copySpeechRow(sos, into: row(0), stride: strides[2]) + // 2) text_embedding[text_ids] + for (i, id) in textTokenIds.enumerated() { + try copyTextRow(id, into: row(1 + i), stride: strides[2]) + } + // 3) task_id + try copySpeechRow(taskId, into: row(1 + nText), stride: strides[2]) + // 4) speech_embedding[prompt_speech_ids] + for (i, id) in promptSpeechIds.enumerated() { + try copySpeechRow(id, into: row(1 + nText + 1 + i), stride: strides[2]) + } + + return (array, tPre) + } + + // MARK: - Row copy + + private func copyTextRow( + _ id: Int32, into dst: UnsafeMutablePointer, stride: Int + ) throws { + guard id >= 0 && Int(id) < textVocab else { + throw CosyVoice3Error.invalidShape( + "text token id \(id) out of range [0, \(textVocab))") + } + let rowStart = Int(id) * embedDim * 4 + textBytes.withUnsafeBytes { src in + let basePtr = src.baseAddress!.advanced(by: rowStart) + .assumingMemoryBound(to: Float.self) + if stride == 1 { + memcpy(dst, basePtr, embedDim * 4) + } else { + for i in 0.., stride: Int + ) throws { + guard id >= 0 && Int(id) < speechVocab else { + throw CosyVoice3Error.invalidShape( + "speech token id \(id) out of range [0, \(speechVocab))") + } + let rowStart = Int(id) * embedDim * 4 + 
speechBytes.withUnsafeBytes { src in + let basePtr = src.baseAddress!.advanced(by: rowStart) + .assumingMemoryBound(to: Float.self) + if stride == 1 { + memcpy(dst, basePtr, embedDim * 4) + } else { + for i in 0..` token + /// (id 151646). The Python pipeline asserts this in + /// `cosyvoice/llm.py:478`. + public func assemble( + promptText: String, + ttsText: String, + promptSpeechIds: [Int32] + ) throws -> Assembled { + let promptIds = tokenizer.encode(promptText) + let ttsIds = tokenizer.encode(ttsText) + // Python asserts 151646 is present somewhere in the combined token + // stream. Enforce here to avoid silent parity breakage. + let endOfPrompt: Int32 = 151_646 + guard promptIds.contains(endOfPrompt) || ttsIds.contains(endOfPrompt) else { + throw CosyVoice3Error.invalidShape( + "<|endofprompt|> (id 151646) not present in promptText or ttsText") + } + let combined = promptIds + ttsIds + + let (embeds, tPre) = try embeddings.assembleLmInput( + textTokenIds: combined, + promptSpeechIds: promptSpeechIds) + guard tPre <= CosyVoice3Constants.prefillLength else { + throw CosyVoice3Error.invalidShape( + "assembled T_pre=\(tPre) exceeds LLM-Prefill length \(CosyVoice3Constants.prefillLength)" + ) + } + return Assembled(lmInputEmbeds: embeds, tPre: tPre, textTokenIds: combined) + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2BpeTokenizer.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2BpeTokenizer.swift new file mode 100644 index 000000000..29c39a8e6 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2BpeTokenizer.swift @@ -0,0 +1,277 @@ +import Foundation + +/// Qwen2 byte-level BPE tokenizer. Mirrors +/// `transformers.models.qwen2.tokenization_qwen2.Qwen2Tokenizer` on the slow +/// path used by CosyVoice3 (`AutoTokenizer.from_pretrained(...)` + runtime +/// `add_special_tokens(...)` as done in `CosyVoice3Tokenizer`). +/// +/// Encoding pipeline: +/// 1. 
Split input on registered special tokens (longest-match first). Special +/// chunks map 1:1 to their fixed ID. +/// 2. Pretokenize non-special chunks with Qwen2's regex. +/// 3. UTF-8 encode each match and remap bytes via the GPT-2 byte→unicode +/// shim (`ByteEncoder` below). +/// 4. Apply BPE merges (lowest rank wins, all occurrences merged per pass). +/// 5. Look up the resulting symbols in `vocab.json` to get token IDs. +/// +/// Loader accepts the standard HuggingFace asset layout: +/// /vocab.json — {"symbol": id, ...} +/// /merges.txt — first line is a header or the first merge; +/// subsequent lines are "A B" pairs, rank = line idx. +/// Special tokens are passed in separately (from a JSON map exported alongside +/// the CosyVoice3 fixtures — the runtime add_special_tokens list in Python is +/// not encoded in the HF assets). +public final class Qwen2BpeTokenizer { + + public enum Error: Swift.Error, LocalizedError { + case fileNotFound(URL) + case invalidJSON(String) + case missingField(String) + case regexCompileFailed + + public var errorDescription: String? { + switch self { + case .fileNotFound(let url): return "file not found: \(url.path)" + case .invalidJSON(let m): return "invalid JSON: \(m)" + case .missingField(let f): return "missing field: \(f)" + case .regexCompileFailed: return "failed to compile pretokenize regex" + } + } + } + + /// Qwen2 pretokenize regex (see `transformers` PRETOKENIZE_REGEX). + /// Matches: contractions, letter words, single digits, punctuation runs, + /// newline-led whitespace, trailing whitespace. + public static let pretokenizePattern = + #"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"# + + private let vocab: [String: Int32] + private let mergeRanks: [String: Int] // "firstSpace second" -> rank + private let specialTokens: [String: Int32] + private let specialPattern: NSRegularExpression? 
+ private let pretokenizeRegex: NSRegularExpression + + public init( + vocab: [String: Int32], + merges: [(String, String)], + specialTokens: [String: Int32] + ) throws { + self.vocab = vocab + var ranks: [String: Int] = [:] + ranks.reserveCapacity(merges.count) + for (i, pair) in merges.enumerated() { + ranks["\(pair.0) \(pair.1)"] = i + } + self.mergeRanks = ranks + self.specialTokens = specialTokens + + if !specialTokens.isEmpty { + // Longest-first so `<|endofprompt|>` wins over `<|end`. + let ordered = specialTokens.keys.sorted { $0.count > $1.count } + let alternation = ordered.map { NSRegularExpression.escapedPattern(for: $0) } + .joined(separator: "|") + self.specialPattern = try NSRegularExpression(pattern: alternation) + } else { + self.specialPattern = nil + } + + do { + self.pretokenizeRegex = try NSRegularExpression(pattern: Self.pretokenizePattern) + } catch { + throw Error.regexCompileFailed + } + } + + /// Load vocab.json + merges.txt from a directory and attach the runtime + /// special-token map (must be supplied externally; Python `AutoTokenizer` + /// adds these at import time via `add_special_tokens`). + public static func load( + directory: URL, + specialTokens: [String: Int32] + ) throws -> Qwen2BpeTokenizer { + let vocabURL = directory.appendingPathComponent("vocab.json") + let mergesURL = directory.appendingPathComponent("merges.txt") + guard FileManager.default.fileExists(atPath: vocabURL.path) else { + throw Error.fileNotFound(vocabURL) + } + guard FileManager.default.fileExists(atPath: mergesURL.path) else { + throw Error.fileNotFound(mergesURL) + } + + let vocabData = try Data(contentsOf: vocabURL) + guard let raw = try JSONSerialization.jsonObject(with: vocabData) as? 
[String: Int] else { + throw Error.invalidJSON("vocab.json is not {String: Int}") + } + var vocab: [String: Int32] = [:] + vocab.reserveCapacity(raw.count) + for (k, v) in raw { vocab[k] = Int32(v) } + + let mergesText = try String(contentsOf: mergesURL, encoding: .utf8) + var merges: [(String, String)] = [] + merges.reserveCapacity(140_000) + var isFirst = true + for line in mergesText.split(separator: "\n", omittingEmptySubsequences: true) { + if isFirst { + isFirst = false + // Typical merges.txt header: "#version: 0.2". Skip it. + if line.hasPrefix("#") { continue } + } + let parts = line.split(separator: " ", maxSplits: 1) + guard parts.count == 2 else { continue } + merges.append((String(parts[0]), String(parts[1]))) + } + + return try Qwen2BpeTokenizer(vocab: vocab, merges: merges, specialTokens: specialTokens) + } + + /// Encode text to token IDs. + public func encode(_ text: String) -> [Int32] { + var out: [Int32] = [] + splitBySpecial(text) { chunk, isSpecial in + if isSpecial { + if let id = specialTokens[chunk] { out.append(id) } + return + } + pretokenize(chunk) { piece in + let mapped = ByteEncoder.encode(piece.utf8) + let bpeTokens = bpe(mapped) + for tok in bpeTokens { + if let id = vocab[tok] { + out.append(id) + } else if let id = specialTokens[tok] { + out.append(id) + } + // Unknown token: Qwen2 has no . Drop silently as + // upstream never produces one for valid UTF-8 input. 
+ } + } + } + return out + } + + // MARK: - Special token split + + private func splitBySpecial(_ text: String, _ handle: (String, Bool) -> Void) { + guard let regex = specialPattern, !text.isEmpty else { + if !text.isEmpty { handle(text, false) } + return + } + let ns = text as NSString + let range = NSRange(location: 0, length: ns.length) + var cursor = 0 + regex.enumerateMatches(in: text, options: [], range: range) { match, _, _ in + guard let m = match else { return } + if m.range.location > cursor { + let sub = ns.substring(with: NSRange(location: cursor, length: m.range.location - cursor)) + if !sub.isEmpty { handle(sub, false) } + } + handle(ns.substring(with: m.range), true) + cursor = m.range.location + m.range.length + } + if cursor < ns.length { + let sub = ns.substring(with: NSRange(location: cursor, length: ns.length - cursor)) + if !sub.isEmpty { handle(sub, false) } + } + } + + // MARK: - Pretokenize + + private func pretokenize(_ text: String, _ handle: (String) -> Void) { + guard !text.isEmpty else { return } + let ns = text as NSString + let range = NSRange(location: 0, length: ns.length) + pretokenizeRegex.enumerateMatches(in: text, options: [], range: range) { match, _, _ in + guard let m = match else { return } + if m.range.length > 0 { + handle(ns.substring(with: m.range)) + } + } + } + + // MARK: - BPE + + /// Standard GPT-2 BPE: repeatedly merge the lowest-rank adjacent pair + /// until no pair is mergeable, then return the final symbol list. 
+ private func bpe(_ text: String) -> [String] { + if text.isEmpty { return [] } + var symbols = text.map { String($0) } + if symbols.count < 2 { return symbols } + + while true { + var bestRank = Int.max + var bestIndex = -1 + for i in 0..<(symbols.count - 1) { + let key = "\(symbols[i]) \(symbols[i + 1])" + if let r = mergeRanks[key], r < bestRank { + bestRank = r + bestIndex = i + } + } + if bestIndex < 0 { break } + + let first = symbols[bestIndex] + let second = symbols[bestIndex + 1] + var merged: [String] = [] + merged.reserveCapacity(symbols.count - 1) + var i = 0 + while i < symbols.count { + if i < symbols.count - 1 && symbols[i] == first && symbols[i + 1] == second { + merged.append(first + second) + i += 2 + } else { + merged.append(symbols[i]) + i += 1 + } + } + symbols = merged + if symbols.count < 2 { break } + } + return symbols + } + + // MARK: - Byte encoder + + /// GPT-2 style reversible byte→unicode mapping used by Qwen2 BPE. + /// + /// Mirrors `transformers.models.qwen2.tokenization_qwen2.bytes_to_unicode`: + /// - Printable ASCII, Latin-1 supplement (¡..¬), and (®..ÿ) map to themselves. + /// - The 68 "unprintable" bytes are remapped to code points 256..323. + /// + /// After mapping, every byte of a UTF-8 string becomes a single-code-point + /// unicode character that vocab/merges.txt expect. + fileprivate enum ByteEncoder { + + /// byte (0..255) → single Unicode scalar. + static let byteToUnicode: [Character] = { + var map = [Character](repeating: Character(" "), count: 256) + var printable = [Int]() + printable.reserveCapacity(188) + printable.append(contentsOf: Int(Character("!").asciiValue!)...Int(Character("~").asciiValue!)) + printable.append(contentsOf: 0xA1...0xAC) + printable.append(contentsOf: 0xAE...0xFF) + + for b in printable { + map[b] = Character(UnicodeScalar(b)!) + } + + var extra = 0 + for b in 0..<256 { + if !printable.contains(b) { + let scalar = UnicodeScalar(256 + extra)! 
+ map[b] = Character(scalar) + extra += 1 + } + } + return map + }() + + /// Encode a UTF-8 byte sequence as a string of mapped characters. + static func encode(_ bytes: some Sequence) -> String { + var out = "" + for b in bytes { + out.append(byteToUnicode[Int(b)]) + } + return out + } + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3RasSampler.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3RasSampler.swift new file mode 100644 index 000000000..f4cd579c0 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3RasSampler.swift @@ -0,0 +1,175 @@ +import Foundation + +/// RAS (Repetition-Aware Sampling) — top-p nucleus sampling with a repetition +/// mask that re-samples if a token fires too often in the recent window. +/// +/// Mirrors `ras_sampling` in +/// `mobius/.../verify/test_coreml_e2e_fp16.py`: +/// 1. softmax(logp) → stable-sort desc → pick up to `topK` ids until +/// cumulative mass ≥ `topP` +/// 2. multinomial draw within that candidate set +/// 3. if the drawn id appears in the last `winSize` decoded tokens at least +/// `winSize * tauR` times, mask it to -inf and re-sample across the full +/// vocab +/// +/// A `seedTokens` mode bypasses the RNG entirely — the sampler just emits the +/// pre-recorded Python token stream one id at a time. This is how the parity +/// harness bit-matches despite the `torch.multinomial` RNG mismatch between +/// PyTorch and Swift. 
+public final class CosyVoice3RasSampler { + + public let topP: Float + public let topK: Int + public let winSize: Int + public let tauR: Float + public let vocabSize: Int + + private var rng: SeedableRng + private var seedQueue: [Int32] + private var seedIdx: Int = 0 + + public init( + topP: Float = CosyVoice3Constants.topP, + topK: Int = CosyVoice3Constants.topK, + winSize: Int = CosyVoice3Constants.rasWindow, + tauR: Float = CosyVoice3Constants.rasTauR, + vocabSize: Int = CosyVoice3Constants.speechVocab, + seed: UInt64 = 42 + ) { + self.topP = topP + self.topK = topK + self.winSize = winSize + self.tauR = tauR + self.vocabSize = vocabSize + self.rng = SeedableRng(seed: seed) + self.seedQueue = [] + } + + /// Pre-load a token stream to replay (for parity harness). + public func seedTokens(_ tokens: [Int32]) { + self.seedQueue = tokens + self.seedIdx = 0 + } + + /// Given `logits` of shape `[vocabSize]`, return the sampled token id. + /// `decodedSoFar` is the running decoded stream for repetition checking. + public func sample(logits: [Float], decodedSoFar: [Int32]) -> Int32 { + // Seeded parity replay bypasses sampling. + if seedIdx < seedQueue.count { + let id = seedQueue[seedIdx] + seedIdx += 1 + return id + } + precondition(logits.count == vocabSize, "logits count must match vocabSize") + + // Pass 1: nucleus sampling. + let probs = logits.softmax() + let top = nucleus(probs: probs) + var sampled = top + + // Pass 2: repetition mask. + let windowStart = max(0, decodedSoFar.count - winSize) + let recent = decodedSoFar[windowStart..= Float(winSize) * tauR { + var masked = probs + masked[Int(sampled)] = 0 + // Re-normalize + multinomial across full vocab. + let sum = masked.reduce(0, +) + if sum > 0 { + for i in 0.. Int32 { + // Stable sort descending with index. 
+ let sorted = probs.enumerated().sorted { + if $0.element != $1.element { return $0.element > $1.element } + return $0.offset < $1.offset + } + var cum: Float = 0 + var selIdx: [Int] = [] + var selProb: [Float] = [] + for entry in sorted { + if cum < topP && selProb.count < topK { + cum += entry.element + selProb.append(entry.element) + selIdx.append(entry.offset) + } else { + break + } + } + // Normalize selected candidates and multinomial pick. + let sum = selProb.reduce(0, +) + guard sum > 0 else { return Int32(selIdx.first ?? 0) } + for i in 0.. Int32 { + let u = rng.nextFloat() + var cum: Float = 0 + for (i, p) in probs.enumerated() { + cum += p + if u < cum { return Int32(i) } + } + return Int32(probs.count - 1) + } + + private func multinomialInSet(probs: [Float], ids: [Int]) -> Int { + let u = rng.nextFloat() + var cum: Float = 0 + for (j, p) in probs.enumerated() { + cum += p + if u < cum { return ids[j] } + } + return ids.last ?? 0 + } +} + +// MARK: - Simple deterministic RNG + +/// Linear-congruential PRNG wrapping SplitMix64. Used only as a fallback when +/// parity replay isn't active; the parity harness seeds an explicit token list +/// to dodge `torch.multinomial` divergence. +private struct SeedableRng { + private var state: UInt64 + init(seed: UInt64) { self.state = seed == 0 ? 0xdead_beef : seed } + mutating func nextUInt64() -> UInt64 { + state &+= 0x9E37_79B9_7F4A_7C15 + var z = state + z = (z ^ (z >> 30)) &* 0xBF58_476D_1CE4_E5B9 + z = (z ^ (z >> 27)) &* 0x94D0_49BB_1331_11EB + return z ^ (z >> 31) + } + mutating func nextFloat() -> Float { + // 24-bit mantissa → [0, 1) + let bits = UInt32(truncatingIfNeeded: nextUInt64() >> 40) + return Float(bits) / Float(1 << 24) + } +} + +// MARK: - Array softmax + +extension Array where Element == Float { + fileprivate func softmax() -> [Float] { + guard let m = self.max() else { return self } + var exps = [Float](repeating: 0, count: self.count) + var sum: Float = 0 + for i in 0.. 0 { + for i in 0.. 
MLMultiArray { + let array = try MLMultiArray( + shape: [1, 1, NSNumber(value: embedDim)], + dataType: .float32) + try copyEmbedding(tokenId: tokenId, into: array) + return array + } + + /// Copy the fp16 embedding row for `tokenId` into an existing + /// `[1, 1, embedDim]` fp32 MLMultiArray. Avoids the per-step allocation + /// of `embedding(tokenId:)` in the hot decode loop. + public func copyEmbedding(tokenId: Int32, into array: MLMultiArray) throws { + guard tokenId >= 0 && Int(tokenId) < numTokens else { + throw CosyVoice3Error.invalidShape( + "speech token id \(tokenId) out of range [0, \(numTokens))") + } + let rowStart = Int(tokenId) * rowByteSize + let dim = embedDim + let lastStride = array.strides.last?.intValue ?? 1 + tableBytes.withUnsafeBytes { src in + let basePtr = src.baseAddress!.advanced(by: rowStart) + let fp16Ptr = basePtr.assumingMemoryBound(to: Float16.self) + let dstPtr = array.dataPointer.bindMemory(to: Float.self, capacity: array.count) + for i in 0.. CosyVoice3SynthesisResult { + + let nPrompt = fixture.promptSpeechIds.count + let roomForNew = CosyVoice3Constants.flowTotalTokens - nPrompt + guard roomForNew > 0 else { + throw CosyVoice3Error.sequenceTooLong(nPrompt) + } + let maxNew: Int = { + if let cap = options.maxNewTokens, cap > 0 { return min(cap, roomForNew) } + return roomForNew + }() + + // Sampler. Parity harness seeds the Python-recorded decode stream. + let sampler = CosyVoice3RasSampler(seed: options.seed) + if options.replayDecodedTokens { + sampler.seedTokens(fixture.decodedTokens) + } + + // 1) Prefill (non-stateful: returns kv_k / kv_v as outputs) + let tPrefill = Date() + let (prefillLogits, initialKvK, initialKvV) = try await runPrefill(fixture: fixture) + let prefillSec = Date().timeIntervalSince(tPrefill) + + // Seed decode MLState from prefill kv_k / kv_v. 
+ let tSeed = Date() + let state = models.decode.makeState() + try seedDecodeState(state: state, kvK: initialKvK, kvV: initialKvV) + let seedSec = Date().timeIntervalSince(tSeed) + + // Reusable per-step inputs for decode. `curLenArr` is mutated in place + // each step; `inputsEmbedsArr` is overwritten by memcpy per step. + let curLenArr = try MLMultiArray(shape: [1], dataType: .int32) + let inputsEmbedsArr = try MLMultiArray( + shape: [1, 1, NSNumber(value: CosyVoice3Constants.embedDim)], + dataType: .float32) + + // First token from prefill tail logits. + var decoded: [Int32] = [] + let firstLogits = sliceLastStepLogits( + from: prefillLogits, + tPre: fixture.tPre, + vocab: CosyVoice3Constants.speechVocab) + var topId = sampler.sample(logits: firstLogits, decodedSoFar: decoded) + if CosyVoice3Constants.stopRange.contains(topId) { + // Prefill emitted EOS at step 0 — the LLM signaled "no speech". + // Bail out instead of feeding the stop-token embedding into the + // decode loop (which would accumulate semantically meaningless + // tokens into `decoded`). + logger.info("First token \(topId) is a stop token; no speech generated") + throw CosyVoice3Error.predictionFailed("LLM produced no speech tokens") + } + decoded.append(topId) + + // 2) Decode loop + var curLen = fixture.tPre + var decodeSteps = 0 + let tDecode = Date() + for step in 1.. 0 ? 
Double(decodeSteps) / decodeSec : 0 + logger.info( + String( + format: + "STAGES prefill=%.3fs seed=%.3fs decode=%.3fs(%d steps, %.2f tok/s) flow=%.3fs hift=%.3fs", + prefillSec, seedSec, decodeSec, decodeSteps, decodeTps, flowSec, hiftSec)) + + return CosyVoice3SynthesisResult( + samples: audio, + sampleRate: CosyVoice3Constants.sampleRate, + generatedTokenCount: nNew, + decodedTokens: decoded) + } + + // MARK: - Stages + + private func runPrefill( + fixture: CosyVoice3FrontendFixture + ) async throws -> (logits: MLMultiArray, kvK: MLMultiArray, kvV: MLMultiArray) { + guard fixture.tPre <= CosyVoice3Constants.prefillLength else { + throw CosyVoice3Error.prefillTooLong(fixture.tPre) + } + // Pad lm_input_embeds from [1, tPre, 896] to [1, 256, 896]. + // Strides may be non-compact (e.g. [T*D_padded, D_padded, 1]). + let embeds = try MLMultiArray( + shape: [ + 1, + NSNumber(value: CosyVoice3Constants.prefillLength), + NSNumber(value: CosyVoice3Constants.embedDim), + ], + dataType: .float32) + let embedDim = CosyVoice3Constants.embedDim + let embedsStrides = embeds.strides.map { $0.intValue } + let dst = embeds.dataPointer.bindMemory(to: Float.self, capacity: embeds.count) + let physicalCount = embedsStrides[0] * embeds.shape[0].intValue + dst.initialize(repeating: 0, count: physicalCount) + for t in 0.. [Float] { + let features: [String: Any] = [ + "inputs_embeds": inputsEmbeds, + "cur_len": curLen, + ] + let provider = try MLDictionaryFeatureProvider(dictionary: features) + let output = try models.decode.prediction(from: provider, using: state) + + guard + let logitsArr = output.featureValue(for: "speech_logits")?.multiArrayValue + else { + throw CosyVoice3Error.predictionFailed("decode: missing speech_logits") + } + // logits shape = [1, 1, 6761] fp32; strides may be non-compact. + let count = CosyVoice3Constants.speechVocab + var logits = [Float](repeating: 0, count: count) + let strides = logitsArr.strides.map { $0.intValue } + let vocabStride = strides.last ?? 
1 + let base = logitsArr.dataPointer.bindMemory(to: Float.self, capacity: logitsArr.count) + for i in 0.., + srcLayerBase: Int, + srcHStride: Int, srcMStride: Int, srcDStride: Int, + dst: UnsafeMutablePointer, + dstHStride: Int, dstMStride: Int, dstDStride: Int, + H: Int, M: Int, D: Int + ) { + for h in 0.. (mel: MLMultiArray, numPromptMel: Int) { + let N = CosyVoice3Constants.flowTotalTokens + let nPrompt = promptSpeechIds.count + let nNew = decodedTokens.count + let nTotal = nPrompt + nNew + guard nTotal <= N else { + throw CosyVoice3Error.sequenceTooLong(nTotal) + } + // token_total: [1, 250] int32, zero-padded. Respect strides. + let tokenTotal = try MLMultiArray( + shape: [1, NSNumber(value: N)], + dataType: .int32) + let ttStrides = tokenTotal.strides.map { $0.intValue } + let ttPtr = tokenTotal.dataPointer.bindMemory(to: Int32.self, capacity: tokenTotal.count) + let ttPhysical = ttStrides[0] * tokenTotal.shape[0].intValue + ttPtr.initialize(repeating: 0, count: ttPhysical) + for i in 0.. [Float] { + // fullMel logical shape = [1, 80, 500]. Physical strides may be + // non-compact (e.g. [40960, 512, 1]) — use logical indexing. + // Dtype depends on the Flow variant: the ANE-port Flow emits fp16 to + // keep the graph fp16 end-to-end; the prior cpuAndGPU Flow emits fp32. + // HiFT's `mel` input is always fp32 at the CoreML I/O boundary. + let hiftFrames = CosyVoice3Constants.hiftMaxFrames + let melBins = CosyVoice3Constants.melBins + // fullMel logical shape = [1, 80, totalMelFrames]. Clamp the valid + // window to the remaining frames after `newMelStart` so a slightly + // off `num_prompt_mel` from the Flow model can never cause an + // out-of-bounds read at `srcBase[newMelStart + f]`. + let totalMelFrames = fullMel.shape.count >= 3 ? 
fullMel.shape[2].intValue : hiftFrames + guard newMelStart >= 0 && newMelStart <= totalMelFrames else { + throw CosyVoice3Error.invalidShape( + "runHiFT: newMelStart=\(newMelStart) out of range [0, \(totalMelFrames)]") + } + let availableFrames = max(0, totalMelFrames - newMelStart) + let validFrames = min(newMelFrames, hiftFrames, availableFrames) + + let melInput = try MLMultiArray( + shape: [1, NSNumber(value: melBins), NSNumber(value: hiftFrames)], + dataType: .float32) + // melInput strides may also be non-compact — use logical indexing. + let melInputStrides = melInput.strides.map { $0.intValue } + let dstBase = melInput.dataPointer.bindMemory(to: Float.self, capacity: melInput.count) + // Zero-fill entire physical extent (handles padded strides). + let totalPhysical = melInputStrides[0] * melInput.shape[0].intValue + dstBase.initialize(repeating: 0, count: totalPhysical) + + let srcStrides = fullMel.strides.map { $0.intValue } + // fullMel logical: [1, 80, 500]; copy new slice → melInput [1, 80, 500]. + // Branch on src dtype so the fp16 ANE-port Flow output doesn't get + // reinterpreted as fp32 (would read past end of buffer → SIGSEGV). + switch fullMel.dataType { + case .float16: + let srcBase = fullMel.dataPointer.bindMemory( + to: Float16.self, capacity: fullMel.count) + for b in 0.. [Float] { + let strides = logits.strides.map { $0.intValue } + // shape = [1, T, V]; row (time) stride is strides[1], vocab stride is strides[2]. + let rowStride = strides[1] + let vocabStride = strides[2] + let ptr = logits.dataPointer.bindMemory(to: Float.self, capacity: logits.count) + let base = (tPre - 1) * rowStride + var out = [Float](repeating: 0, count: vocab) + for i in 0..": {"dtype": "...", "shape": [...], "data_offsets": [start, end]}, ... }` +/// - raw tensor payload (referenced by offsets above) +/// +/// Used for Phase 1 fixture + speech embedding table mmap. 
+public final class SafetensorsFile { + + public enum DType: String, Sendable { + case f16 = "F16" + case bf16 = "BF16" + case f32 = "F32" + case f64 = "F64" + case i8 = "I8" + case i16 = "I16" + case i32 = "I32" + case i64 = "I64" + case u8 = "U8" + case u16 = "U16" + case u32 = "U32" + case u64 = "U64" + case bool = "BOOL" + + public var byteSize: Int { + switch self { + case .f16, .bf16, .i16, .u16: return 2 + case .f32, .i32, .u32: return 4 + case .f64, .i64, .u64: return 8 + case .i8, .u8, .bool: return 1 + } + } + } + + public struct TensorInfo: Sendable { + public let dtype: DType + public let shape: [Int] + public let dataStart: Int // absolute offset in file + public let dataEnd: Int + public var byteCount: Int { dataEnd - dataStart } + } + + private let data: Data + private let payloadStart: Int + public let tensors: [String: TensorInfo] + + public init(url: URL) throws { + let data = try Data(contentsOf: url, options: [.alwaysMapped]) + guard data.count >= 8 else { + throw CosyVoice3Error.invalidSafetensors("file smaller than 8 byte header: \(url.path)") + } + self.data = data + + let headerLen: UInt64 = data.withUnsafeBytes { buf in + var v: UInt64 = 0 + memcpy(&v, buf.baseAddress!, 8) + return UInt64(littleEndian: v) + } + let headerEnd = 8 + Int(headerLen) + guard headerEnd <= data.count else { + throw CosyVoice3Error.invalidSafetensors( + "header length \(headerLen) exceeds file size \(data.count)") + } + let headerData = data.subdata(in: 8.. Data { + guard let info = tensors[name] else { + throw CosyVoice3Error.invalidSafetensors("tensor not found: \(name)") + } + return data.subdata(in: info.dataStart.. 
TensorInfo { + guard let info = tensors[name] else { + throw CosyVoice3Error.invalidSafetensors("tensor not found: \(name)") + } + return info + } + + // MARK: - Typed accessors (copying) + + public func asFloat32(_ name: String) throws -> [Float] { + let info = try self.info(name) + let bytes = try rawBytes(name) + switch info.dtype { + case .f32: + return bytes.withUnsafeBytes { buf -> [Float] in + let count = buf.count / 4 + let ptr = buf.bindMemory(to: Float.self) + return Array(UnsafeBufferPointer(start: ptr.baseAddress, count: count)) + } + case .f64: + return bytes.withUnsafeBytes { buf -> [Float] in + let count = buf.count / 8 + let ptr = buf.bindMemory(to: Double.self) + return (0.. [Int32] { + let info = try self.info(name) + let bytes = try rawBytes(name) + switch info.dtype { + case .i32: + return bytes.withUnsafeBytes { buf -> [Int32] in + let count = buf.count / 4 + let ptr = buf.bindMemory(to: Int32.self) + return Array(UnsafeBufferPointer(start: ptr.baseAddress, count: count)) + } + case .i64: + return bytes.withUnsafeBytes { buf -> [Int32] in + let count = buf.count / 8 + let ptr = buf.bindMemory(to: Int64.self) + return (0.. Int { + let values = try asInt32(name) + guard let first = values.first else { + throw CosyVoice3Error.invalidSafetensors("tensor \(name) is empty") + } + return Int(first) + } +} diff --git a/Sources/FluidAudio/TTS/TtsBackend.swift b/Sources/FluidAudio/TTS/TtsBackend.swift index 7a67049b9..aee95bbcf 100644 --- a/Sources/FluidAudio/TTS/TtsBackend.swift +++ b/Sources/FluidAudio/TTS/TtsBackend.swift @@ -6,6 +6,15 @@ public enum TtsBackend: Sendable { case kokoro /// PocketTTS — flow-matching language model, autoregressive streaming synthesis. case pocketTts + /// CosyVoice3 — Mandarin zero-shot voice cloning via Qwen2 LM + Flow CFM + HiFT. + /// + /// > Note: **Experimental / beta.** End-to-end synthesis is currently + /// > slow (RTFx < 1.0 typical on Apple Silicon). 
Cause is partially + /// > in the Flow CFM stage which must run fp32 on CPU/GPU (fp16 + ANE + /// > produces NaNs through fused `layer_norm`) and partially in HiFT + /// > sinegen ops that fall back to CPU. May be a model issue, may be + /// > recoverable via better conversion — treat as preliminary. + case cosyvoice3 /// laishere/kokoro 7-stage CoreML chain (ALBERT → PostAlbert → Alignment → /// Prosody → Noise → Vocoder → Tail) with per-stage ANE/GPU assignment. case kokoroAne diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3/FrontendParityCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3/FrontendParityCommand.swift new file mode 100644 index 000000000..70520a561 --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3/FrontendParityCommand.swift @@ -0,0 +1,146 @@ +import CoreML +import FluidAudio +import Foundation + +/// Phase 2 text-frontend parity harness. +/// +/// Loads `shipping.safetensors` (expected `lm_input_embeds`, `llm_prompt_speech_ids`) +/// plus its JSON sidecar (`prompt_text`, `tts_text`), tokenizes the text via +/// `Qwen2BpeTokenizer`, assembles via `CosyVoice3TextFrontend`, and compares +/// element-wise against the fixture. 
+/// +/// Usage: +/// ``` +/// fluidaudio tts --backend cosyvoice3-frontend-parity \ +/// --tokenizer-dir .../cosyvoice3_dl/CosyVoice-BlankEN \ +/// --embeddings-file .../build/embeddings/embeddings-fp32.safetensors \ +/// --fixture .../build/frontend/shipping.safetensors \ +/// --tok-fixture .../build/frontend/tokenizer_fixture.json +/// ``` +enum CosyVoice3FrontendParityCLI { + + private static let logger = AppLogger(category: "CosyVoice3FrontendParityCLI") + + static func run( + tokenizerDir: String, + embeddingsFile: String, + fixturePath: String, + tokFixturePath: String + ) async { + let tokURL = URL( + fileURLWithPath: (tokenizerDir as NSString).expandingTildeInPath, + isDirectory: true) + let embURL = URL(fileURLWithPath: (embeddingsFile as NSString).expandingTildeInPath) + let fixURL = URL(fileURLWithPath: (fixturePath as NSString).expandingTildeInPath) + let tokFixURL = URL(fileURLWithPath: (tokFixturePath as NSString).expandingTildeInPath) + let sidecarURL = fixURL.deletingPathExtension().appendingPathExtension("json") + + struct TokFix: Decodable { + let special_tokens: [String: Int32] + } + struct Sidecar: Decodable { + let prompt_text: String + let tts_text: String + } + + do { + let tokFix = try JSONDecoder().decode( + TokFix.self, from: try Data(contentsOf: tokFixURL)) + let sidecar = try JSONDecoder().decode( + Sidecar.self, from: try Data(contentsOf: sidecarURL)) + + let tStart = Date() + let tokenizer = try Qwen2BpeTokenizer.load( + directory: tokURL, specialTokens: tokFix.special_tokens) + let embeddings = try CosyVoice3TextEmbeddings(url: embURL) + logger.info( + "Loaded tokenizer + text_embedding table in \(String(format: "%.2fs", Date().timeIntervalSince(tStart)))" + ) + + let fixture = try CosyVoice3FrontendFixture.load(from: fixURL) + logger.info("Fixture: T_pre=\(fixture.tPre) N_prompt_speech=\(fixture.promptSpeechIds.count)") + + let frontend = CosyVoice3TextFrontend(tokenizer: tokenizer, embeddings: embeddings) + let assembled = try 
frontend.assemble( + promptText: sidecar.prompt_text, + ttsText: sidecar.tts_text, + promptSpeechIds: fixture.promptSpeechIds) + + print("") + print(" swift T_pre : \(assembled.tPre)") + print(" fixture T_pre : \(fixture.tPre)") + + guard assembled.tPre == fixture.tPre else { + print("T_pre mismatch — tokenization diverged.") + exit(1) + } + + // Element-wise comparison: fixture is compact fp32, swift array + // may have padded strides. + let dim = CosyVoice3Constants.embedDim + let strides = assembled.lmInputEmbeds.strides.map { $0.intValue } + let ptr = assembled.lmInputEmbeds.dataPointer.bindMemory( + to: Float.self, capacity: assembled.lmInputEmbeds.count) + var maxAbs: Double = 0 + var maxAt: (t: Int, d: Int) = (0, 0) + var sumAbs: Double = 0 + var rowMax = [Double](repeating: 0, count: assembled.tPre) + let n = assembled.tPre * dim + for t in 0.. rowMax[t] { rowMax[t] = a } + if a > maxAbs { + maxAbs = a + maxAt = (t, d) + } + } + } + let mae = sumAbs / Double(n) + print(" MAE : \(String(format: "%.6e", mae))") + print(" max|Δ| : \(String(format: "%.6e", maxAbs)) at (t=\(maxAt.t), d=\(maxAt.d))") + + // Show the top-5 worst rows to see if divergence is concentrated + // at sos (t=0), task_id (t=1+nText), or specific text/speech rows. + let N_speech = fixture.promptSpeechIds.count + let nText = assembled.tPre - 2 - N_speech + print( + " layout : sos@0 text@1..\(nText) task@\(1 + nText) speech@\(2 + nText)..\(assembled.tPre - 1)" + ) + let ranked = rowMax.enumerated().sorted { $0.element > $1.element }.prefix(5) + print(" top rows:") + for (t, m) in ranked { + let slot: String + if t == 0 { + slot = "sos" + } else if t == 1 + nText { + slot = "task_id" + } else if t < 1 + nText { + slot = "text[\(t - 1)]" + } else { + slot = "speech[\(t - 2 - nText)]" + } + print( + " t=\(t) \(slot.padding(toLength: 12, withPad: " ", startingAt: 0)) max|Δ|=\(String(format: "%.6e", m))" + ) + } + + // Compare Swift's reconstructed token ids for sanity. 
+ print(" swift textToken ids (first 10): \(assembled.textTokenIds.prefix(10).map { $0 })") + print(" swift textToken ids (last 5) : \(assembled.textTokenIds.suffix(5).map { $0 })") + + if maxAbs > 1e-4 { + print("parity tolerance exceeded (max|Δ| > 1e-4)") + exit(1) + } + print("frontend parity OK") + } catch { + logger.error("Frontend parity failed: \(error)") + exit(2) + } + } +} diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3/ParityCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3/ParityCommand.swift new file mode 100644 index 000000000..020a10f0c --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3/ParityCommand.swift @@ -0,0 +1,203 @@ +import CoreML +import FluidAudio +import Foundation + +/// Phase 1 parity harness CLI for the CosyVoice3 Swift port. +/// +/// Usage: +/// ``` +/// fluidaudio tts --backend cosyvoice3-parity \ +/// --fixture .../build/frontend/shipping.safetensors \ +/// --models-dir .../coreml/build \ +/// --reference .../build/wavs/e2e_shipping.wav \ +/// --output .../build/swift_e2e.wav \ +/// --seed 42 +/// ``` +@available(macOS 15, iOS 18, *) +enum CosyVoice3ParityCLI { + + private static let logger = AppLogger(category: "CosyVoice3ParityCLI") + + static func run( + fixturePath: String, + modelsDir: String, + referencePath: String?, + outputPath: String, + seed: UInt64, + cpuOnly: Bool, + replayTokens: Bool + ) async { + let fixtureURL = URL(fileURLWithPath: (fixturePath as NSString).expandingTildeInPath) + let modelsURL = URL( + fileURLWithPath: (modelsDir as NSString).expandingTildeInPath, isDirectory: true) + let outputURL = URL(fileURLWithPath: (outputPath as NSString).expandingTildeInPath) + + let computeUnits: MLComputeUnits = cpuOnly ? 
.cpuOnly : .cpuAndNeuralEngine + let manager = CosyVoice3TtsManager(directory: modelsURL, computeUnits: computeUnits) + + do { + let tLoad = Date() + try await manager.initialize() + logger.info( + "Loaded CosyVoice3 models in \(String(format: "%.2f", Date().timeIntervalSince(tLoad)))s" + ) + + let options = CosyVoice3ParityOptions( + maxNewTokens: nil, seed: seed, replayDecodedTokens: replayTokens) + + let tSynth = Date() + let result = try await manager.synthesizeFromFixture( + fixtureURL: fixtureURL, options: options) + let synthElapsed = Date().timeIntervalSince(tSynth) + let audioSec = Double(result.samples.count) / Double(result.sampleRate) + let rtfx = audioSec / synthElapsed + logger.info( + "Synthesized \(result.samples.count) samples (\(String(format: "%.2fs", audioSec))) in \(String(format: "%.2fs", synthElapsed))" + ) + print( + String( + format: + "RTFX audio=%.3fs synth=%.3fs RTFx=%.3fx tokens=%d", + audioSec, synthElapsed, rtfx, result.generatedTokenCount)) + + try writeWAV(samples: result.samples, sampleRate: result.sampleRate, to: outputURL) + logger.info("Wrote WAV: \(outputURL.path)") + + if let refPath = referencePath { + let refURL = URL( + fileURLWithPath: (refPath as NSString).expandingTildeInPath) + let refSamples = try readWAVMono(url: refURL) + let metrics = compareWaveforms( + swift: result.samples, reference: refSamples) + print("") + print( + " reference samples : \(refSamples.count) swift samples : \(result.samples.count)" + ) + print( + " MAE : \(String(format: "%.6f", metrics.mae))") + print( + " max|Δ| : \(String(format: "%.6f", metrics.maxAbsDiff))") + print(" SNR : \(String(format: "%.2f dB", metrics.snrDb))") + if metrics.maxAbsDiff > 1e-3 { + logger.warning( + "Parity tolerance exceeded: max|Δ|=\(metrics.maxAbsDiff) > 1e-3") + exit(1) + } + } + } catch { + logger.error("CosyVoice3 parity harness failed: \(error)") + exit(2) + } + } + + // MARK: - WAV IO (un-normalized) + + private static func writeWAV(samples: [Float], 
sampleRate: Int, to url: URL) throws { + // Clamp to [-1, 1] to avoid int16 overflow; do NOT rescale to max=1. + let numSamples = samples.count + let byteRate = sampleRate * 2 + let dataSize = numSamples * 2 + var header = Data() + header.append("RIFF".data(using: .ascii)!) + header.appendUInt32LE(UInt32(36 + dataSize)) + header.append("WAVE".data(using: .ascii)!) + header.append("fmt ".data(using: .ascii)!) + header.appendUInt32LE(16) + header.appendUInt16LE(1) // PCM + header.appendUInt16LE(1) // mono + header.appendUInt32LE(UInt32(sampleRate)) + header.appendUInt32LE(UInt32(byteRate)) + header.appendUInt16LE(2) // block align + header.appendUInt16LE(16) // bits/sample + header.append("data".data(using: .ascii)!) + header.appendUInt32LE(UInt32(dataSize)) + + var pcm = Data(capacity: dataSize) + for s in samples { + let clipped = max(-1.0, min(1.0, s)) + let i16 = Int16(clipped * 32_767.0) + var le = i16.littleEndian + Swift.withUnsafeBytes(of: &le) { pcm.append(contentsOf: $0) } + } + try (header + pcm).write(to: url) + } + + private static func readWAVMono(url: URL) throws -> [Float] { + let data = try Data(contentsOf: url) + guard data.count > 44 else { + throw CocoaError(.fileReadCorruptFile) + } + // Find 'data' chunk. + var offset = 12 + var dataStart = -1 + var dataSize = 0 + while offset + 8 <= data.count { + let id = data.subdata(in: offset.. 0 else { throw CocoaError(.fileReadCorruptFile) } + let pcm = data.subdata(in: dataStart.. WaveformMetrics { + let n = min(swift.count, reference.count) + guard n > 0 else { return WaveformMetrics(mae: .infinity, maxAbsDiff: .infinity, snrDb: -.infinity) } + var sumAbs: Double = 0 + var maxAbs: Double = 0 + var sumSigSq: Double = 0 + var sumErrSq: Double = 0 + for i in 0.. maxAbs { maxAbs = a } + sumSigSq += Double(reference[i]) * Double(reference[i]) + sumErrSq += diff * diff + } + let snr = sumErrSq > 0 ? 
10 * log10(sumSigSq / sumErrSq) : .infinity + return WaveformMetrics(mae: sumAbs / Double(n), maxAbsDiff: maxAbs, snrDb: snr) + } +} + +// MARK: - Data helpers + +extension Data { + fileprivate mutating func appendUInt32LE(_ v: UInt32) { + var le = v.littleEndian + Swift.withUnsafeBytes(of: &le) { self.append(contentsOf: $0) } + } + fileprivate mutating func appendUInt16LE(_ v: UInt16) { + var le = v.littleEndian + Swift.withUnsafeBytes(of: &le) { self.append(contentsOf: $0) } + } + fileprivate func readUInt32LE() -> UInt32 { + self.withUnsafeBytes { buf -> UInt32 in + var v: UInt32 = 0 + memcpy(&v, buf.baseAddress!, 4) + return UInt32(littleEndian: v) + } + } +} diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3/TextCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3/TextCommand.swift new file mode 100644 index 000000000..cbf64d92d --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3/TextCommand.swift @@ -0,0 +1,136 @@ +import CoreML +import FluidAudio +import Foundation + +/// Phase 2 text-driven synthesis CLI for the CosyVoice3 Swift port. +/// +/// Drives `CosyVoice3TtsManager.synthesize(text:promptAssets:options:)` end +/// to end: tokenizer + frontend + LLM + Flow + HiFT, writing a 24 kHz WAV. 
+/// +/// Usage: +/// ``` +/// fluidaudio tts --backend cosyvoice3-text \ +/// --text "希望你以后能够做的比我还好用" \ +/// --models-dir .../coreml/build \ +/// --tokenizer-dir .../cosyvoice3_dl/CosyVoice-BlankEN \ +/// --embeddings-file .../build/embeddings/embeddings-runtime-fp32.safetensors \ +/// --special-tokens-file .../build/frontend/tokenizer_fixture.json \ +/// --prompt-assets .../build/frontend/shipping.safetensors \ +/// --output .../build/swift_cv3_text.wav \ +/// --seed 42 +/// ``` +@available(macOS 15, iOS 18, *) +enum CosyVoice3TextCLI { + + private static let logger = AppLogger(category: "CosyVoice3TextCLI") + + static func run( + text: String, + modelsDir: String, + tokenizerDir: String, + embeddingsFile: String, + specialTokensFile: String, + promptAssetsPath: String, + outputPath: String, + seed: UInt64, + maxNewTokens: Int?, + cpuOnly: Bool + ) async { + let modelsURL = URL( + fileURLWithPath: (modelsDir as NSString).expandingTildeInPath, isDirectory: true) + let tokURL = URL( + fileURLWithPath: (tokenizerDir as NSString).expandingTildeInPath, isDirectory: true) + let embURL = URL(fileURLWithPath: (embeddingsFile as NSString).expandingTildeInPath) + let specURL = URL(fileURLWithPath: (specialTokensFile as NSString).expandingTildeInPath) + let promptURL = URL(fileURLWithPath: (promptAssetsPath as NSString).expandingTildeInPath) + let outputURL = URL(fileURLWithPath: (outputPath as NSString).expandingTildeInPath) + + let computeUnits: MLComputeUnits = cpuOnly ? 
.cpuOnly : .cpuAndNeuralEngine + let manager = CosyVoice3TtsManager( + modelsDirectory: modelsURL, + tokenizerDirectory: tokURL, + textEmbeddingsFile: embURL, + specialTokensFile: specURL, + computeUnits: computeUnits) + + do { + let tLoad = Date() + try await manager.initialize() + logger.info( + "Loaded CosyVoice3 models + frontend in \(String(format: "%.2f", Date().timeIntervalSince(tLoad)))s" + ) + + let tPrompt = Date() + let promptAssets = try CosyVoice3PromptAssets.load(from: promptURL) + logger.info( + "Loaded prompt assets in \(String(format: "%.2f", Date().timeIntervalSince(tPrompt)))s — N_speech=\(promptAssets.promptSpeechIds.count), mel_frames=\(promptAssets.promptMelFrames)" + ) + + let options = CosyVoice3SynthesisOptions( + maxNewTokens: maxNewTokens, seed: seed) + + let tSynth = Date() + let result = try await manager.synthesize( + text: text, promptAssets: promptAssets, options: options) + let synthElapsed = Date().timeIntervalSince(tSynth) + let audioSecs = Double(result.samples.count) / Double(result.sampleRate) + let rtfx = synthElapsed > 0 ? audioSecs / synthElapsed : 0 + logger.info( + "Synthesized \(result.samples.count) samples (\(String(format: "%.2fs", audioSecs))) in \(String(format: "%.2fs", synthElapsed)) — RTFx \(String(format: "%.2fx", rtfx))" + ) + logger.info("Generated \(result.generatedTokenCount) speech tokens") + + try FileManager.default.createDirectory( + at: outputURL.deletingLastPathComponent(), + withIntermediateDirectories: true) + try writeWAV(samples: result.samples, sampleRate: result.sampleRate, to: outputURL) + logger.info("Wrote WAV: \(outputURL.path)") + } catch { + logger.error("CosyVoice3 text synthesis failed: \(error)") + exit(2) + } + } + + private static func writeWAV(samples: [Float], sampleRate: Int, to url: URL) throws { + let numSamples = samples.count + let byteRate = sampleRate * 2 + let dataSize = numSamples * 2 + var header = Data() + header.append("RIFF".data(using: .ascii)!) 
+ header.appendUInt32LE(UInt32(36 + dataSize)) + header.append("WAVE".data(using: .ascii)!) + header.append("fmt ".data(using: .ascii)!) + header.appendUInt32LE(16) + header.appendUInt16LE(1) // PCM + header.appendUInt16LE(1) // mono + header.appendUInt32LE(UInt32(sampleRate)) + header.appendUInt32LE(UInt32(byteRate)) + header.appendUInt16LE(2) // block align + header.appendUInt16LE(16) // bits/sample + header.append("data".data(using: .ascii)!) + header.appendUInt32LE(UInt32(dataSize)) + + var pcm = Data(capacity: dataSize) + for s in samples { + let clipped = max(-1.0, min(1.0, s)) + let i16 = Int16(clipped * 32_767.0) + var le = i16.littleEndian + Swift.withUnsafeBytes(of: &le) { pcm.append(contentsOf: $0) } + } + try (header + pcm).write(to: url) + } +} + +// MARK: - Data helpers (file-scoped duplicate of the helpers in +// CosyVoice3ParityCommand.swift; kept here so this file compiles on its own). + +extension Data { + fileprivate mutating func appendUInt32LE(_ v: UInt32) { + var le = v.littleEndian + Swift.withUnsafeBytes(of: &le) { self.append(contentsOf: $0) } + } + fileprivate mutating func appendUInt16LE(_ v: UInt16) { + var le = v.littleEndian + Swift.withUnsafeBytes(of: &le) { self.append(contentsOf: $0) } + } +} diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3/TokenizerParityCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3/TokenizerParityCommand.swift new file mode 100644 index 000000000..d5550c60c --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3/TokenizerParityCommand.swift @@ -0,0 +1,70 @@ +import FluidAudio +import Foundation + +/// Phase 2 tokenizer parity harness. +/// +/// Loads the Python-exported tokenizer_fixture.json (special token map + test +/// cases) and asserts the Swift Qwen2BpeTokenizer produces the same ID stream +/// for every case. 
+/// +/// Usage: +/// ``` +/// fluidaudio tts --backend cosyvoice3-tokenizer-parity \ +/// --tokenizer-dir .../cosyvoice3_dl/CosyVoice-BlankEN \ +/// --fixture .../build/frontend/tokenizer_fixture.json +/// ``` +enum CosyVoice3TokenizerParityCLI { + + private static let logger = AppLogger(category: "CosyVoice3TokenizerParityCLI") + + static func run(tokenizerDir: String, fixturePath: String) async { + let tokURL = URL( + fileURLWithPath: (tokenizerDir as NSString).expandingTildeInPath, + isDirectory: true) + let fixURL = URL(fileURLWithPath: (fixturePath as NSString).expandingTildeInPath) + + struct Fixture: Decodable { + let special_tokens: [String: Int32] + let cases: [Case] + struct Case: Decodable { + let text: String + let ids: [Int32] + } + } + + do { + let data = try Data(contentsOf: fixURL) + let fixture = try JSONDecoder().decode(Fixture.self, from: data) + let tokenizer = try Qwen2BpeTokenizer.load( + directory: tokURL, specialTokens: fixture.special_tokens) + + var passed = 0 + var failed = 0 + var firstFail: (String, [Int32], [Int32])? = nil + for tc in fixture.cases { + let got = tokenizer.encode(tc.text) + if got == tc.ids { + passed += 1 + } else { + failed += 1 + if firstFail == nil { + firstFail = (tc.text, tc.ids, got) + } + } + } + + print("cases: \(passed + failed) passed: \(passed) failed: \(failed)") + if let (text, expected, got) = firstFail { + print("") + print("first mismatch:") + print(" text : \(text.debugDescription)") + print(" expected : \(expected)") + print(" got : \(got)") + } + if failed > 0 { exit(1) } + } catch { + logger.error("Tokenizer parity failed: \(error)") + exit(2) + } + } +} diff --git a/Sources/FluidAudioCLI/Commands/TTSCommand.swift b/Sources/FluidAudioCLI/Commands/TTSCommand.swift index 0b1c781d0..132fcc8d0 100644 --- a/Sources/FluidAudioCLI/Commands/TTSCommand.swift +++ b/Sources/FluidAudioCLI/Commands/TTSCommand.swift @@ -137,6 +137,25 @@ public struct TTS { var cloneVoicePath: String? 
= nil var voiceFilePath: String? = nil var saveVoicePath: String? = nil + // CosyVoice3 Phase 1 parity harness args. + var cv3FixturePath: String? = nil + var cv3ModelsDir: String? = nil + var cv3ReferencePath: String? = nil + var cv3Seed: UInt64 = 42 + var cv3CpuOnly: Bool = false + var cv3ReplayTokens: Bool = true + // CosyVoice3 Phase 2 tokenizer parity args. + var cv3TokenizerDir: String? = nil + var cv3TokenizerParityMode: Bool = false + // CosyVoice3 Phase 2 frontend parity args. + var cv3FrontendParityMode: Bool = false + var cv3EmbeddingsFile: String? = nil + var cv3TokFixturePath: String? = nil + // CosyVoice3 Phase 2 text-driven synthesis args. + var cv3TextMode: Bool = false + var cv3SpecialTokensFile: String? = nil + var cv3PromptAssetsPath: String? = nil + var cv3MaxNewTokens: Int? = nil var pocketLanguage: PocketTtsLanguage = .english // PocketTTS deterministic-seed mode (uses session API for fixed RNG). var pocketSeed: UInt64? = nil @@ -194,6 +213,22 @@ public struct TTS { backend = .kokoro case "pocket", "pockettts": backend = .pocketTts + case "cosyvoice3", "cv3", "cosyvoice3-text", "cv3-text": + // Production text-driven synthesis is the default + // user-facing path. The explicit `*-text` aliases + // are kept for backward compatibility with earlier + // documentation. + backend = .cosyvoice3 + cv3TextMode = true + case "cosyvoice3-parity", "cv3-parity": + // Phase 1 fixture parity harness — opt-in dev mode. 
+ backend = .cosyvoice3 + case "cosyvoice3-tokenizer-parity", "cv3-tokenizer": + backend = .cosyvoice3 + cv3TokenizerParityMode = true + case "cosyvoice3-frontend-parity", "cv3-frontend": + backend = .cosyvoice3 + cv3FrontendParityMode = true case "kokoro-ane", "kokoroane", "lai": backend = .kokoroAne default: @@ -201,6 +236,65 @@ public struct TTS { } i += 1 } + case "--fixture": + if i + 1 < arguments.count { + cv3FixturePath = arguments[i + 1] + i += 1 + } + case "--models-dir": + if i + 1 < arguments.count { + cv3ModelsDir = arguments[i + 1] + i += 1 + } + case "--reference": + if i + 1 < arguments.count { + cv3ReferencePath = arguments[i + 1] + i += 1 + } + case "--seed": + if i + 1 < arguments.count { + cv3Seed = UInt64(arguments[i + 1]) ?? 42 + i += 1 + } + case "--cpu-only": + cv3CpuOnly = true + case "--no-replay": + cv3ReplayTokens = false + case "--tokenizer-dir": + if i + 1 < arguments.count { + cv3TokenizerDir = arguments[i + 1] + i += 1 + } + case "--embeddings-file": + if i + 1 < arguments.count { + cv3EmbeddingsFile = arguments[i + 1] + i += 1 + } + case "--tok-fixture": + if i + 1 < arguments.count { + cv3TokFixturePath = arguments[i + 1] + i += 1 + } + case "--special-tokens-file": + if i + 1 < arguments.count { + cv3SpecialTokensFile = arguments[i + 1] + i += 1 + } + case "--prompt-assets": + if i + 1 < arguments.count { + cv3PromptAssetsPath = arguments[i + 1] + i += 1 + } + case "--text": + if i + 1 < arguments.count { + text = arguments[i + 1] + i += 1 + } + case "--max-new-tokens": + if i + 1 < arguments.count { + cv3MaxNewTokens = Int(arguments[i + 1]) + i += 1 + } case "--auto-download": // No-op: downloads are always ensured by the CLI. Accepted // for backward compatibility with documented examples. @@ -267,6 +361,101 @@ public struct TTS { return } + if backend == .cosyvoice3 { + logger.warning( + "CosyVoice3 backend is experimental / beta — synthesis is " + + "slow (RTFx < 1.0 typical). 
Performance may improve in " + + "later releases.") + } + + if backend == .cosyvoice3 && cv3TokenizerParityMode { + guard let tokDir = cv3TokenizerDir, let fixture = cv3FixturePath else { + logger.error( + "cosyvoice3-tokenizer-parity requires --tokenizer-dir <.../CosyVoice-BlankEN> and --fixture " + ) + return + } + await CosyVoice3TokenizerParityCLI.run( + tokenizerDir: tokDir, fixturePath: fixture) + return + } + + if backend == .cosyvoice3 && cv3FrontendParityMode { + guard + let tokDir = cv3TokenizerDir, + let embFile = cv3EmbeddingsFile, + let fixture = cv3FixturePath, + let tokFix = cv3TokFixturePath + else { + logger.error( + "cosyvoice3-frontend-parity requires --tokenizer-dir, --embeddings-file, --fixture , --tok-fixture" + ) + return + } + await CosyVoice3FrontendParityCLI.run( + tokenizerDir: tokDir, + embeddingsFile: embFile, + fixturePath: fixture, + tokFixturePath: tokFix) + return + } + + if backend == .cosyvoice3 && cv3TextMode { + guard + let inputText = text, + let modelsDir = cv3ModelsDir, + let tokDir = cv3TokenizerDir, + let embFile = cv3EmbeddingsFile, + let specFile = cv3SpecialTokensFile, + let promptAssets = cv3PromptAssetsPath + else { + logger.error( + "cosyvoice3-text requires --text , --models-dir, --tokenizer-dir, --embeddings-file, --special-tokens-file, --prompt-assets" + ) + return + } + if #available(macOS 15, iOS 18, *) { + await CosyVoice3TextCLI.run( + text: inputText, + modelsDir: modelsDir, + tokenizerDir: tokDir, + embeddingsFile: embFile, + specialTokensFile: specFile, + promptAssetsPath: promptAssets, + outputPath: output, + seed: cv3Seed, + maxNewTokens: cv3MaxNewTokens, + cpuOnly: cv3CpuOnly) + } else { + logger.error( + "CosyVoice3 requires macOS 15 / iOS 18 (uses CoreML MLState).") + } + return + } + + if backend == .cosyvoice3 { + guard let fixture = cv3FixturePath, let modelsDir = cv3ModelsDir else { + logger.error( + "cosyvoice3-parity requires --fixture and --models-dir " + ) + return + } + if #available(macOS 15, 
iOS 18, *) { + await CosyVoice3ParityCLI.run( + fixturePath: fixture, + modelsDir: modelsDir, + referencePath: cv3ReferencePath, + outputPath: output, + seed: cv3Seed, + cpuOnly: cv3CpuOnly, + replayTokens: cv3ReplayTokens) + } else { + logger.error( + "CosyVoice3 requires macOS 15 / iOS 18 (uses CoreML MLState).") + } + return + } + guard let text = text else { printUsage() return @@ -863,7 +1052,14 @@ public struct TTS { Options: --output, -o Output WAV path (default: output.wav) --voice, -v Voice name (default: af_heart for Kokoro, alba for PocketTTS) - --backend TTS backend: kokoro (default), pocket, or kokoro-ane + --backend TTS backend: kokoro (default), pocket, kokoro-ane, + or cosyvoice3 [BETA — slow, RTFx < 1.0] + CosyVoice3 dev sub-backends: + cosyvoice3-parity Phase 1 fixture parity harness + cosyvoice3-frontend-parity lm_input_embeds parity vs Python + cosyvoice3-tokenizer-parity Qwen2 BPE round-trip + (Production cosyvoice3 backend auto-downloads + assets from HuggingFace on first synthesis.) 
--lexicon, -l Custom pronunciation lexicon file (word=phonemes format, Kokoro only) --benchmark Run a predefined benchmarking suite with multiple sentences --variant Force Kokoro 5s or 15s model (values: 5s,15s) diff --git a/Tests/FluidAudioTests/TTS/CosyVoice3ChineseNormalizerTests.swift b/Tests/FluidAudioTests/TTS/CosyVoice3ChineseNormalizerTests.swift new file mode 100644 index 000000000..e94184c40 --- /dev/null +++ b/Tests/FluidAudioTests/TTS/CosyVoice3ChineseNormalizerTests.swift @@ -0,0 +1,81 @@ +import XCTest + +@testable import FluidAudio + +final class CosyVoice3ChineseNormalizerTests: XCTestCase { + + func testContainsChinese() { + XCTAssertTrue(CosyVoice3ChineseNormalizer.containsChinese("你好")) + XCTAssertTrue(CosyVoice3ChineseNormalizer.containsChinese("hello 世界")) + XCTAssertFalse(CosyVoice3ChineseNormalizer.containsChinese("hello world")) + XCTAssertFalse(CosyVoice3ChineseNormalizer.containsChinese("")) + } + + func testReplaceBlankDropsCjkInteriorSpaces() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceBlank("中 国"), "中国") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceBlank("hello world"), "hello world") + // Mixed: space between ASCII and CJK is dropped (one side non-ASCII). 
+ XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceBlank("hi 你好"), "hi你好") + } + + func testReplaceCornerMark() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceCornerMark("面积 5m²"), + "面积 5m平方") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceCornerMark("体积 2m³"), + "体积 2m立方") + } + + func testRemoveBracket() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.removeBracket("你好(世界)"), + "你好世界") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.removeBracket("【注意】请勿触摸"), + "注意请勿触摸") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.removeBracket("a——b"), + "a b") + } + + func testSpellOutDigitsZh() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.spellOutDigitsZh("2024年"), + "二零二四年") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.spellOutDigitsZh("abc"), + "abc") + } + + func testStripTrailingCommaLikes() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.stripTrailingCommaLikes("你好,,"), + "你好。") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.stripTrailingCommaLikes("你好、,,"), + "你好。") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.stripTrailingCommaLikes("你好。"), + "你好。") + } + + func testNormalizeEndToEnd() { + let input = "希望你以后能够做的比我还好用. 2024年,," + let out = CosyVoice3ChineseNormalizer.normalize(input) + // Period becomes 。, trailing commas collapse to a single 。, digits + // spelled out per-char, internal spaces between CJK stripped. 
+ XCTAssertEqual(out, "希望你以后能够做的比我还好用。二零二四年。") + } + + func testIsOnlyPunctuation() { + XCTAssertTrue(CosyVoice3ChineseNormalizer.isOnlyPunctuation("")) + XCTAssertTrue(CosyVoice3ChineseNormalizer.isOnlyPunctuation("。,!?")) + XCTAssertTrue(CosyVoice3ChineseNormalizer.isOnlyPunctuation(".,!?")) + XCTAssertFalse(CosyVoice3ChineseNormalizer.isOnlyPunctuation("你好")) + XCTAssertFalse(CosyVoice3ChineseNormalizer.isOnlyPunctuation("abc")) + } +} diff --git a/Tests/FluidAudioTests/TTS/CosyVoice3PromptMelTests.swift b/Tests/FluidAudioTests/TTS/CosyVoice3PromptMelTests.swift new file mode 100644 index 000000000..e904d64ff --- /dev/null +++ b/Tests/FluidAudioTests/TTS/CosyVoice3PromptMelTests.swift @@ -0,0 +1,101 @@ +import XCTest + +@testable import FluidAudio + +final class CosyVoice3PromptMelTests: XCTestCase { + + func testFrameCountMatchesMatchaFormula() throws { + // matcha/cosyvoice3: pad by 720 each side (reflect), center=False. + // For 48000 samples: padded = 48000 + 1440 = 49440. + // frames = (49440 - 1920) / 480 + 1 = 99 + 1 = 100. + let mel = CosyVoice3PromptMel() + let audio = [Float](repeating: 0.01, count: 48_000) + let out = try mel.compute(audio: audio) + XCTAssertEqual(out.frames, 100) + XCTAssertEqual(out.mel.count, 100 * 80) + } + + func testZeroAudioClampsToLogFloor() throws { + // With audio of all zeros, mel values are 0 → clamped to 1e-5 → log = -11.5129... + let mel = CosyVoice3PromptMel() + let audio = [Float](repeating: 0, count: 24_000) + let out = try mel.compute(audio: audio) + let expected: Float = log(Float(1e-5)) + for v in out.mel { + XCTAssertEqual(v, expected, accuracy: 1e-5) + } + } + + func testSinePeakInLowMelBins() throws { + // 200 Hz sine at 24 kHz should light up one of the lowest mel bins + // (fmin=0, the first few triangles cover 0..~200 Hz). + let mel = CosyVoice3PromptMel() + let sr: Float = 24_000 + let f: Float = 200 + let n = 12_000 // 0.5 s + var audio = [Float](repeating: 0, count: n) + for i in 0..0. 
+ let numFreqBins = 1920 / 2 + 1 + for m in 0..<80 { + var sum: Float = 0 + for f in 0..