diff --git a/Documentation/README.md b/Documentation/README.md index 540e33a06..135c833e1 100644 --- a/Documentation/README.md +++ b/Documentation/README.md @@ -40,6 +40,7 @@ - [Kokoro](TTS/Kokoro.md) - [Kokoro ANE (7-stage)](TTS/KokoroAne.md) - [PocketTTS](TTS/PocketTTS.md) +- [CosyVoice3 (Mandarin, beta)](TTS/CosyVoice3.md) - [SSML](TTS/SSML.md) - [Voice Quality Comparison](TTS/voice-quality.md) diff --git a/Documentation/TTS/CosyVoice3.md b/Documentation/TTS/CosyVoice3.md new file mode 100644 index 000000000..7308e2084 --- /dev/null +++ b/Documentation/TTS/CosyVoice3.md @@ -0,0 +1,246 @@ +# CosyVoice3 Swift Inference + +Mandarin zero-shot voice cloning via Qwen2 LM + CFM Flow + HiFT vocoder, +running on CoreML. + +> ⚠️ **Beta / experimental.** End-to-end synthesis is currently slow on +> Apple Silicon — RTFx < 1.0 typical, several seconds of latency for +> short Mandarin utterances. The slowdown is partly the Flow CFM stage +> (fp32, CPU-or-GPU only because fp16 + ANE produces NaNs through the +> fused `layer_norm` — CoreMLTools limitation, tracked upstream) and +> partly HiFT sinegen / windowing ops that fall back to CPU. May be a +> model issue, may be recoverable through better conversion. Treat +> performance numbers as preliminary; the Swift API, model layout, and +> prompt-asset format may change in subsequent releases without +> deprecation aliases. 
+ +## Files + +| File | Role | +|------|------| +| `CosyVoice3TtsManager.swift` | Public actor — `initialize()`, `synthesize()`, `synthesizeFromFixture()`, `loadVoice()`, `downloadAndCreate()` | +| `CosyVoice3Models.swift` | The 4 CoreML model handles (prefill, decode, flow, hift) | +| `Assets/CosyVoice3ModelStore.swift` | Loads + compiles the four mlpackages, probes flat / nested layouts | +| `Assets/CosyVoice3ResourceDownloader.swift` | HuggingFace pull for `FluidInference/CosyVoice3-0.5B-coreml` | +| `Pipeline/Synthesize/CosyVoice3Synthesizer.swift` | Actor — prefill → decode loop → Flow → HiFT | +| `Pipeline/Synthesize/CosyVoice3RasSampler.swift` | top-p / top-k / repetition mask, seed-tokens bypass | +| `Pipeline/Synthesize/CosyVoice3SpeechEmbeddings.swift` | mmap of 6761×896 fp16 speech-embedding table (12 MB) | +| `Pipeline/Synthesize/CosyVoice3Types.swift` | `CosyVoice3SynthesisOptions`, `CosyVoice3SynthesisResult`, `CosyVoice3ParityOptions` | +| `Pipeline/Preprocess/CosyVoice3TextFrontend.swift` | Special-token splitting + `lm_input_embeds` assembly | +| `Pipeline/Preprocess/Qwen2BpeTokenizer.swift` | tiktoken-compatible byte-level BPE, 151 936 vocab (incl. fileprivate `ByteEncoder` 188-symbol byte→unicode shim) | +| `Pipeline/Preprocess/CosyVoice3TextEmbeddings.swift` | mmap of 151 936×896 fp16 text embedding table | +| `Pipeline/Preprocess/CosyVoice3ChineseNormalizer.swift` | Minimal regex-free port of `frontend_utils.py` | +| `Pipeline/Preprocess/CosyVoice3PromptMel.swift` | 24 kHz 80-bin log-mel matching `matcha audio.py` | +| `Pipeline/Preprocess/CosyVoice3PromptAssets.swift` | Voice-prompt bundle DTO (precomputed IDs / mel / spk-emb) | +| `Pipeline/Preprocess/CosyVoice3FrontendFixture.swift` | Phase 1 parity-fixture loader | +| `CosyVoice3Constants.swift` | Stop-token range, hidden dim, frame counts, etc. 
| +| `Shared/SafetensorsReader.swift` | ~170 LoC pure-Swift mmap + fp16/fp32/i32 accessors | + +## Call Flow + +``` +CosyVoice3TtsManager.synthesize(text:promptAssets:options:) + | + v +CosyVoice3TextFrontend.assembleLmInput(text:promptAssets:) + | + |-- normalizeText() split on <|endofprompt|>, replace_blank, etc. + |-- Qwen2BpeTokenizer.encode byte-level BPE → token IDs + |-- text_embedding lookup 151 936×896 fp16 mmap → [N_text, 896] + |-- speech_embedding lookup 6761×896 fp16 mmap → [N_speech, 896] + |-- concat([SOS, text, TASK, prompt_speech_ids]) → lm_input_embeds + | + v +CosyVoice3Synthesizer.synthesize(lm_input_embeds:promptAssets:) + | + |-- runPrefill() Qwen2 24L prefill, T <= 256 + | |-- in: lm_input_embeds, attn_mask + | |-- out: logits[1,T,6761], kv_cache[24,1,2,768,64] fp16 + | + |-- DECODE LOOP (until stop-range hit or maxNewTokens): + | | + | |-- runDecodeStep() takes prev token + cached KV + | | |-- in: token_id, kv_cache (in-place state) + | | |-- out: logits[1,1,6761] + | | + | |-- RasSampler.sample() top-p/top-k/repetition + seed-tokens bypass + | |-- if topId in stopRange (6561...6760): break + | |-- decoded.append(topId) + | + |-- runFlow() CFM 10-step ODE, conditional on prompt mel + spk_emb + | |-- in: decoded[N], prompt_mel, spk_embedding + | |-- out: full_mel[1, 80, M] fp32 + | + |-- runHiFT() vocoder, chunk-packed (T<=500 frames) + | |-- in: full_mel slice from newMelStart..newMelStart+newMelFrames + | |-- out: audio samples [N*hop_len] @ 24 kHz + | + |-- concatenate chunks → CosyVoice3SynthesisResult.samples +``` + +## Public API + +```swift +import FluidAudio + +// One-shot creation that downloads everything to ~/.cache/fluidaudio/ +let manager = try await CosyVoice3TtsManager.downloadAndCreate( + computeUnits: .cpuAndNeuralEngine +) +try await manager.initialize() + +// Load a voice prompt bundle (precomputed by mobius/.../bootstrap_aishell3_voices.py) +let voice = try CosyVoice3PromptAssets.load(from: voiceBundleURL) + +let result = 
try await manager.synthesize( + text: "希望你以后能够做的比我还好用", + promptAssets: voice, + options: CosyVoice3SynthesisOptions(maxNewTokens: 1024, seed: 42) +) +// result.samples : [Float] (mono fp32, 24 kHz) +// result.sampleRate : 24000 +``` + +`CosyVoice3SynthesisOptions`: + +| Field | Default | Notes | +|---|---|---| +| `maxNewTokens` | `nil` (cap = 1024) | Hard ceiling on speech-token count | +| `seed` | 42 | Drives the RAS sampler RNG; reproducible runs | + +`CosyVoice3SynthesisResult`: + +| Field | Type | Notes | +|---|---|---| +| `samples` | `[Float]` | mono, fp32, range ~[-1.0, 1.0] | +| `sampleRate` | `Int` | always 24000 | +| `generatedTokenCount` | `Int` | tokens before EOS | +| `decodedTokens` | `[Int32]` | full speech token sequence (debug) | + +## Key State + +### KV cache (`kv_cache[24, 1, 2, 768, 64]` fp16) +- 24 transformer layers × batch × KV-heads (2) × sequence (768) × head-dim (64), packed into one `MLState`-style + `MLMultiArray` that the prefill produces and the decode loop both reads + and overwrites in-place. +- Reset per `synthesize()` call. + +### Prompt assets (`CosyVoice3PromptAssets`) +- `promptText` — Mandarin reference text (must contain `<|endofprompt|>`). +- `promptSpeechIds: [Int32]` — pre-tokenized speech IDs from the + SpeechTokenizerV3 mlpackage (computed offline, reused across calls). +- `promptMel: [Float]`, `promptMelFrames` — 80-bin log-mel of the reference + audio at 24 kHz. +- `spkEmbedding: [Float]` — 192-dim speaker embedding from CAMPPlus. + +Bundles are produced by +`mobius/models/tts/cosyvoice3/coreml/verify/bootstrap_aishell3_voices.py` +or `extract_voice_prompt.py` for arbitrary speakers. + +## CoreML details + +- **Compute units:** caller-chosen units apply to the + prefill + HiFT models only. LLM-Decode (stateful) and Flow are both + forced to `.cpuAndGPU` regardless — ANE refuses to compile the stateful + decode graph, and the Flow ANE port was reverted as numerically broken + (see `CosyVoice3ModelStore.loadIfNeeded()`). +- All four mlpackages are compiled `.mlpackage → .mlmodelc` on first load and + cached on disk under `~/.cache/fluidaudio/Models/cosyvoice3/`. 
+- `CosyVoice3ModelStore` is an actor; `CosyVoice3Synthesizer` is an + actor. `CosyVoice3Models` (the four-tuple) conforms to `Sendable` via + `@preconcurrency import CoreML`, matching the existing `TtsModels` + pattern. + +## Stop-token handling + +- Speech vocab is `0..<6761`; tokens `6561..<6761` are the EOS range. +- `CosyVoice3Constants.stopRange = 6561...6760` (closed range). The decode + loop breaks when `topId` falls in that range. +- If the prefill emits a stop token at step 0 the synthesizer throws + `CosyVoice3Error.predictionFailed` instead of falling through — + feeding the stop-token embedding into the decode loop would + accumulate semantically meaningless tokens. + +## CLI + +``` +fluidaudio tts --backend cosyvoice3 \ + --text "希望你以后能够做的比我还好用" \ + --models-dir ~/.cache/fluidaudio/Models/cosyvoice3 \ + --tokenizer-dir … --embeddings-file … --special-tokens-file … \ + --prompt-assets path/to/voice.safetensors \ + --output out.wav +``` + +`--backend cosyvoice3` (and the `cv3` alias) runs the production +text-driven synthesis path. `--backend` help text flags it as +`[BETA — slow, RTFx < 1.0]` and the dispatcher emits a runtime +`logger.warning` so the beta status shows up without reading docs. + +### Dev sub-backends (for debugging the Python ↔ Swift contract) + +These are the harnesses future contributors use to bisect divergence +between the Swift port and the upstream Python reference. 
Each isolates +a distinct stage of the pipeline: + +``` +fluidaudio tts --backend cosyvoice3-tokenizer-parity \ + --tokenizer-dir … --fixture tokenizer_fixture.json +# Qwen2 BPE encode/decode parity vs tiktoken reference + +fluidaudio tts --backend cosyvoice3-frontend-parity \ + --tokenizer-dir … --embeddings-file … \ + --fixture shipping.safetensors --tok-fixture … +# lm_input_embeds assembly parity (text+speech embed lookup, SOS/TASK splice) + +fluidaudio tts --backend cosyvoice3-parity \ + --fixture shipping.safetensors --models-dir build/ +# Phase 1 fixture parity (Synthesizer: prefill → decode → Flow → HiFT) +``` + +Recommended bisection order when end-to-end output diverges from +Python: tokenizer-parity → frontend-parity → fixture parity. + +The production backend auto-downloads its CoreML mlpackages, tokenizer, +embeddings, and default voice from HuggingFace on first synthesis (cached +under `~/.cache/fluidaudio/Models/cosyvoice3/`) — there is no separate +download CLI mode, matching how Kokoro and PocketTTS work. + +## Models + +| Component | mlpackage | Precision | Notes | +|---|---|---|---| +| Qwen2 LLM — Prefill (T=256, M=768) | `LLM-Prefill-T256-M768-fp16` | fp16 | KV-cache out | +| Qwen2 LLM — Decode (M=768) | `LLM-Decode-M768-fp16-stateful` | fp16 | stateful KV cache, in-place | +| CFM Flow (N=250 → M=500 mel) | `Flow-N250-fp16` | fp16 | CPU/GPU only | +| HiFT vocoder (T=500 → 10 s @ 24 kHz) | `HiFT-T500-fp16` | fp16 | sinegen on CPU | +| Qwen2 + speech embedding tables | `speech_embedding-fp16.safetensors` + `embeddings-runtime-fp32.safetensors` | fp16 / fp32 | mmap'd at runtime | + +All shipped at +[`FluidInference/CosyVoice3-0.5B-coreml`](https://huggingface.co/FluidInference/CosyVoice3-0.5B-coreml). +The conversion pipeline that produced them lives in +[FluidInference/mobius#42](https://github.com/FluidInference/mobius/pull/42). + +## Non-goals / known limits + +- **No on-device prompt-asset preparation.** SpeechTokenizerV3 and + CAMPPlus have CoreML mlpackages but the surrounding DSP isn't ported + to Swift yet. 
Callers either use the bundled + `cosyvoice3-default-zh` voice or run the Python `extract_voice_prompt.py` + offline. +- **No production-grade Mandarin TN.** `CosyVoice3ChineseNormalizer` + only mirrors the simple cleanups in upstream `frontend_utils.py`. + For year / currency / decimal / unit normalization, run + `wetext.ZhNormalizer` server-side and pass `prenormalized: true` on + `synthesize()`. +- **Flow stays fp32 (~1.2 GB).** Until CoreMLTools pins fused-`layer_norm` + fp16 the model NaNs on ANE. Loaded once, kept resident. +- **Streaming API not yet exposed.** The synthesizer runs Phase 1 + (prefill) and Phase 2 (Flow + HiFT) sequentially against the full + token sequence. Token streaming is internal but not surfaced through + an `AsyncStream`. + +## License + +- **CosyVoice3 model weights:** Apache 2.0, inherited from + [FunAudioLLM/CosyVoice](https://github.com/FunAudioLLM/CosyVoice) + upstream (`speech_300m`, `Fun-CosyVoice3-0.5B-2512`). +- **FluidAudio SDK:** Apache 2.0. diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 68d058731..437b0422f 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -29,6 +29,7 @@ public enum Repo: String, CaseIterable, Sendable { case qwen3AsrInt8 = "FluidInference/qwen3-asr-0.6b-coreml/int8" case multilingualG2p = "FluidInference/charsiu-g2p-byt5-coreml" case parakeetTdtCtc110m = "FluidInference/parakeet-tdt-ctc-110m-coreml" + case cosyvoice3 = "FluidInference/CosyVoice3-0.5B-coreml" case cohereTranscribeCoreml = "FluidInference/cohere-transcribe-03-2026-coreml/q8" /// Repository slug (without owner) @@ -82,6 +83,8 @@ public enum Repo: String, CaseIterable, Sendable { return "charsiu-g2p-byt5-coreml" case .parakeetTdtCtc110m: return "parakeet-tdt-ctc-110m-coreml" + case .cosyvoice3: + return "CosyVoice3-0.5B-coreml" case .cohereTranscribeCoreml: return "cohere-transcribe-03-2026-coreml/q8" } @@ -178,6 +181,8 @@ public enum Repo: String, 
CaseIterable, Sendable { return "parakeet-ja" case .parakeetTdtCtc110m: return "parakeet-tdt-ctc-110m" + case .cosyvoice3: + return "cosyvoice3" case .cohereTranscribeCoreml: return "cohere-transcribe/q8" default: @@ -596,6 +601,47 @@ public enum ModelNames { ] } + /// CosyVoice3 (Mandarin) model names. Files live on HuggingFace at + /// `FluidInference/CosyVoice3-0.5B-coreml` (see `Repo.cosyvoice3`). The + /// expected local directory layout is encoded in `CosyVoice3Constants.Files`. + public enum CosyVoice3 { + public static let llmPrefill = "LLM-Prefill-T256-M768-fp16" + public static let llmDecode = "LLM-Decode-M768-fp16-stateful" + public static let flow = "Flow-N250-fp16" + public static let hift = "HiFT-T500-fp16" + public static let speechEmbeddings = "speech_embedding-fp16.safetensors" + + public static let llmPrefillFile = llmPrefill + ".mlmodelc" + public static let llmDecodeFile = llmDecode + ".mlmodelc" + public static let flowFile = flow + ".mlmodelc" + public static let hiftFile = hift + ".mlmodelc" + + public static let requiredModels: Set = [ + llmPrefillFile, + llmDecodeFile, + flowFile, + hiftFile, + ] + + /// Sidecar assets living under subdirectories of the HF repo (not part + /// of `requiredModels`; pulled via `downloadSubdirectory` / direct file + /// fetch by `CosyVoice3ResourceDownloader`). 
+ public enum Sidecar { + public static let embeddingsDir = "embeddings" + public static let tokenizerDir = "tokenizer" + public static let voicesDir = "voices" + + public static let speechEmbeddings = "speech_embedding-fp16.safetensors" + public static let runtimeEmbeddings = "embeddings-runtime-fp32.safetensors" + public static let specialTokens = "special_tokens.json" + public static let vocab = "vocab.json" + public static let merges = "merges.txt" + public static let tokenizerConfig = "tokenizer_config.json" + + public static let defaultVoiceId = "cosyvoice3-default-zh" + } + } + /// Multilingual G2P (CharsiuG2P ByT5) model names public enum MultilingualG2P { public static let encoder = "MultilingualG2PEncoder" @@ -798,6 +844,8 @@ public enum ModelNames { return ModelNames.Qwen3ASR.requiredModelsFull case .multilingualG2p: return ModelNames.MultilingualG2P.requiredModels + case .cosyvoice3: + return ModelNames.CosyVoice3.requiredModels case .cohereTranscribeCoreml: return ModelNames.CohereTranscribe.requiredModels } diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift new file mode 100644 index 000000000..7051143f7 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift @@ -0,0 +1,186 @@ +@preconcurrency import CoreML +import Foundation + +/// Actor-based store for the four CosyVoice3 CoreML models. +/// +/// Two on-disk layouts are accepted: +/// +/// 1. **HuggingFace cache** (flat): `/.mlmodelc` (or +/// `.mlpackage`) at repo root, with `/embeddings/speech_embedding-fp16.safetensors`. +/// This is what `CosyVoice3ResourceDownloader` produces. +/// +/// 2. **Local mobius build dir**: `//.mlpackage` as +/// emitted by `models/tts/cosyvoice3/coreml/convert-coreml.py` (with +/// `llm-fp16/`, `flow-fp16-n250/`, `hift-fp16-t500/` subdirs). +/// +/// The store probes layout (1) first, then falls back to (2). 
CoreML +/// auto-compiles `.mlpackage` on first load and caches the compiled bundle on +/// disk. +public actor CosyVoice3ModelStore { + + private let logger = AppLogger(subsystem: "com.fluidaudio.tts", category: "CosyVoice3ModelStore") + + public nonisolated let directory: URL + private let computeUnits: MLComputeUnits + + private var loadedModels: CosyVoice3Models? + private var speechEmbeddingsURL: URL? + + /// - Parameters: + /// - directory: Base build directory that contains + /// `llm-fp16/`, `llm-fp16-stateful/`, `flow-fp16-n250/`, + /// `hift-fp16-t500/`, `embeddings/`. + /// - computeUnits: Defaults to `.cpuAndNeuralEngine`. Applied to + /// LLM-Prefill + HiFT models only. LLM-Decode (stateful) and Flow + /// both force `.cpuAndGPU` regardless (see `loadIfNeeded()`). + public init(directory: URL, computeUnits: MLComputeUnits = .cpuAndNeuralEngine) { + self.directory = directory + self.computeUnits = computeUnits + } + + /// Load all four CoreML models. Idempotent. + public func loadIfNeeded() async throws { + guard loadedModels == nil else { return } + + let config = MLModelConfiguration() + config.computeUnits = computeUnits + + let loadStart = Date() + logger.info("Loading CosyVoice3 CoreML models from \(directory.path)...") + + let prefillURL = try resolveModel( + subdir: CosyVoice3Constants.Files.llmPrefillSubdir, + baseName: ModelNames.CosyVoice3.llmPrefill) + let decodeURL = try resolveModel( + subdir: CosyVoice3Constants.Files.llmDecodeSubdir, + baseName: ModelNames.CosyVoice3.llmDecode) + let flowURL = try resolveModel( + subdir: CosyVoice3Constants.Files.flowSubdir, + baseName: ModelNames.CosyVoice3.flow) + let hiftURL = try resolveModel( + subdir: CosyVoice3Constants.Files.hiftSubdir, + baseName: ModelNames.CosyVoice3.hift) + let embeddingsURL = try resolveAsset( + subdir: CosyVoice3Constants.Files.speechEmbeddingsSubdir, + file: CosyVoice3Constants.Files.speechEmbeddings) + + let prefill = try await compileAndLoad(prefillURL, configuration: 
config) + logger.info("Loaded \(CosyVoice3Constants.Files.llmPrefill)") + + // Stateful decode MUST run on `.cpuAndGPU`: + // - ANE refuses to compile the stateful graph (same failure mode + // as Flow: `MILCompilerForANE ANECCompile() FAILED`), so + // `.cpuAndNE` / `.all` deadlock load + // - CPU-only works but is ~2× slower than the GPU path + // Ignore the user-supplied `computeUnits` for decode. + let decodeConfig = MLModelConfiguration() + decodeConfig.computeUnits = .cpuAndGPU + let decode = try await compileAndLoad(decodeURL, configuration: decodeConfig) + logger.info("Loaded \(CosyVoice3Constants.Files.llmDecode)") + + // Flow runs on `.cpuAndGPU` (fp16). An ANE-port attempt (BC1S + // rewrite: Linear→Conv2d(1×1), LayerNorm on axis=1, manual SDPA, + // pre-baked rotary sin/cos) produced a Flow that *compiled* and + // ran ~3× faster, but numerically broken: on the parity + // fixture the ANE graph collapses the mel dynamic range from + // [-12.5, +5.2] to [-10.1, -0.8] (MAE 2.58 vs PyTorch fp32; + // plan required <1e-3), yielding HiFT audio at ~40× lower peak + // amplitude — unintelligible to both CTC-ZH and Qwen3 ASR. + // Reverted to the cpuAndGPU fp16 baseline. See + // `coreml/TRIALS_AND_ERRORS.md` "Flow ANE port" for the full + // journey including the residual 77-op `conv_pos_embed` CPU + // island that may have been masking the dynamic-range + // compression introduced elsewhere in the BC1S rewrite. + // Ignore the user-supplied `computeUnits` for Flow; apply it to + // the LLM + HiFT models only. 
+ let flowConfig = MLModelConfiguration() + flowConfig.computeUnits = .cpuAndGPU + let flow = try await compileAndLoad(flowURL, configuration: flowConfig) + logger.info("Loaded \(CosyVoice3Constants.Files.flow)") + + let hift = try await compileAndLoad(hiftURL, configuration: config) + logger.info("Loaded \(CosyVoice3Constants.Files.hift)") + + loadedModels = CosyVoice3Models(prefill: prefill, decode: decode, flow: flow, hift: hift) + speechEmbeddingsURL = embeddingsURL + + let elapsed = Date().timeIntervalSince(loadStart) + logger.info("All CosyVoice3 models loaded in \(String(format: "%.2f", elapsed))s") + } + + public func models() throws -> CosyVoice3Models { + guard let models = loadedModels else { + throw CosyVoice3Error.notInitialized + } + return models + } + + public func speechEmbeddingsFileURL() throws -> URL { + guard let url = speechEmbeddingsURL else { + throw CosyVoice3Error.notInitialized + } + return url + } + + // MARK: - Helpers + + /// Resolve a CoreML model accepting either `.mlmodelc` or `.mlpackage` + /// extensions and both layouts: flat (HF) or subdir (local build). + private func resolveModel(subdir: String, baseName: String) throws -> URL { + let candidates: [URL] = [ + // HF flat layout prefers the precompiled .mlmodelc. + directory.appendingPathComponent("\(baseName).mlmodelc"), + directory.appendingPathComponent("\(baseName).mlpackage"), + // Local build layout (mobius convert-coreml.py output). + directory.appendingPathComponent(subdir).appendingPathComponent("\(baseName).mlmodelc"), + directory.appendingPathComponent(subdir).appendingPathComponent("\(baseName).mlpackage"), + ] + for url in candidates where FileManager.default.fileExists(atPath: url.path) { + return url + } + let probed = candidates.map { $0.path }.joined(separator: ", ") + throw CosyVoice3Error.modelFileNotFound(probed) + } + + /// Resolve a plain sidecar file (e.g. `speech_embedding-fp16.safetensors`). + /// Probes `//` then `/`. 
+ private func resolveAsset(subdir: String, file: String) throws -> URL { + let candidates: [URL] = [ + directory.appendingPathComponent(subdir).appendingPathComponent(file), + directory.appendingPathComponent(file), + ] + for url in candidates where FileManager.default.fileExists(atPath: url.path) { + return url + } + let probed = candidates.map { $0.path }.joined(separator: ", ") + throw CosyVoice3Error.modelFileNotFound(probed) + } + + /// Compile an .mlpackage to .mlmodelc (cached in a persistent temp dir + /// next to the original package) and load it. Skips compilation if an + /// already-compiled .mlmodelc exists next to the package. + private func compileAndLoad( + _ url: URL, + configuration: MLModelConfiguration + ) async throws -> MLModel { + if url.pathExtension == "mlmodelc" { + return try MLModel(contentsOf: url, configuration: configuration) + } + let base = url.deletingPathExtension().lastPathComponent + let compiledName = base + ".mlmodelc" + let cached = url.deletingLastPathComponent().appendingPathComponent(compiledName) + if FileManager.default.fileExists(atPath: cached.path) { + return try MLModel(contentsOf: cached, configuration: configuration) + } + let compiledURL = try await MLModel.compileModel(at: url) + // Move into place next to the package so subsequent loads are fast. + try? FileManager.default.removeItem(at: cached) + do { + try FileManager.default.moveItem(at: compiledURL, to: cached) + return try MLModel(contentsOf: cached, configuration: configuration) + } catch { + // If the move fails (e.g. cross-device), load from the temp URL. 
+ return try MLModel(contentsOf: compiledURL, configuration: configuration) + } + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift new file mode 100644 index 000000000..7359ddd43 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ResourceDownloader.swift @@ -0,0 +1,218 @@ +import Foundation + +/// Pulls CosyVoice3 CoreML models + runtime assets from the +/// `FluidInference/CosyVoice3-0.5B-coreml` HuggingFace repo. +/// +/// Layout produced on disk (relative to `ensureCoreModels(...)`'s return URL): +/// +/// ``` +/// / +/// ├── LLM-Prefill-T256-M768-fp16.mlmodelc/ +/// ├── LLM-Decode-M768-fp16.mlmodelc/ +/// ├── Flow-N250-fp16.mlmodelc/ +/// ├── HiFT-T500-fp16.mlmodelc/ +/// ├── embeddings/ +/// │ ├── speech_embedding-fp16.safetensors +/// │ └── embeddings-runtime-fp32.safetensors (text-mode only) +/// ├── tokenizer/ +/// │ ├── vocab.json, merges.txt, tokenizer_config.json, special_tokens.json +/// └── voices/ +/// ├── cosyvoice3-default-zh.safetensors + .json (default voice, eager) +/// └── .safetensors + .json (optional, on-demand) +/// ``` +public enum CosyVoice3ResourceDownloader { + + private static let logger = AppLogger( + subsystem: "com.fluidaudio.tts", category: "CosyVoice3ResourceDownloader") + + /// Path bundle produced by `ensureTextFrontendAssets`. + public struct TextFrontendPaths: Sendable { + public let tokenizerDirectory: URL + public let runtimeEmbeddingsFile: URL + public let specialTokensFile: URL + } + + // MARK: - Core models + speech embedding table + + /// Ensure the four `.mlmodelc` bundles and `speech_embedding-fp16.safetensors` + /// are cached locally. Returns the repository root directory. + /// + /// - Parameters: + /// - directory: Optional base cache dir. When `nil`, defaults to + /// `~/.cache/fluidaudio` (macOS) or `Caches/fluidaudio` (iOS). 
+ /// - progressHandler: Forwarded to `DownloadUtils.downloadRepo`. + @discardableResult + public static func ensureCoreModels( + directory: URL? = nil, + progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> URL { + let targetDir = try directory ?? cacheDirectory() + let modelsDirectory = targetDir.appendingPathComponent( + CosyVoice3Constants.defaultModelsSubdirectory) + let repoDir = modelsDirectory.appendingPathComponent(Repo.cosyvoice3.folderName) + + // 1. Fetch the four .mlmodelc bundles via the standard repo downloader. + let modelsPresent = ModelNames.CosyVoice3.requiredModels.allSatisfy { name in + FileManager.default.fileExists( + atPath: repoDir.appendingPathComponent(name).path) + } + if !modelsPresent { + logger.info("Downloading CosyVoice3 .mlmodelc bundles from HuggingFace...") + try await DownloadUtils.downloadRepo( + .cosyvoice3, + to: modelsDirectory, + progressHandler: progressHandler) + } else { + logger.info("CosyVoice3 .mlmodelc bundles found in cache") + } + + // 2. Fetch the small speech-embedding table (sidecar, not a model). + _ = try await ensureSidecarFile( + subdir: ModelNames.CosyVoice3.Sidecar.embeddingsDir, + name: ModelNames.CosyVoice3.Sidecar.speechEmbeddings, + repoDirectory: repoDir, + description: "CosyVoice3 speech embedding table") + + return repoDir + } + + // MARK: - Text-mode assets (tokenizer + 542 MB runtime embeddings) + + /// Ensure tokenizer assets + `embeddings-runtime-fp32.safetensors` are on + /// disk. Only required when using `CosyVoice3TtsManager.synthesize(text:…)`; + /// fixture-mode callers may skip this. + public static func ensureTextFrontendAssets( + repoDirectory: URL + ) async throws -> TextFrontendPaths { + // Tokenizer subdirectory: vocab.json + merges.txt + special_tokens.json + // + tokenizer_config.json. `downloadSubdirectory` walks the tree and + // skips files already on disk. 
+ let tokenizerDir = repoDirectory.appendingPathComponent( + ModelNames.CosyVoice3.Sidecar.tokenizerDir) + let tokenizerRequired = [ + ModelNames.CosyVoice3.Sidecar.vocab, + ModelNames.CosyVoice3.Sidecar.merges, + ModelNames.CosyVoice3.Sidecar.specialTokens, + ] + let tokenizerPresent = tokenizerRequired.allSatisfy { name in + FileManager.default.fileExists( + atPath: tokenizerDir.appendingPathComponent(name).path) + } + if !tokenizerPresent { + logger.info("Downloading CosyVoice3 tokenizer assets…") + try await DownloadUtils.downloadSubdirectory( + .cosyvoice3, + subdirectory: ModelNames.CosyVoice3.Sidecar.tokenizerDir, + to: repoDirectory) + } + + // Runtime text-embedding table (542 MB). Pulled as a file download so + // it never has to sit in RAM during transfer. + let runtimeEmbeddings = try await ensureSidecarFile( + subdir: ModelNames.CosyVoice3.Sidecar.embeddingsDir, + name: ModelNames.CosyVoice3.Sidecar.runtimeEmbeddings, + repoDirectory: repoDirectory, + description: "CosyVoice3 runtime text embedding table (542 MB)") + + return TextFrontendPaths( + tokenizerDirectory: tokenizerDir, + runtimeEmbeddingsFile: runtimeEmbeddings, + specialTokensFile: tokenizerDir.appendingPathComponent( + ModelNames.CosyVoice3.Sidecar.specialTokens)) + } + + // MARK: - Voice bundles + + /// Ensure the requested zero-shot voice bundle (`.safetensors` + + /// `.json`) is cached. Returns the `.safetensors` URL that + /// `CosyVoice3PromptAssets.load(from:)` expects — the loader derives the + /// `.json` sidecar path from it. 
+ @discardableResult + public static func ensureVoice( + voiceId: String = ModelNames.CosyVoice3.Sidecar.defaultVoiceId, + repoDirectory: URL + ) async throws -> URL { + let sanitized = voiceId.filter { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" } + guard !sanitized.isEmpty, sanitized == voiceId else { + throw CosyVoice3Error.invalidShape("invalid voice id: \(voiceId)") + } + + let voicesDir = repoDirectory.appendingPathComponent( + ModelNames.CosyVoice3.Sidecar.voicesDir) + try FileManager.default.createDirectory( + at: voicesDir, withIntermediateDirectories: true) + + let tensorsURL = voicesDir.appendingPathComponent("\(voiceId).safetensors") + let metadataURL = voicesDir.appendingPathComponent("\(voiceId).json") + + for (local, remoteName, desc) in [ + (tensorsURL, "\(voiceId).safetensors", "voice tensors"), + (metadataURL, "\(voiceId).json", "voice metadata"), + ] { + if FileManager.default.fileExists(atPath: local.path) { continue } + let remotePath = "\(ModelNames.CosyVoice3.Sidecar.voicesDir)/\(remoteName)" + let remoteURL = try ModelRegistry.resolveModel( + Repo.cosyvoice3.remotePath, remotePath) + let descriptor = AssetDownloader.Descriptor( + description: "\(voiceId) \(desc)", + remoteURL: remoteURL, + destinationURL: local, + transferMode: .file()) + _ = try await AssetDownloader.ensure(descriptor, logger: logger) + } + + return tensorsURL + } + + // MARK: - Helpers + + private static func ensureSidecarFile( + subdir: String, + name: String, + repoDirectory: URL, + description: String + ) async throws -> URL { + let localDir = repoDirectory.appendingPathComponent(subdir) + try FileManager.default.createDirectory( + at: localDir, withIntermediateDirectories: true) + let localURL = localDir.appendingPathComponent(name) + if FileManager.default.fileExists(atPath: localURL.path) { + return localURL + } + let remotePath = "\(subdir)/\(name)" + let remoteURL = try ModelRegistry.resolveModel( + Repo.cosyvoice3.remotePath, remotePath) + let descriptor 
= AssetDownloader.Descriptor( + description: description, + remoteURL: remoteURL, + destinationURL: localURL, + transferMode: .file()) + return try await AssetDownloader.ensure(descriptor, logger: logger) + } + + /// `~/.cache/fluidaudio` (macOS) / `Caches/fluidaudio` (iOS) — matches the + /// convention used by `TtsResourceDownloader` and `PocketTtsResourceDownloader`. + private static func cacheDirectory() throws -> URL { + let baseDirectory: URL + #if os(macOS) + baseDirectory = FileManager.default.homeDirectoryForCurrentUser + .appendingPathComponent(".cache") + #else + guard + let first = FileManager.default.urls( + for: .cachesDirectory, in: .userDomainMask + ).first + else { + throw CosyVoice3Error.invalidShape("failed to locate caches directory") + } + baseDirectory = first + #endif + + let cacheDirectory = baseDirectory.appendingPathComponent("fluidaudio") + if !FileManager.default.fileExists(atPath: cacheDirectory.path) { + try FileManager.default.createDirectory( + at: cacheDirectory, withIntermediateDirectories: true) + } + return cacheDirectory + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift new file mode 100644 index 000000000..b0a46f935 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Constants.swift @@ -0,0 +1,78 @@ +import Foundation + +/// Central constants for the CosyVoice3 (Mandarin) CoreML pipeline. +/// +/// Shipping config (frozen): +/// - LLM-Prefill-T256-M768-fp16 (cpuAndNeuralEngine) +/// - LLM-Decode-M768-fp16-stateful (cpuAndGPU — see note) +/// - Flow-N250-fp16 (cpuAndGPU — an ANE-port +/// BC1S rewrite was attempted and reverted: the converted graph ran +/// ~3× faster but numerically broken (mel dynamic range collapsed +/// from [-12.5, +5.2] to [-10.1, -0.8], MAE 2.58 vs fp32 reference, +/// yielding HiFT audio at ~40× lower peak amplitude → unintelligible +/// to ASR). 
See `coreml/TRIALS_AND_ERRORS.md` "Flow ANE port" for +/// the full journey, including the residual 77-op CPU island in +/// `input_embed.conv_pos_embed` (`Conv1d(1024,1024,k=31)+Mish`) +/// that three rewrite attempts couldn't move — ANEF rejects the +/// conv footprint regardless of group count.) +/// - HiFT-T500-fp16 (cpuAndNeuralEngine) +/// +/// The stateful decode model uses per-layer `MLState` buffers for the +/// KV cache (48 tensors, `[1, 2, 768, 64]` fp16 each) instead of +/// round-tripping 18 MB of kv_k / kv_v MLMultiArrays every step. ANE +/// refuses to compile the stateful graph (`MILCompilerForANE +/// ANECCompile() FAILED`); decode therefore runs on `.cpuAndGPU`. +/// Requires macOS 15 / iOS 18. +public enum CosyVoice3Constants { + + // MARK: - LLM shapes + public static let prefillLength = 256 + public static let kvMaxLength = 768 + public static let embedDim = 896 + public static let numLayers = 24 + public static let kvHeads = 2 + public static let headDim = 64 + + // MARK: - Flow / HiFT shapes + public static let flowTotalTokens = 250 + public static let tokenMelRatio = 2 + public static let hiftMaxFrames = 500 + public static let hiftSamplesPerFrame = 480 + public static let sampleRate = 24_000 + public static let melBins = 80 + public static let speakerEmbeddingDim = 192 + + // MARK: - Speech token vocab + public static let speechVocab = 6_761 + public static let speechTokenSize = 6_561 + public static let sosId: Int32 = 6_561 + public static let eosId: Int32 = 6_562 + public static let taskId: Int32 = 6_563 + /// Any token id in this range is treated as a stop signal. 
+    // NOTE(review): the generic parameter was lost to extraction damage
+    // (`ClosedRange` with no `<Bound>` does not compile against a literal range
+    // annotation here). `Int32` matches `sosId`/`eosId`/`taskId` above and the
+    // declared speech-token id type — confirm against the sampler's token type.
+    public static let stopRange: ClosedRange<Int32> = 6_561...6_760
+
+    // MARK: - Sampler
+    public static let topP: Float = 0.8
+    public static let topK: Int = 25
+    public static let rasWindow: Int = 10
+    public static let rasTauR: Float = 0.1
+
+    // MARK: - Cache layout
+    /// Subdirectory under the shared `~/.cache/fluidaudio/` (or iOS Caches) dir
+    /// where every TTS backend stores its HF-mirrored models.
+    public static let defaultModelsSubdirectory = "Models"
+
+    // MARK: - Files (local build dir layout)
+    public enum Files {
+        public static let llmPrefill = "LLM-Prefill-T256-M768-fp16.mlpackage"
+        public static let llmPrefillSubdir = "llm-fp16"
+        public static let llmDecode = "LLM-Decode-M768-fp16-stateful.mlpackage"
+        public static let llmDecodeSubdir = "llm-fp16-stateful"
+        public static let flow = "Flow-N250-fp16.mlpackage"
+        public static let flowSubdir = "flow-fp16-n250"
+        public static let hift = "HiFT-T500-fp16.mlpackage"
+        public static let hiftSubdir = "hift-fp16-t500"
+        public static let speechEmbeddings = "speech_embedding-fp16.safetensors"
+        public static let speechEmbeddingsSubdir = "embeddings"
+    }
+}
diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Error.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Error.swift
new file mode 100644
index 000000000..0ebe782f5
--- /dev/null
+++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Error.swift
@@ -0,0 +1,37 @@
+import Foundation
+
+/// Errors surfaced by the CosyVoice3 Swift pipeline.
+public enum CosyVoice3Error: LocalizedError, Sendable {
+    case notInitialized
+    case modelFileNotFound(String)
+    case invalidFixture(String)
+    case invalidSafetensors(String)
+    case prefillTooLong(Int)
+    case sequenceTooLong(Int)
+    case predictionFailed(String)
+    case embeddingTableMissing(String)
+    case invalidShape(String)
+
+    public var errorDescription: String? {
+        switch self {
+        case .notInitialized:
+            return "CosyVoice3 pipeline not initialized — call loadIfNeeded() first."
+ case .modelFileNotFound(let path): + return "CosyVoice3 model file not found at: \(path)" + case .invalidFixture(let reason): + return "Invalid CosyVoice3 fixture: \(reason)" + case .invalidSafetensors(let reason): + return "Invalid safetensors file: \(reason)" + case .prefillTooLong(let length): + return "Prefill sequence length \(length) exceeds max \(CosyVoice3Constants.prefillLength)" + case .sequenceTooLong(let length): + return "KV cache length \(length) exceeds max \(CosyVoice3Constants.kvMaxLength)" + case .predictionFailed(let stage): + return "CosyVoice3 prediction failed at stage: \(stage)" + case .embeddingTableMissing(let name): + return "CosyVoice3 embedding table missing: \(name)" + case .invalidShape(let detail): + return "CosyVoice3 shape mismatch: \(detail)" + } + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Models.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Models.swift new file mode 100644 index 000000000..b608bdbfc --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3Models.swift @@ -0,0 +1,23 @@ +@preconcurrency import CoreML +import Foundation + +/// Four CoreML models for the CosyVoice3 inference pipeline. +/// +/// `Sendable` conformance leans on `@preconcurrency import CoreML` (same +/// pattern as `TtsModels`). `MLModel` is reference-typed but its predict +/// surface is internally synchronized, and these instances are only handed +/// to actors that own them for their lifetime, so crossing actor isolation +/// is safe in practice. 
+public struct CosyVoice3Models: Sendable { + public let prefill: MLModel + public let decode: MLModel + public let flow: MLModel + public let hift: MLModel + + public init(prefill: MLModel, decode: MLModel, flow: MLModel, hift: MLModel) { + self.prefill = prefill + self.decode = decode + self.flow = flow + self.hift = hift + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3TtsManager.swift b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3TtsManager.swift new file mode 100644 index 000000000..d71f3ea67 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/CosyVoice3TtsManager.swift @@ -0,0 +1,314 @@ +@preconcurrency import CoreML +import Foundation + +/// Public entry point for the CosyVoice3 (Mandarin) TTS pipeline. +/// +/// > Important: **Experimental / beta.** This backend ships as an early port +/// > and end-to-end synthesis is currently **slow** on Apple Silicon — +/// > expect well below real-time (RTFx < 1.0) on M-series GPUs and several +/// > seconds of latency for short Mandarin utterances. The slowdown is +/// > primarily in the Flow CFM stage, which is fp32/CPU-or-GPU only because +/// > fp16 + ANE produces NaNs through the fused `layer_norm` (CoreMLTools +/// > limitation; tracked upstream). The HiFT vocoder also has ~12 sinegen / +/// > windowing ops that fall back to CPU. We do not yet know whether the +/// > residual cost is fundamental to the model or recoverable through better +/// > conversion — treat performance numbers as preliminary. The Swift API, +/// > model layout, and prompt-asset format may change in subsequent +/// > releases without deprecation aliases. +/// +/// Two synthesis paths are exposed: +/// +/// 1. `synthesizeFromFixture` — Phase 1 parity harness that replays a +/// Python-generated fixture against the Swift CoreML pipeline. +/// +/// 2. `synthesize(text:promptAssets:)` — Phase 2 text-driven synthesis. 
The +/// user supplies a Mandarin `text` plus a `CosyVoice3PromptAssets` bundle +/// (precomputed `llm_prompt_speech_ids`, `prompt_mel`, `spk_embedding`, +/// plus the prompt text containing `<|endofprompt|>`). The manager +/// tokenizes with the on-device Qwen2 BPE tokenizer, assembles +/// `lm_input_embeds` from the mmap'd runtime embedding tables, and runs +/// prefill → decode → Flow → HiFT exactly like the fixture path. +/// +/// Text-mode requires three extra resources that must be provided at init: +/// - `tokenizerDirectory`: HuggingFace Qwen2 assets (`vocab.json` + `merges.txt`). +/// - `textEmbeddingsFile`: `embeddings-runtime-fp32.safetensors` produced by +/// `mobius/.../verify/export_runtime_embeddings.py`. Contains Qwen2 +/// `text_embedding` and CosyVoice3 `speech_embedding` rows at runtime dtype. +/// - `specialTokensFile`: JSON map `{"<|endofprompt|>": 151646, ...}` covering +/// the 281 runtime-added special tokens (CosyVoice3Tokenizer). Same format +/// that `tokenizer_fixture.json` dumps under its `special_tokens` key. +/// +/// > Note: Gated to macOS 15 / iOS 18 because the underlying +/// > `CosyVoice3Synthesizer` uses CoreML `MLState` for the decode KV cache. +/// > Other FluidAudio modules (ASR, Diarization, VAD, Kokoro, PocketTTS) +/// > remain available on macOS 14 / iOS 17. +@available(macOS 15, iOS 18, *) +public actor CosyVoice3TtsManager { + + private let logger = AppLogger(subsystem: "com.fluidaudio.tts", category: "CosyVoice3TtsManager") + + private let store: CosyVoice3ModelStore + private let tokenizerDirectory: URL? + private let textEmbeddingsFile: URL? + private let specialTokensFile: URL? + + private var synthesizer: CosyVoice3Synthesizer? + private var textFrontend: CosyVoice3TextFrontend? + + /// Fixture-only (Phase 1) constructor. 
+    public init(directory: URL, computeUnits: MLComputeUnits = .cpuAndNeuralEngine) {
+        self.store = CosyVoice3ModelStore(directory: directory, computeUnits: computeUnits)
+        // Frontend resources deliberately stay nil: `initialize()` then skips
+        // building the text frontend, so text-mode `synthesize(text:...)` will
+        // throw `.notInitialized`; only `synthesizeFromFixture` is usable.
+        self.tokenizerDirectory = nil
+        self.textEmbeddingsFile = nil
+        self.specialTokensFile = nil
+    }
+
+    /// Text-mode (Phase 2) constructor. Pass `modelsDirectory` plus the three
+    /// tokenizer-frontend resources. `synthesizeFromFixture` still works
+    /// without initializing the frontend.
+    public init(
+        modelsDirectory: URL,
+        tokenizerDirectory: URL,
+        textEmbeddingsFile: URL,
+        specialTokensFile: URL,
+        computeUnits: MLComputeUnits = .cpuAndNeuralEngine
+    ) {
+        // Same store as the fixture-only init; the three URLs are consumed
+        // lazily by `initialize()` when it builds the text frontend.
+        self.store = CosyVoice3ModelStore(directory: modelsDirectory, computeUnits: computeUnits)
+        self.tokenizerDirectory = tokenizerDirectory
+        self.textEmbeddingsFile = textEmbeddingsFile
+        self.specialTokensFile = specialTokensFile
+    }
+
+    /// Convenience factory that downloads all required assets from HuggingFace
+    /// (`FluidInference/CosyVoice3-0.5B-coreml`) into the shared FluidAudio
+    /// cache, then returns a text-mode–ready manager.
+    ///
+    /// - Parameters:
+    ///   - cacheDirectory: Optional override for the base cache root. When
+    ///     `nil`, uses `~/.cache/fluidaudio` (macOS) or the app Caches dir
+    ///     (iOS) — the same location every other FluidAudio TTS backend uses.
+    ///   - includeDefaultVoice: When `true` (default), also fetches the
+    ///     upstream `cosyvoice3-default-zh` voice bundle so the first
+    ///     `synthesize(...)` call works without any additional downloads.
+    ///   - computeUnits: CoreML compute units for LLM + HiFT. Flow is forced
+    ///     to CPU+GPU regardless (fp32 graph, ANE would NaN on fused LN).
+    ///   - progressHandler: Forwarded to the HF downloader for UI updates.
+    /// - Returns: An uninitialized manager; the caller must still invoke
+    ///   `initialize()` to compile + load models. A download of ~5.8 GB occurs
+    ///   on first run; subsequent runs are cache hits.
+ public static func downloadAndCreate( + cacheDirectory: URL? = nil, + includeDefaultVoice: Bool = true, + computeUnits: MLComputeUnits = .cpuAndNeuralEngine, + progressHandler: DownloadUtils.ProgressHandler? = nil + ) async throws -> CosyVoice3TtsManager { + let repoDir = try await CosyVoice3ResourceDownloader.ensureCoreModels( + directory: cacheDirectory, progressHandler: progressHandler) + let frontend = try await CosyVoice3ResourceDownloader.ensureTextFrontendAssets( + repoDirectory: repoDir) + if includeDefaultVoice { + _ = try await CosyVoice3ResourceDownloader.ensureVoice( + repoDirectory: repoDir) + } + return CosyVoice3TtsManager( + modelsDirectory: repoDir, + tokenizerDirectory: frontend.tokenizerDirectory, + textEmbeddingsFile: frontend.runtimeEmbeddingsFile, + specialTokensFile: frontend.specialTokensFile, + computeUnits: computeUnits) + } + + /// Ensure the given voice id (e.g. `"cosyvoice3-default-zh"` or an + /// `aishell3-zh-SSB####-{female|male}` id) is cached locally, and return + /// the loaded prompt bundle ready to pass into `synthesize(text:promptAssets:)`. + public func loadVoice( + _ voiceId: String = ModelNames.CosyVoice3.Sidecar.defaultVoiceId + ) async throws -> CosyVoice3PromptAssets { + let tensorsURL = try await CosyVoice3ResourceDownloader.ensureVoice( + voiceId: voiceId, + repoDirectory: modelsDirectory) + return try CosyVoice3PromptAssets.load(from: tensorsURL) + } + + /// Repo root directory (cache location after `downloadAndCreate(...)`). + /// Pass this to `CosyVoice3ResourceDownloader.ensureVoice(voiceId:repoDirectory:)` + /// when fetching additional voice bundles on demand. + public nonisolated var modelsDirectory: URL { + store.directory + } + + /// Load all four CoreML models + (if configured) the text frontend. + /// Idempotent. + public func initialize() async throws { + if synthesizer == nil { + logger.warning( + "CosyVoice3 is experimental / beta. 
Synthesis is currently slow " + + "(RTFx < 1.0 typical) — see CosyVoice3TtsManager docs.") + try await store.loadIfNeeded() + let models = try await store.models() + let embeddingsURL = try await store.speechEmbeddingsFileURL() + let embeddings = try CosyVoice3SpeechEmbeddings(url: embeddingsURL) + self.synthesizer = CosyVoice3Synthesizer(models: models, embeddings: embeddings) + logger.info("CosyVoice3 synthesizer ready") + } + if textFrontend == nil, + let tokDir = tokenizerDirectory, + let embURL = textEmbeddingsFile, + let specURL = specialTokensFile + { + let tokStart = Date() + let specialTokens = try Self.loadSpecialTokens(url: specURL) + let tokenizer = try Qwen2BpeTokenizer.load( + directory: tokDir, specialTokens: specialTokens) + let textEmbeddings = try CosyVoice3TextEmbeddings(url: embURL) + self.textFrontend = CosyVoice3TextFrontend( + tokenizer: tokenizer, embeddings: textEmbeddings) + logger.info( + "CosyVoice3 text frontend ready in \(String(format: "%.2fs", Date().timeIntervalSince(tokStart)))" + ) + } + } + + /// Phase 1 parity entry point. + public func synthesizeFromFixture( + fixtureURL: URL, + options: CosyVoice3ParityOptions = CosyVoice3ParityOptions() + ) async throws -> CosyVoice3SynthesisResult { + guard let synthesizer = synthesizer else { + throw CosyVoice3Error.notInitialized + } + let fixture = try CosyVoice3FrontendFixture.load(from: fixtureURL) + return try await synthesizer.synthesize(fixture: fixture, options: options) + } + + /// Phase 2 text-driven synthesis. + /// + /// - Parameters: + /// - text: Mandarin (or mixed) input text. + /// - promptAssets: Bundle with prompt text + precomputed speech prompt + /// tokens + prompt mel + speaker embedding. + /// - options: Sampling / seed controls. `replayDecodedTokens` must be + /// `false` in text mode (the default here). + /// - prenormalized: When `true`, skip the built-in minimal Chinese + /// normalizer and feed `text` straight to the tokenizer. 
Set this if + /// you've already run wetext (or equivalent) server-side. + public func synthesize( + text: String, + promptAssets: CosyVoice3PromptAssets, + options: CosyVoice3SynthesisOptions = CosyVoice3SynthesisOptions(), + prenormalized: Bool = false + ) async throws -> CosyVoice3SynthesisResult { + guard let synthesizer = synthesizer else { + throw CosyVoice3Error.notInitialized + } + guard let frontend = textFrontend else { + throw CosyVoice3Error.notInitialized + } + + // Skip normalization if the caller set `prenormalized`, if the input + // contains SSML-ish markers (mirrors Python's `'<|' in text and '|>'` + // bypass), or if there are no CJK characters at all. + let ssmlLike = text.contains("<|") && text.contains("|>") + let normalized: String + if prenormalized || ssmlLike || !CosyVoice3ChineseNormalizer.containsChinese(text) { + normalized = text + } else { + normalized = CosyVoice3ChineseNormalizer.normalize(text) + } + + let assembled = try frontend.assemble( + promptText: promptAssets.promptText, + ttsText: normalized, + promptSpeechIds: promptAssets.promptSpeechIds) + + let lmInputEmbedsFlat = try Self.flattenLmEmbeds( + assembled.lmInputEmbeds, tPre: assembled.tPre) + + // Build an in-memory fixture adapter so we can reuse the Phase 1 + // synthesize(fixture:) path without a second code path. 
+        let fixture = CosyVoice3FrontendFixture(
+            lmInputEmbeds: lmInputEmbedsFlat,
+            tPre: assembled.tPre,
+            promptSpeechIds: promptAssets.promptSpeechIds,
+            promptMel: promptAssets.promptMel,
+            promptMelFrames: promptAssets.promptMelFrames,
+            spkEmbedding: promptAssets.spkEmbedding,
+            decodedTokens: [],
+            seed: Int32(truncatingIfNeeded: options.seed),
+            numPromptMel: 0,
+            audioLengthSamples: 0)
+
+        let parityOptions = CosyVoice3ParityOptions(
+            maxNewTokens: options.maxNewTokens,
+            seed: options.seed,
+            replayDecodedTokens: false)
+
+        return try await synthesizer.synthesize(fixture: fixture, options: parityOptions)
+    }
+
+    // MARK: - Helpers
+
+    /// Flatten `[1, tPre, 896]` MLMultiArray fp32 into `[tPre * 896]` Float,
+    /// honoring non-compact strides.
+    private static func flattenLmEmbeds(
+        _ array: MLMultiArray, tPre: Int
+    ) throws -> [Float] {
+        guard
+            array.dataType == .float32,
+            array.shape.count == 3,
+            array.shape[0].intValue == 1,
+            array.shape[1].intValue == tPre,
+            array.shape[2].intValue == CosyVoice3Constants.embedDim
+        else {
+            throw CosyVoice3Error.invalidShape(
+                "lmInputEmbeds expects [1, \(tPre), \(CosyVoice3Constants.embedDim)] fp32, got shape=\(array.shape) dtype=\(array.dataType.rawValue)"
+            )
+        }
+        let dim = CosyVoice3Constants.embedDim
+        let strides = array.strides.map { $0.intValue }
+        let src = array.dataPointer.bindMemory(to: Float.self, capacity: array.count)
+        var out = [Float](repeating: 0, count: tPre * dim)
+        out.withUnsafeMutableBufferPointer { dst in
+            // NOTE(review): loop interior reconstructed after extraction damage.
+            // Fast path: a compact innermost stride lets us memcpy whole rows;
+            // otherwise gather element-by-element honoring strides[2].
+            for t in 0..<tPre {
+                let rowBase = t * strides[1]
+                if strides[2] == 1 {
+                    memcpy(
+                        dst.baseAddress! + t * dim,
+                        src + rowBase,
+                        dim * MemoryLayout<Float>.size)
+                } else {
+                    for d in 0..<dim {
+                        dst[t * dim + d] = src[rowBase + d * strides[2]]
+                    }
+                }
+            }
+        }
+        return out
+    }
+
+    /// Parse the special-tokens JSON into `token string → id`.
+    private static func loadSpecialTokens(url: URL) throws -> [String: Int32] {
+        let data = try Data(contentsOf: url)
+        // Accept either the tokenizer_fixture.json shape
+        // ({"special_tokens": {...}, "cases": [...]}) or a flat map.
+        let json = try JSONSerialization.jsonObject(with: data)
+        let raw: [String: Any]
+        if let obj = json as? [String: Any], let nested = obj["special_tokens"] as? [String: Any] {
+            raw = nested
+        } else if let obj = json as? [String: Any] {
+            raw = obj
+        } else {
+            throw CosyVoice3Error.invalidShape(
+                "special tokens file must be a JSON object, got \(type(of: json))")
+        }
+        var out: [String: Int32] = [:]
+        out.reserveCapacity(raw.count)
+        for (k, v) in raw {
+            if let n = v as? Int {
+                out[k] = Int32(n)
+            } else if let n = v as? NSNumber {
+                out[k] = n.int32Value
+            }
+        }
+        guard !out.isEmpty else {
+            throw CosyVoice3Error.invalidShape(
+                "special tokens file parsed to an empty map at \(url.path)")
+        }
+        return out
+    }
+}
diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3ChineseNormalizer.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3ChineseNormalizer.swift
new file mode 100644
index 000000000..53457a8c1
--- /dev/null
+++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3ChineseNormalizer.swift
@@ -0,0 +1,145 @@
+import Foundation
+
+/// Minimal Mandarin text normalizer ported from CosyVoice's
+/// `cosyvoice/utils/frontend_utils.py` + the Chinese branch of
+/// `cosyvoice/cli/frontend.py:text_normalize`.
+///
+/// **Scope (intentional):** regex-free character-level rules plus digit
+/// spellout. The full `wetext.ZhNormalizer` (which rewrites years, phone
+/// numbers, decimals, units, chemistry, currency, dates…) is **not** ported.
+/// Callers that need production-quality TN should run wetext server-side and
+/// pass the result via `synthesize(text:prenormalized: true, ...)`.
+///
+/// Rules applied (in order):
+/// 1. strip newlines, leading/trailing whitespace
+/// 2. `replaceCornerMark` — `²` → `平方`, `³` → `立方`
+/// 3. ASCII digits → 零一二三四五六七八九 (per-digit fallback; lossy vs wetext
+///    but avoids raw Arabic numerals going into the BPE)
+/// 4. `.` → `。`, ` - ` → `,`
+/// 5. `replaceBlank` — remove spaces between CJK chars; keep spaces between
+///    ASCII tokens. Runs *after* the ASCII→CJK substitutions above so
+///    spaces that became CJK-interior are also cleaned up.
+/// 6. `removeBracket` — drop `()【】` and backticks, `——` → space
+/// 7. trailing `,` / `,` / `、` sequences → `。`
+public enum CosyVoice3ChineseNormalizer {
+
+    public static func normalize(_ text: String) -> String {
+        var s = text
+        s = s.replacingOccurrences(of: "\n", with: "")
+        s = s.trimmingCharacters(in: .whitespaces)
+        s = replaceCornerMark(s)
+        s = spellOutDigitsZh(s)
+        s = s.replacingOccurrences(of: ".", with: "。")
+        s = s.replacingOccurrences(of: " - ", with: ",")
+        s = replaceBlank(s)
+        s = removeBracket(s)
+        s = stripTrailingCommaLikes(s)
+        return s
+    }
+
+    /// True if `text` contains at least one CJK Unified Ideograph
+    /// (U+4E00..U+9FFF), matching `contains_chinese` in frontend_utils.py.
+    public static func containsChinese(_ text: String) -> Bool {
+        for scalar in text.unicodeScalars where (0x4E00...0x9FFF).contains(scalar.value) {
+            return true
+        }
+        return false
+    }
+
+    /// True if `text` is empty or consists only of Unicode punctuation /
+    /// symbol characters. Mirrors `is_only_punctuation`.
+    public static func isOnlyPunctuation(_ text: String) -> Bool {
+        if text.isEmpty { return true }
+        let allowed: CharacterSet = {
+            var s = CharacterSet.punctuationCharacters
+            s.formUnion(.symbols)
+            s.formUnion(.whitespaces)
+            return s
+        }()
+        for scalar in text.unicodeScalars where !allowed.contains(scalar) {
+            return false
+        }
+        return true
+    }
+
+    // MARK: - Individual rules
+
+    /// Drop spaces between non-ASCII chars; keep spaces that sit between two
+    /// ASCII tokens (e.g. "hello world" stays, "中 国" → "中国").
+    static func replaceBlank(_ text: String) -> String {
+        let chars = Array(text)
+        var out: [Character] = []
+        out.reserveCapacity(chars.count)
+        // NOTE(review): loop header/guard reconstructed after extraction damage;
+        // out-of-bounds neighbors are treated as spaces so edge spaces drop.
+        for i in 0..<chars.count {
+            let c = chars[i]
+            if c == " " {
+                let prev = i > 0 ? chars[i - 1] : Character(" ")
+                let next = i + 1 < chars.count ? chars[i + 1] : Character(" ")
+                let prevOk = prev.isASCII && prev != " "
+                let nextOk = next.isASCII && next != " "
+                if prevOk && nextOk {
+                    out.append(c)
+                }
+            } else {
+                out.append(c)
+            }
+        }
+        return String(out)
+    }
+
+    static func replaceCornerMark(_ text: String) -> String {
+        var s = text
+        s = s.replacingOccurrences(of: "²", with: "平方")
+        s = s.replacingOccurrences(of: "³", with: "立方")
+        return s
+    }
+
+    static func removeBracket(_ text: String) -> String {
+        var s = text
+        s = s.replacingOccurrences(of: "(", with: "")
+        s = s.replacingOccurrences(of: ")", with: "")
+        s = s.replacingOccurrences(of: "【", with: "")
+        s = s.replacingOccurrences(of: "】", with: "")
+        s = s.replacingOccurrences(of: "`", with: "")
+        s = s.replacingOccurrences(of: "——", with: " ")
+        return s
+    }
+
+    /// Replace each ASCII digit in `text` with its Chinese reading. Lossy
+    /// per-digit fallback (e.g. `2024` → `二零二四`); correct for years / IDs
+    /// but wrong for decimals or large cardinals. Acceptable as a placeholder
+    /// while wetext remains server-side.
+    static func spellOutDigitsZh(_ text: String) -> String {
+        let map: [Character: String] = [
+            "0": "零", "1": "一", "2": "二", "3": "三", "4": "四",
+            "5": "五", "6": "六", "7": "七", "8": "八", "9": "九",
+        ]
+        var out = ""
+        out.reserveCapacity(text.count)
+        for ch in text {
+            if let zh = map[ch] {
+                out += zh
+            } else {
+                out.append(ch)
+            }
+        }
+        return out
+    }
+
+    /// Collapse a run of trailing `,` / `,` / `、` into a single `。`.
+    /// Equivalent to the Python `re.sub(r'[,,、]+$', '。', text)` rule.
+    static func stripTrailingCommaLikes(_ text: String) -> String {
+        let commaLikes: Set<Character> = [",", ",", "、"]
+        var chars = Array(text)
+        var end = chars.count
+        while end > 0, commaLikes.contains(chars[end - 1]) {
+            end -= 1
+        }
+        if end == chars.count {
+            return text
+        }
+        // NOTE(review): tail reconstructed after extraction damage — the Python
+        // rule replaces the trailing comma run with a single `。`.
+        chars = Array(chars[0..<end])
+        chars.append("。")
+        return String(chars)
+    }
+}
diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3FrontendFixture.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3FrontendFixture.swift
new file mode 100644
index 000000000..000000000
--- /dev/null
+++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3FrontendFixture.swift
@@ -0,0 +1,118 @@
+import Foundation
+
+/// Phase 1 parity fixture: Python-exported intermediate tensors replayed
+/// through the Swift CoreML pipeline.
+///
+/// NOTE(review): this file header, the stored-property list, and the `load`
+/// signature were reconstructed after extraction damage. The field list is
+/// inferred from the two memberwise-initializer call sites (here and in
+/// `CosyVoice3TtsManager.synthesize`) — confirm against the original source.
+struct CosyVoice3FrontendFixture: Sendable {
+    let lmInputEmbeds: [Float]
+    let tPre: Int
+    let promptSpeechIds: [Int32]
+    let promptMel: [Float]
+    let promptMelFrames: Int
+    let spkEmbedding: [Float]
+    let decodedTokens: [Int32]
+    let seed: Int32
+    let numPromptMel: Int
+    let audioLengthSamples: Int
+
+    /// Load a fixture safetensors bundle exported by the Python verify tools.
+    static func load(from url: URL) throws -> CosyVoice3FrontendFixture {
+        let file = try SafetensorsFile(url: url)
+
+        let lmInfo = try file.info("lm_input_embeds")
+        guard
+            lmInfo.dtype == .f32,
+            lmInfo.shape.count == 3,
+            lmInfo.shape[0] == 1,
+            lmInfo.shape[2] == CosyVoice3Constants.embedDim
+        else {
+            throw CosyVoice3Error.invalidFixture(
+                "lm_input_embeds expects [1, t_pre, 896] fp32, got shape=\(lmInfo.shape) dtype=\(lmInfo.dtype.rawValue)"
+            )
+        }
+        let lmInputEmbeds = try file.asFloat32("lm_input_embeds")
+        let tPre = lmInfo.shape[1]
+        guard tPre > 0 && tPre <= CosyVoice3Constants.prefillLength else {
+            throw CosyVoice3Error.prefillTooLong(tPre)
+        }
+
+        let promptIdsInfo = try file.info("llm_prompt_speech_ids")
+        guard
+            promptIdsInfo.shape.count == 2,
+            promptIdsInfo.shape[0] == 1
+        else {
+            throw CosyVoice3Error.invalidFixture(
+                "llm_prompt_speech_ids expects [1, N], got \(promptIdsInfo.shape)")
+        }
+        let promptSpeechIds = try file.asInt32("llm_prompt_speech_ids")
+
+        let promptMelInfo = try file.info("prompt_mel")
+        guard
+            promptMelInfo.dtype == .f32,
+            promptMelInfo.shape.count == 3,
+            promptMelInfo.shape[0] == 1,
+            promptMelInfo.shape[2] == CosyVoice3Constants.melBins
+        else {
+            throw CosyVoice3Error.invalidFixture(
+                "prompt_mel expects [1, frames, 80] fp32, got \(promptMelInfo.shape)")
+        }
+        let promptMel = try file.asFloat32("prompt_mel")
+        let promptMelFrames = promptMelInfo.shape[1]
+
+        let spkInfo = try file.info("spk_embedding")
+        guard
+            spkInfo.dtype == .f32,
+            spkInfo.shape == [1, CosyVoice3Constants.speakerEmbeddingDim]
+        else {
+            throw CosyVoice3Error.invalidFixture(
+                "spk_embedding expects [1, 192] fp32, got \(spkInfo.shape)")
+        }
+        let spkEmbedding = try file.asFloat32("spk_embedding")
+
+        let decodedTokens = try file.asInt32("decoded_tokens")
+        let seedValue = try file.asInt32("seed").first ?? 0
+
+        let numPromptMel = try file.asInt("num_prompt_mel")
+        let audioLengthSamples = try file.asInt("audio_length_samples")
+
+        return CosyVoice3FrontendFixture(
+            lmInputEmbeds: lmInputEmbeds,
+            tPre: tPre,
+            promptSpeechIds: promptSpeechIds,
+            promptMel: promptMel,
+            promptMelFrames: promptMelFrames,
+            spkEmbedding: spkEmbedding,
+            decodedTokens: decodedTokens,
+            seed: seedValue,
+            numPromptMel: numPromptMel,
+            audioLengthSamples: audioLengthSamples)
+    }
+}
diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptAssets.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptAssets.swift
new file mode 100644
index 000000000..0c10cd203
--- /dev/null
+++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptAssets.swift
@@ -0,0 +1,115 @@
+import Foundation
+
+/// Zero-shot prompt assets bundled alongside CosyVoice3 inference.
+///
+/// Phase 2 keeps SpeechTokenizer and CAMPPlus Python-side: `llmPromptSpeechIds`
+/// and `spkEmbedding` are precomputed from a reference prompt WAV and shipped
+/// as a single safetensors file with a JSON sidecar carrying the prompt text.
+/// A later phase will regenerate these on-device once the SpeechTokenizer and
+/// CAMPPlus DSPs + CoreML bindings land.
+///
+/// The shipping layout mirrors what
+/// `verify/export_swift_fixture.py` produces, so the Phase 1 fixture doubles
+/// as a valid prompt-assets bundle:
+///
+/// ```
+/// <voiceId>.safetensors
+///     llm_prompt_speech_ids int32 [1, N_speech]
+///     prompt_mel float32 [1, 2*N_speech, 80]
+///     spk_embedding float32 [1, 192]
+///     (any other tensors are ignored)
+/// <voiceId>.json
+///     { "prompt_text": "...", "tts_text": "..." }
+/// ```
+public struct CosyVoice3PromptAssets: Sendable {
+
+    /// Prompt text seed. MUST contain `<|endofprompt|>` (id 151646).
+    public let promptText: String
+
+    /// Discrete speech token prefix fed to Flow (`token_total[:, :N_speech]`)
+    /// AND used to build the LLM prefill embed table.
+ public let promptSpeechIds: [Int32] + + /// Mel frames computed from the prompt WAV (`[1, 2*N_speech, 80]` fp32). + /// Flattened row-major `[frames * 80]`; `promptMelFrames` is the frame count. + public let promptMel: [Float] + public let promptMelFrames: Int + + /// CAMPPlus speaker embedding for the prompt voice (`[1, 192]` fp32). + public let spkEmbedding: [Float] + + public init( + promptText: String, + promptSpeechIds: [Int32], + promptMel: [Float], + promptMelFrames: Int, + spkEmbedding: [Float] + ) { + self.promptText = promptText + self.promptSpeechIds = promptSpeechIds + self.promptMel = promptMel + self.promptMelFrames = promptMelFrames + self.spkEmbedding = spkEmbedding + } + + /// Load from `.safetensors` + `.json` sidecar. + /// + /// - Parameter url: URL to the `.safetensors` file. The sidecar is expected + /// next to it with the same basename and `.json` extension. + public static func load(from url: URL) throws -> CosyVoice3PromptAssets { + let file = try SafetensorsFile(url: url) + + let idsInfo = try file.info("llm_prompt_speech_ids") + guard idsInfo.shape.count == 2, idsInfo.shape[0] == 1 else { + throw CosyVoice3Error.invalidFixture( + "llm_prompt_speech_ids expects [1, N], got \(idsInfo.shape)") + } + let promptSpeechIds = try file.asInt32("llm_prompt_speech_ids") + + let melInfo = try file.info("prompt_mel") + guard + melInfo.dtype == .f32, + melInfo.shape.count == 3, + melInfo.shape[0] == 1, + melInfo.shape[2] == CosyVoice3Constants.melBins + else { + throw CosyVoice3Error.invalidFixture( + "prompt_mel expects [1, frames, 80] fp32, got \(melInfo.shape)") + } + let promptMel = try file.asFloat32("prompt_mel") + let promptMelFrames = melInfo.shape[1] + + let spkInfo = try file.info("spk_embedding") + guard + spkInfo.dtype == .f32, + spkInfo.shape == [1, CosyVoice3Constants.speakerEmbeddingDim] + else { + throw CosyVoice3Error.invalidFixture( + "spk_embedding expects [1, 192] fp32, got \(spkInfo.shape)") + } + let spkEmbedding = try 
file.asFloat32("spk_embedding") + + let sidecarURL = url.deletingPathExtension().appendingPathExtension("json") + guard FileManager.default.fileExists(atPath: sidecarURL.path) else { + throw CosyVoice3Error.invalidFixture( + "prompt sidecar JSON not found next to \(url.lastPathComponent) — expected \(sidecarURL.lastPathComponent)" + ) + } + struct Sidecar: Decodable { let prompt_text: String } + let sidecar: Sidecar + do { + sidecar = try JSONDecoder().decode( + Sidecar.self, from: try Data(contentsOf: sidecarURL)) + } catch { + throw CosyVoice3Error.invalidFixture( + "failed to decode \(sidecarURL.lastPathComponent): \(error)") + } + + return CosyVoice3PromptAssets( + promptText: sidecar.prompt_text, + promptSpeechIds: promptSpeechIds, + promptMel: promptMel, + promptMelFrames: promptMelFrames, + spkEmbedding: spkEmbedding) + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptMel.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptMel.swift new file mode 100644 index 000000000..0e3f8a196 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3PromptMel.swift @@ -0,0 +1,307 @@ +import Accelerate +import Foundation + +/// On-device mel spectrogram extractor for CosyVoice3 prompt audio. +/// +/// Matches `matcha.utils.audio.mel_spectrogram` invoked from +/// `cosyvoice/cli/frontend.py:_extract_speech_feat` with the CosyVoice3 config +/// (see `examples/libritts/cosyvoice3/conf/cosyvoice3.yaml`): +/// +/// ``` +/// n_fft: 1920 +/// num_mels: 80 +/// sampling_rate: 24000 +/// hop_size: 480 +/// win_size: 1920 +/// fmin: 0 +/// fmax: null (→ sampling_rate / 2 = 12000 per librosa default) +/// center: False +/// ``` +/// +/// Pipeline (verbatim from the Python reference): +/// 1. reflect-pad the waveform by `(n_fft - hop_size) / 2 = 720` on each side +/// 2. 
framed STFT with `n_fft=1920, hop=480, win=1920`, periodic Hann window +/// (`torch.hann_window` default), `center=False` +/// 3. magnitude = `sqrt(real² + imag² + 1e-9)` (Matcha convention) +/// 4. `mel = mel_basis @ magnitude` using Slaney-normalized mel filterbank +/// (librosa default: HTK=False, norm='slaney') +/// 5. `log_mel = log(clamp(mel, min=1e-5))` +/// +/// The output is flattened `[T, 80]` row-major fp32, which is the layout +/// `CosyVoice3PromptAssets.promptMel` stores and the Flow model consumes as +/// `[1, 2*N_speech, 80]` after slicing to match the prompt-speech id count. +/// +/// Use `trimToTokenRatio(...)` to enforce the `frames == 2 * N_speech` +/// invariant before passing to Flow (matches the +/// `speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len` +/// clamp in the Python frontend). +public final class CosyVoice3PromptMel { + + public static let sampleRate = 24_000 + public static let nFFT = 1_920 + public static let hopSize = 480 + public static let winSize = 1_920 + public static let numMels = 80 + public static let fMin: Float = 0 + public static let fMax: Float = 12_000 // sr / 2 + /// Reflect-pad each side by `(n_fft - hop_size) / 2`. + public static let padLength = (nFFT - hopSize) / 2 // 720 + /// Magnitude epsilon before sqrt (prevents NaN gradients in training; kept + /// here for bit parity with the reference). + private static let magEps: Float = 1e-9 + /// Log floor clamp applied inside `log(clamp(x, min=1e-5))`. + private static let logFloor: Float = 1e-5 + + // Precomputed resources + private let hannWindow: [Float] + private let melBasis: [Float] // flat [numMels * numFreqBins] + private let numFreqBins: Int + private var fftSetup: vDSP_DFT_Setup? + + // Reusable buffers (not thread-safe; wrap with a queue if shared). 
+ private var frameBuf: [Float] + private var realIn: [Float] + private var imagIn: [Float] + private var realOut: [Float] + private var imagOut: [Float] + private var magnitude: [Float] + private var imagSq: [Float] + + public init() { + self.numFreqBins = Self.nFFT / 2 + 1 + // torch.hann_window(N) defaults to periodic=True — sample i of length + // N is `0.5 * (1 - cos(2πi/N))`. This matches Matcha's code path via + // the torch.stft default. + self.hannWindow = Self.hannWindowPeriodic(length: Self.winSize) + self.melBasis = Self.buildSlaneyMelBasis( + sampleRate: Self.sampleRate, + nFFT: Self.nFFT, + numMels: Self.numMels, + fMin: Self.fMin, + fMax: Self.fMax) + self.fftSetup = vDSP_DFT_zop_CreateSetup(nil, vDSP_Length(Self.nFFT), .FORWARD) + self.frameBuf = [Float](repeating: 0, count: Self.nFFT) + self.realIn = [Float](repeating: 0, count: Self.nFFT) + self.imagIn = [Float](repeating: 0, count: Self.nFFT) + self.realOut = [Float](repeating: 0, count: Self.nFFT) + self.imagOut = [Float](repeating: 0, count: Self.nFFT) + self.magnitude = [Float](repeating: 0, count: numFreqBins) + self.imagSq = [Float](repeating: 0, count: numFreqBins) + } + + deinit { + if let setup = fftSetup { + vDSP_DFT_DestroySetup(setup) + } + } + + public struct Result: Sendable { + /// `[frames * numMels]` row-major, fp32. + public let mel: [Float] + public let frames: Int + } + + /// Compute the log-mel spectrogram for a 24 kHz mono waveform. + /// + /// - Parameter audio: fp32 PCM samples at 24 kHz, range ≈ [-1, 1]. + /// - Returns: `[T * 80]` row-major fp32 mel, where + /// `T = floor((len + 2·padLength - nFFT) / hopSize) + 1`. 
+ public func compute(audio: [Float]) throws -> Result { + guard let setup = fftSetup else { + throw CosyVoice3Error.invalidShape("vDSP_DFT setup failed") + } + guard audio.count > 0 else { + return Result(mel: [], frames: 0) + } + + let padded = Self.reflectPad(audio, pad: Self.padLength) + let paddedCount = padded.count + let frames = max(0, (paddedCount - Self.nFFT) / Self.hopSize + 1) + guard frames > 0 else { + return Result(mel: [], frames: 0) + } + + var mel = [Float](repeating: 0, count: frames * Self.numMels) + + for frameIdx in 0...size) + } + } + vDSP_vclr(&imagIn, 1, vDSP_Length(Self.nFFT)) + vDSP_DFT_Execute(setup, realIn, imagIn, &realOut, &imagOut) + + // magnitude = sqrt(real² + imag² + 1e-9) over one-sided bins. + vDSP_vsq(realOut, 1, &magnitude, 1, vDSP_Length(numFreqBins)) + vDSP_vsq(imagOut, 1, &imagSq, 1, vDSP_Length(numFreqBins)) + vDSP_vadd(magnitude, 1, imagSq, 1, &magnitude, 1, vDSP_Length(numFreqBins)) + var eps = Self.magEps + vDSP_vsadd(magnitude, 1, &eps, &magnitude, 1, vDSP_Length(numFreqBins)) + var n = Int32(numFreqBins) + vvsqrtf(&magnitude, magnitude, &n) + + // mel = melBasis[80, numFreqBins] @ magnitude[numFreqBins] + var melFrame = [Float](repeating: 0, count: Self.numMels) + melBasis.withUnsafeBufferPointer { basisPtr in + magnitude.withUnsafeBufferPointer { magPtr in + melFrame.withUnsafeMutableBufferPointer { outPtr in + vDSP_mmul( + basisPtr.baseAddress!, 1, + magPtr.baseAddress!, 1, + outPtr.baseAddress!, 1, + vDSP_Length(Self.numMels), + vDSP_Length(1), + vDSP_Length(numFreqBins)) + } + } + } + + // log(clamp(x, min=1e-5)) + for m in 0.. 
(mel: [Float], frames: Int) { + let targetFrames = 2 * tokenCount + guard frames >= targetFrames else { + throw CosyVoice3Error.invalidShape( + "prompt mel has \(frames) frames but tokenCount=\(tokenCount) requires \(targetFrames)" + ) + } + if frames == targetFrames { + return (mel, frames) + } + let trimmed = Array(mel.prefix(targetFrames * numMels)) + return (trimmed, targetFrames) + } + + // MARK: - Helpers + + /// PyTorch `F.pad(..., mode="reflect")` on a 1-D signal: + /// - left: [y[pad], y[pad-1], ..., y[1]] + /// - core: y[0.. [Float] { + let n = y.count + if pad <= 0 { return y } + // PyTorch requires pad < n for reflect. Guard loudly for a silently + // bad prompt (very short audio). + precondition(pad < n, "reflect pad=\(pad) requires signal length > \(pad), got \(n)") + var out = [Float](repeating: 0, count: n + 2 * pad) + for i in 0.. [Float] { + var w = [Float](repeating: 0, count: length) + let divisor = Float(length) + for i in 0.. [Float] { + let numFreqBins = nFFT / 2 + 1 + + let melMin = hzToMelSlaney(fMin) + let melMax = hzToMelSlaney(fMax) + + var melPoints = [Float](repeating: 0, count: numMels + 2) + for i in 0..<(numMels + 2) { + let mel = melMin + Float(i) * (melMax - melMin) / Float(numMels + 1) + melPoints[i] = melToHzSlaney(mel) + } + + var fftFreqs = [Float](repeating: 0, count: numFreqBins) + for i in 0..= fLeft && freq < fCenter { + w = norm * (freq - fLeft) / (fCenter - fLeft) + } else if freq >= fCenter && freq <= fRight { + w = norm * (fRight - freq) / (fRight - fCenter) + } + basis[m * numFreqBins + f] = w + } + } + return basis + } + + static func hzToMelSlaney(_ hz: Float) -> Float { + let fSp: Float = 200.0 / 3.0 + let minLogHz: Float = 1_000.0 + let minLogMel: Float = minLogHz / fSp + let logStep: Float = log(6.4) / 27.0 + return hz >= minLogHz + ? 
minLogMel + log(hz / minLogHz) / logStep + : hz / fSp + } + + static func melToHzSlaney(_ mel: Float) -> Float { + let fSp: Float = 200.0 / 3.0 + let minLogHz: Float = 1_000.0 + let minLogMel: Float = minLogHz / fSp + let logStep: Float = log(6.4) / 27.0 + return mel >= minLogMel + ? minLogHz * exp(logStep * (mel - minLogMel)) + : fSp * mel + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3TextEmbeddings.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3TextEmbeddings.swift new file mode 100644 index 000000000..54aeaefff --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/CosyVoice3TextEmbeddings.swift @@ -0,0 +1,142 @@ +@preconcurrency import CoreML +import Foundation + +/// mmap'd reader for Qwen2 `text_embedding` [151936, 896] and CosyVoice3 +/// `speech_embedding` [6761, 896] tables (both fp32). Used by the Phase 2 +/// text frontend to assemble `lm_input_embeds` natively in Swift. +/// +/// The Phase 1 per-step decode embedding path still uses +/// `CosyVoice3SpeechEmbeddings` (fp16 table) to save memory during long +/// autoregressive loops; that code remains unchanged. 
+public final class CosyVoice3TextEmbeddings { + + private let file: SafetensorsFile + private let textBytes: Data + private let speechBytes: Data + public let textVocab: Int + public let speechVocab: Int + public let embedDim: Int + + public init(url: URL) throws { + let file = try SafetensorsFile(url: url) + guard let text = file.tensors["text_embedding"] else { + throw CosyVoice3Error.embeddingTableMissing("text_embedding") + } + guard let speech = file.tensors["speech_embedding"] else { + throw CosyVoice3Error.embeddingTableMissing("speech_embedding") + } + guard text.dtype == .f32, text.shape.count == 2 else { + throw CosyVoice3Error.invalidShape( + "text_embedding expects [vocab, 896] fp32, got shape=\(text.shape) dtype=\(text.dtype.rawValue)" + ) + } + guard speech.dtype == .f32, speech.shape.count == 2 else { + throw CosyVoice3Error.invalidShape( + "speech_embedding expects [vocab, 896] fp32, got shape=\(speech.shape) dtype=\(speech.dtype.rawValue)" + ) + } + guard text.shape[1] == speech.shape[1] else { + throw CosyVoice3Error.invalidShape( + "text_embedding dim=\(text.shape[1]) != speech_embedding dim=\(speech.shape[1])" + ) + } + self.file = file + self.textBytes = try file.rawBytes("text_embedding") + self.speechBytes = try file.rawBytes("speech_embedding") + self.textVocab = text.shape[0] + self.speechVocab = speech.shape[0] + self.embedDim = text.shape[1] + guard self.embedDim == CosyVoice3Constants.embedDim else { + throw CosyVoice3Error.invalidShape( + "embed_dim=\(embedDim) does not match CosyVoice3Constants.embedDim=\(CosyVoice3Constants.embedDim)" + ) + } + } + + /// Assemble LLM-Prefill input: + /// `lm_input = concat([sos, text_embedding[text_ids], task_id, speech_embedding[prompt_speech_ids]], dim=1)` + /// + /// Returns a `[1, T_pre, 896]` fp32 MLMultiArray and `T_pre = 1 + N_text + 1 + N_speech`. + /// The LLM-Prefill model expects T padded to 256; this method returns the + /// unpadded tensor — callers must pad or pass `T_pre` separately. 
+ public func assembleLmInput( + textTokenIds: [Int32], + promptSpeechIds: [Int32], + sos: Int32 = CosyVoice3Constants.sosId, + taskId: Int32 = CosyVoice3Constants.taskId + ) throws -> (embeds: MLMultiArray, tPre: Int) { + let nText = textTokenIds.count + let nSpeech = promptSpeechIds.count + let tPre = 1 + nText + 1 + nSpeech + let dim = embedDim + let array = try MLMultiArray( + shape: [1, NSNumber(value: tPre), NSNumber(value: dim)], + dataType: .float32) + let strides = array.strides.map { $0.intValue } + let dst = array.dataPointer.bindMemory(to: Float.self, capacity: array.count) + + // Row t (within the T_pre axis) → destination pointer. + func row(_ t: Int) -> UnsafeMutablePointer { + dst.advanced(by: t * strides[1]) + } + + // 1) sos + try copySpeechRow(sos, into: row(0), stride: strides[2]) + // 2) text_embedding[text_ids] + for (i, id) in textTokenIds.enumerated() { + try copyTextRow(id, into: row(1 + i), stride: strides[2]) + } + // 3) task_id + try copySpeechRow(taskId, into: row(1 + nText), stride: strides[2]) + // 4) speech_embedding[prompt_speech_ids] + for (i, id) in promptSpeechIds.enumerated() { + try copySpeechRow(id, into: row(1 + nText + 1 + i), stride: strides[2]) + } + + return (array, tPre) + } + + // MARK: - Row copy + + private func copyTextRow( + _ id: Int32, into dst: UnsafeMutablePointer, stride: Int + ) throws { + guard id >= 0 && Int(id) < textVocab else { + throw CosyVoice3Error.invalidShape( + "text token id \(id) out of range [0, \(textVocab))") + } + let rowStart = Int(id) * embedDim * 4 + textBytes.withUnsafeBytes { src in + let basePtr = src.baseAddress!.advanced(by: rowStart) + .assumingMemoryBound(to: Float.self) + if stride == 1 { + memcpy(dst, basePtr, embedDim * 4) + } else { + for i in 0.., stride: Int + ) throws { + guard id >= 0 && Int(id) < speechVocab else { + throw CosyVoice3Error.invalidShape( + "speech token id \(id) out of range [0, \(speechVocab))") + } + let rowStart = Int(id) * embedDim * 4 + 
speechBytes.withUnsafeBytes { src in + let basePtr = src.baseAddress!.advanced(by: rowStart) + .assumingMemoryBound(to: Float.self) + if stride == 1 { + memcpy(dst, basePtr, embedDim * 4) + } else { + for i in 0..` token + /// (id 151646). The Python pipeline asserts this in + /// `cosyvoice/llm.py:478`. + public func assemble( + promptText: String, + ttsText: String, + promptSpeechIds: [Int32] + ) throws -> Assembled { + let promptIds = tokenizer.encode(promptText) + let ttsIds = tokenizer.encode(ttsText) + // Python asserts 151646 is present somewhere in the combined token + // stream. Enforce here to avoid silent parity breakage. + let endOfPrompt: Int32 = 151_646 + guard promptIds.contains(endOfPrompt) || ttsIds.contains(endOfPrompt) else { + throw CosyVoice3Error.invalidShape( + "<|endofprompt|> (id 151646) not present in promptText or ttsText") + } + let combined = promptIds + ttsIds + + let (embeds, tPre) = try embeddings.assembleLmInput( + textTokenIds: combined, + promptSpeechIds: promptSpeechIds) + guard tPre <= CosyVoice3Constants.prefillLength else { + throw CosyVoice3Error.invalidShape( + "assembled T_pre=\(tPre) exceeds LLM-Prefill length \(CosyVoice3Constants.prefillLength)" + ) + } + return Assembled(lmInputEmbeds: embeds, tPre: tPre, textTokenIds: combined) + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2BpeTokenizer.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2BpeTokenizer.swift new file mode 100644 index 000000000..29c39a8e6 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Preprocess/Qwen2BpeTokenizer.swift @@ -0,0 +1,277 @@ +import Foundation + +/// Qwen2 byte-level BPE tokenizer. Mirrors +/// `transformers.models.qwen2.tokenization_qwen2.Qwen2Tokenizer` on the slow +/// path used by CosyVoice3 (`AutoTokenizer.from_pretrained(...)` + runtime +/// `add_special_tokens(...)` as done in `CosyVoice3Tokenizer`). +/// +/// Encoding pipeline: +/// 1. 
Split input on registered special tokens (longest-match first). Special +/// chunks map 1:1 to their fixed ID. +/// 2. Pretokenize non-special chunks with Qwen2's regex. +/// 3. UTF-8 encode each match and remap bytes via the GPT-2 byte→unicode +/// shim (`ByteEncoder` below). +/// 4. Apply BPE merges (lowest rank wins, all occurrences merged per pass). +/// 5. Look up the resulting symbols in `vocab.json` to get token IDs. +/// +/// Loader accepts the standard HuggingFace asset layout: +/// /vocab.json — {"symbol": id, ...} +/// /merges.txt — first line is a header or the first merge; +/// subsequent lines are "A B" pairs, rank = line idx. +/// Special tokens are passed in separately (from a JSON map exported alongside +/// the CosyVoice3 fixtures — the runtime add_special_tokens list in Python is +/// not encoded in the HF assets). +public final class Qwen2BpeTokenizer { + + public enum Error: Swift.Error, LocalizedError { + case fileNotFound(URL) + case invalidJSON(String) + case missingField(String) + case regexCompileFailed + + public var errorDescription: String? { + switch self { + case .fileNotFound(let url): return "file not found: \(url.path)" + case .invalidJSON(let m): return "invalid JSON: \(m)" + case .missingField(let f): return "missing field: \(f)" + case .regexCompileFailed: return "failed to compile pretokenize regex" + } + } + } + + /// Qwen2 pretokenize regex (see `transformers` PRETOKENIZE_REGEX). + /// Matches: contractions, letter words, single digits, punctuation runs, + /// newline-led whitespace, trailing whitespace. + public static let pretokenizePattern = + #"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"# + + private let vocab: [String: Int32] + private let mergeRanks: [String: Int] // "firstSpace second" -> rank + private let specialTokens: [String: Int32] + private let specialPattern: NSRegularExpression? 
+ private let pretokenizeRegex: NSRegularExpression + + public init( + vocab: [String: Int32], + merges: [(String, String)], + specialTokens: [String: Int32] + ) throws { + self.vocab = vocab + var ranks: [String: Int] = [:] + ranks.reserveCapacity(merges.count) + for (i, pair) in merges.enumerated() { + ranks["\(pair.0) \(pair.1)"] = i + } + self.mergeRanks = ranks + self.specialTokens = specialTokens + + if !specialTokens.isEmpty { + // Longest-first so `<|endofprompt|>` wins over `<|end`. + let ordered = specialTokens.keys.sorted { $0.count > $1.count } + let alternation = ordered.map { NSRegularExpression.escapedPattern(for: $0) } + .joined(separator: "|") + self.specialPattern = try NSRegularExpression(pattern: alternation) + } else { + self.specialPattern = nil + } + + do { + self.pretokenizeRegex = try NSRegularExpression(pattern: Self.pretokenizePattern) + } catch { + throw Error.regexCompileFailed + } + } + + /// Load vocab.json + merges.txt from a directory and attach the runtime + /// special-token map (must be supplied externally; Python `AutoTokenizer` + /// adds these at import time via `add_special_tokens`). + public static func load( + directory: URL, + specialTokens: [String: Int32] + ) throws -> Qwen2BpeTokenizer { + let vocabURL = directory.appendingPathComponent("vocab.json") + let mergesURL = directory.appendingPathComponent("merges.txt") + guard FileManager.default.fileExists(atPath: vocabURL.path) else { + throw Error.fileNotFound(vocabURL) + } + guard FileManager.default.fileExists(atPath: mergesURL.path) else { + throw Error.fileNotFound(mergesURL) + } + + let vocabData = try Data(contentsOf: vocabURL) + guard let raw = try JSONSerialization.jsonObject(with: vocabData) as? 
[String: Int] else { + throw Error.invalidJSON("vocab.json is not {String: Int}") + } + var vocab: [String: Int32] = [:] + vocab.reserveCapacity(raw.count) + for (k, v) in raw { vocab[k] = Int32(v) } + + let mergesText = try String(contentsOf: mergesURL, encoding: .utf8) + var merges: [(String, String)] = [] + merges.reserveCapacity(140_000) + var isFirst = true + for line in mergesText.split(separator: "\n", omittingEmptySubsequences: true) { + if isFirst { + isFirst = false + // Typical merges.txt header: "#version: 0.2". Skip it. + if line.hasPrefix("#") { continue } + } + let parts = line.split(separator: " ", maxSplits: 1) + guard parts.count == 2 else { continue } + merges.append((String(parts[0]), String(parts[1]))) + } + + return try Qwen2BpeTokenizer(vocab: vocab, merges: merges, specialTokens: specialTokens) + } + + /// Encode text to token IDs. + public func encode(_ text: String) -> [Int32] { + var out: [Int32] = [] + splitBySpecial(text) { chunk, isSpecial in + if isSpecial { + if let id = specialTokens[chunk] { out.append(id) } + return + } + pretokenize(chunk) { piece in + let mapped = ByteEncoder.encode(piece.utf8) + let bpeTokens = bpe(mapped) + for tok in bpeTokens { + if let id = vocab[tok] { + out.append(id) + } else if let id = specialTokens[tok] { + out.append(id) + } + // Unknown token: Qwen2 has no . Drop silently as + // upstream never produces one for valid UTF-8 input. 
+ } + } + } + return out + } + + // MARK: - Special token split + + private func splitBySpecial(_ text: String, _ handle: (String, Bool) -> Void) { + guard let regex = specialPattern, !text.isEmpty else { + if !text.isEmpty { handle(text, false) } + return + } + let ns = text as NSString + let range = NSRange(location: 0, length: ns.length) + var cursor = 0 + regex.enumerateMatches(in: text, options: [], range: range) { match, _, _ in + guard let m = match else { return } + if m.range.location > cursor { + let sub = ns.substring(with: NSRange(location: cursor, length: m.range.location - cursor)) + if !sub.isEmpty { handle(sub, false) } + } + handle(ns.substring(with: m.range), true) + cursor = m.range.location + m.range.length + } + if cursor < ns.length { + let sub = ns.substring(with: NSRange(location: cursor, length: ns.length - cursor)) + if !sub.isEmpty { handle(sub, false) } + } + } + + // MARK: - Pretokenize + + private func pretokenize(_ text: String, _ handle: (String) -> Void) { + guard !text.isEmpty else { return } + let ns = text as NSString + let range = NSRange(location: 0, length: ns.length) + pretokenizeRegex.enumerateMatches(in: text, options: [], range: range) { match, _, _ in + guard let m = match else { return } + if m.range.length > 0 { + handle(ns.substring(with: m.range)) + } + } + } + + // MARK: - BPE + + /// Standard GPT-2 BPE: repeatedly merge the lowest-rank adjacent pair + /// until no pair is mergeable, then return the final symbol list. 
+ private func bpe(_ text: String) -> [String] { + if text.isEmpty { return [] } + var symbols = text.map { String($0) } + if symbols.count < 2 { return symbols } + + while true { + var bestRank = Int.max + var bestIndex = -1 + for i in 0..<(symbols.count - 1) { + let key = "\(symbols[i]) \(symbols[i + 1])" + if let r = mergeRanks[key], r < bestRank { + bestRank = r + bestIndex = i + } + } + if bestIndex < 0 { break } + + let first = symbols[bestIndex] + let second = symbols[bestIndex + 1] + var merged: [String] = [] + merged.reserveCapacity(symbols.count - 1) + var i = 0 + while i < symbols.count { + if i < symbols.count - 1 && symbols[i] == first && symbols[i + 1] == second { + merged.append(first + second) + i += 2 + } else { + merged.append(symbols[i]) + i += 1 + } + } + symbols = merged + if symbols.count < 2 { break } + } + return symbols + } + + // MARK: - Byte encoder + + /// GPT-2 style reversible byte→unicode mapping used by Qwen2 BPE. + /// + /// Mirrors `transformers.models.qwen2.tokenization_qwen2.bytes_to_unicode`: + /// - Printable ASCII, Latin-1 supplement (¡..¬), and (®..ÿ) map to themselves. + /// - The 68 "unprintable" bytes are remapped to code points 256..323. + /// + /// After mapping, every byte of a UTF-8 string becomes a single-code-point + /// unicode character that vocab/merges.txt expect. + fileprivate enum ByteEncoder { + + /// byte (0..255) → single Unicode scalar. + static let byteToUnicode: [Character] = { + var map = [Character](repeating: Character(" "), count: 256) + var printable = [Int]() + printable.reserveCapacity(188) + printable.append(contentsOf: Int(Character("!").asciiValue!)...Int(Character("~").asciiValue!)) + printable.append(contentsOf: 0xA1...0xAC) + printable.append(contentsOf: 0xAE...0xFF) + + for b in printable { + map[b] = Character(UnicodeScalar(b)!) + } + + var extra = 0 + for b in 0..<256 { + if !printable.contains(b) { + let scalar = UnicodeScalar(256 + extra)! 
+ map[b] = Character(scalar) + extra += 1 + } + } + return map + }() + + /// Encode a UTF-8 byte sequence as a string of mapped characters. + static func encode(_ bytes: some Sequence) -> String { + var out = "" + for b in bytes { + out.append(byteToUnicode[Int(b)]) + } + return out + } + } +} diff --git a/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3RasSampler.swift b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3RasSampler.swift new file mode 100644 index 000000000..f4cd579c0 --- /dev/null +++ b/Sources/FluidAudio/TTS/CosyVoice3/Pipeline/Synthesize/CosyVoice3RasSampler.swift @@ -0,0 +1,175 @@ +import Foundation + +/// RAS (Repetition-Aware Sampling) — top-p nucleus sampling with a repetition +/// mask that re-samples if a token fires too often in the recent window. +/// +/// Mirrors `ras_sampling` in +/// `mobius/.../verify/test_coreml_e2e_fp16.py`: +/// 1. softmax(logp) → stable-sort desc → pick up to `topK` ids until +/// cumulative mass ≥ `topP` +/// 2. multinomial draw within that candidate set +/// 3. if the drawn id appears in the last `winSize` decoded tokens at least +/// `winSize * tauR` times, mask it to -inf and re-sample across the full +/// vocab +/// +/// A `seedTokens` mode bypasses the RNG entirely — the sampler just emits the +/// pre-recorded Python token stream one id at a time. This is how the parity +/// harness bit-matches despite the `torch.multinomial` RNG mismatch between +/// PyTorch and Swift. 
+public final class CosyVoice3RasSampler { + + public let topP: Float + public let topK: Int + public let winSize: Int + public let tauR: Float + public let vocabSize: Int + + private var rng: SeedableRng + private var seedQueue: [Int32] + private var seedIdx: Int = 0 + + public init( + topP: Float = CosyVoice3Constants.topP, + topK: Int = CosyVoice3Constants.topK, + winSize: Int = CosyVoice3Constants.rasWindow, + tauR: Float = CosyVoice3Constants.rasTauR, + vocabSize: Int = CosyVoice3Constants.speechVocab, + seed: UInt64 = 42 + ) { + self.topP = topP + self.topK = topK + self.winSize = winSize + self.tauR = tauR + self.vocabSize = vocabSize + self.rng = SeedableRng(seed: seed) + self.seedQueue = [] + } + + /// Pre-load a token stream to replay (for parity harness). + public func seedTokens(_ tokens: [Int32]) { + self.seedQueue = tokens + self.seedIdx = 0 + } + + /// Given `logits` of shape `[vocabSize]`, return the sampled token id. + /// `decodedSoFar` is the running decoded stream for repetition checking. + public func sample(logits: [Float], decodedSoFar: [Int32]) -> Int32 { + // Seeded parity replay bypasses sampling. + if seedIdx < seedQueue.count { + let id = seedQueue[seedIdx] + seedIdx += 1 + return id + } + precondition(logits.count == vocabSize, "logits count must match vocabSize") + + // Pass 1: nucleus sampling. + let probs = logits.softmax() + let top = nucleus(probs: probs) + var sampled = top + + // Pass 2: repetition mask. + let windowStart = max(0, decodedSoFar.count - winSize) + let recent = decodedSoFar[windowStart..= Float(winSize) * tauR { + var masked = probs + masked[Int(sampled)] = 0 + // Re-normalize + multinomial across full vocab. + let sum = masked.reduce(0, +) + if sum > 0 { + for i in 0.. Int32 { + // Stable sort descending with index. 
+ let sorted = probs.enumerated().sorted { + if $0.element != $1.element { return $0.element > $1.element } + return $0.offset < $1.offset + } + var cum: Float = 0 + var selIdx: [Int] = [] + var selProb: [Float] = [] + for entry in sorted { + if cum < topP && selProb.count < topK { + cum += entry.element + selProb.append(entry.element) + selIdx.append(entry.offset) + } else { + break + } + } + // Normalize selected candidates and multinomial pick. + let sum = selProb.reduce(0, +) + guard sum > 0 else { return Int32(selIdx.first ?? 0) } + for i in 0.. Int32 { + let u = rng.nextFloat() + var cum: Float = 0 + for (i, p) in probs.enumerated() { + cum += p + if u < cum { return Int32(i) } + } + return Int32(probs.count - 1) + } + + private func multinomialInSet(probs: [Float], ids: [Int]) -> Int { + let u = rng.nextFloat() + var cum: Float = 0 + for (j, p) in probs.enumerated() { + cum += p + if u < cum { return ids[j] } + } + return ids.last ?? 0 + } +} + +// MARK: - Simple deterministic RNG + +/// Linear-congruential PRNG wrapping SplitMix64. Used only as a fallback when +/// parity replay isn't active; the parity harness seeds an explicit token list +/// to dodge `torch.multinomial` divergence. +private struct SeedableRng { + private var state: UInt64 + init(seed: UInt64) { self.state = seed == 0 ? 0xdead_beef : seed } + mutating func nextUInt64() -> UInt64 { + state &+= 0x9E37_79B9_7F4A_7C15 + var z = state + z = (z ^ (z >> 30)) &* 0xBF58_476D_1CE4_E5B9 + z = (z ^ (z >> 27)) &* 0x94D0_49BB_1331_11EB + return z ^ (z >> 31) + } + mutating func nextFloat() -> Float { + // 24-bit mantissa → [0, 1) + let bits = UInt32(truncatingIfNeeded: nextUInt64() >> 40) + return Float(bits) / Float(1 << 24) + } +} + +// MARK: - Array softmax + +extension Array where Element == Float { + fileprivate func softmax() -> [Float] { + guard let m = self.max() else { return self } + var exps = [Float](repeating: 0, count: self.count) + var sum: Float = 0 + for i in 0.. 0 { + for i in 0.. 
MLMultiArray { + let array = try MLMultiArray( + shape: [1, 1, NSNumber(value: embedDim)], + dataType: .float32) + try copyEmbedding(tokenId: tokenId, into: array) + return array + } + + /// Copy the fp16 embedding row for `tokenId` into an existing + /// `[1, 1, embedDim]` fp32 MLMultiArray. Avoids the per-step allocation + /// of `embedding(tokenId:)` in the hot decode loop. + public func copyEmbedding(tokenId: Int32, into array: MLMultiArray) throws { + guard tokenId >= 0 && Int(tokenId) < numTokens else { + throw CosyVoice3Error.invalidShape( + "speech token id \(tokenId) out of range [0, \(numTokens))") + } + let rowStart = Int(tokenId) * rowByteSize + let dim = embedDim + let lastStride = array.strides.last?.intValue ?? 1 + tableBytes.withUnsafeBytes { src in + let basePtr = src.baseAddress!.advanced(by: rowStart) + let fp16Ptr = basePtr.assumingMemoryBound(to: Float16.self) + let dstPtr = array.dataPointer.bindMemory(to: Float.self, capacity: array.count) + for i in 0.. CosyVoice3SynthesisResult { + + let nPrompt = fixture.promptSpeechIds.count + let roomForNew = CosyVoice3Constants.flowTotalTokens - nPrompt + guard roomForNew > 0 else { + throw CosyVoice3Error.sequenceTooLong(nPrompt) + } + let maxNew: Int = { + if let cap = options.maxNewTokens, cap > 0 { return min(cap, roomForNew) } + return roomForNew + }() + + // Sampler. Parity harness seeds the Python-recorded decode stream. + let sampler = CosyVoice3RasSampler(seed: options.seed) + if options.replayDecodedTokens { + sampler.seedTokens(fixture.decodedTokens) + } + + // 1) Prefill (non-stateful: returns kv_k / kv_v as outputs) + let tPrefill = Date() + let (prefillLogits, initialKvK, initialKvV) = try await runPrefill(fixture: fixture) + let prefillSec = Date().timeIntervalSince(tPrefill) + + // Seed decode MLState from prefill kv_k / kv_v. 
+ let tSeed = Date() + let state = models.decode.makeState() + try seedDecodeState(state: state, kvK: initialKvK, kvV: initialKvV) + let seedSec = Date().timeIntervalSince(tSeed) + + // Reusable per-step inputs for decode. `curLenArr` is mutated in place + // each step; `inputsEmbedsArr` is overwritten by memcpy per step. + let curLenArr = try MLMultiArray(shape: [1], dataType: .int32) + let inputsEmbedsArr = try MLMultiArray( + shape: [1, 1, NSNumber(value: CosyVoice3Constants.embedDim)], + dataType: .float32) + + // First token from prefill tail logits. + var decoded: [Int32] = [] + let firstLogits = sliceLastStepLogits( + from: prefillLogits, + tPre: fixture.tPre, + vocab: CosyVoice3Constants.speechVocab) + var topId = sampler.sample(logits: firstLogits, decodedSoFar: decoded) + if CosyVoice3Constants.stopRange.contains(topId) { + // Prefill emitted EOS at step 0 — the LLM signaled "no speech". + // Bail out instead of feeding the stop-token embedding into the + // decode loop (which would accumulate semantically meaningless + // tokens into `decoded`). + logger.info("First token \(topId) is a stop token; no speech generated") + throw CosyVoice3Error.predictionFailed("LLM produced no speech tokens") + } + decoded.append(topId) + + // 2) Decode loop + var curLen = fixture.tPre + var decodeSteps = 0 + let tDecode = Date() + for step in 1.. 0 ? 
Double(decodeSteps) / decodeSec : 0 + logger.info( + String( + format: + "STAGES prefill=%.3fs seed=%.3fs decode=%.3fs(%d steps, %.2f tok/s) flow=%.3fs hift=%.3fs", + prefillSec, seedSec, decodeSec, decodeSteps, decodeTps, flowSec, hiftSec)) + + return CosyVoice3SynthesisResult( + samples: audio, + sampleRate: CosyVoice3Constants.sampleRate, + generatedTokenCount: nNew, + decodedTokens: decoded) + } + + // MARK: - Stages + + private func runPrefill( + fixture: CosyVoice3FrontendFixture + ) async throws -> (logits: MLMultiArray, kvK: MLMultiArray, kvV: MLMultiArray) { + guard fixture.tPre <= CosyVoice3Constants.prefillLength else { + throw CosyVoice3Error.prefillTooLong(fixture.tPre) + } + // Pad lm_input_embeds from [1, tPre, 896] to [1, 256, 896]. + // Strides may be non-compact (e.g. [T*D_padded, D_padded, 1]). + let embeds = try MLMultiArray( + shape: [ + 1, + NSNumber(value: CosyVoice3Constants.prefillLength), + NSNumber(value: CosyVoice3Constants.embedDim), + ], + dataType: .float32) + let embedDim = CosyVoice3Constants.embedDim + let embedsStrides = embeds.strides.map { $0.intValue } + let dst = embeds.dataPointer.bindMemory(to: Float.self, capacity: embeds.count) + let physicalCount = embedsStrides[0] * embeds.shape[0].intValue + dst.initialize(repeating: 0, count: physicalCount) + for t in 0.. [Float] { + let features: [String: Any] = [ + "inputs_embeds": inputsEmbeds, + "cur_len": curLen, + ] + let provider = try MLDictionaryFeatureProvider(dictionary: features) + let output = try models.decode.prediction(from: provider, using: state) + + guard + let logitsArr = output.featureValue(for: "speech_logits")?.multiArrayValue + else { + throw CosyVoice3Error.predictionFailed("decode: missing speech_logits") + } + // logits shape = [1, 1, 6761] fp32; strides may be non-compact. + let count = CosyVoice3Constants.speechVocab + var logits = [Float](repeating: 0, count: count) + let strides = logitsArr.strides.map { $0.intValue } + let vocabStride = strides.last ?? 
1 + let base = logitsArr.dataPointer.bindMemory(to: Float.self, capacity: logitsArr.count) + for i in 0.., + srcLayerBase: Int, + srcHStride: Int, srcMStride: Int, srcDStride: Int, + dst: UnsafeMutablePointer, + dstHStride: Int, dstMStride: Int, dstDStride: Int, + H: Int, M: Int, D: Int + ) { + for h in 0.. (mel: MLMultiArray, numPromptMel: Int) { + let N = CosyVoice3Constants.flowTotalTokens + let nPrompt = promptSpeechIds.count + let nNew = decodedTokens.count + let nTotal = nPrompt + nNew + guard nTotal <= N else { + throw CosyVoice3Error.sequenceTooLong(nTotal) + } + // token_total: [1, 250] int32, zero-padded. Respect strides. + let tokenTotal = try MLMultiArray( + shape: [1, NSNumber(value: N)], + dataType: .int32) + let ttStrides = tokenTotal.strides.map { $0.intValue } + let ttPtr = tokenTotal.dataPointer.bindMemory(to: Int32.self, capacity: tokenTotal.count) + let ttPhysical = ttStrides[0] * tokenTotal.shape[0].intValue + ttPtr.initialize(repeating: 0, count: ttPhysical) + for i in 0.. [Float] { + // fullMel logical shape = [1, 80, 500]. Physical strides may be + // non-compact (e.g. [40960, 512, 1]) — use logical indexing. + // Dtype depends on the Flow variant: the ANE-port Flow emits fp16 to + // keep the graph fp16 end-to-end; the prior cpuAndGPU Flow emits fp32. + // HiFT's `mel` input is always fp32 at the CoreML I/O boundary. + let hiftFrames = CosyVoice3Constants.hiftMaxFrames + let melBins = CosyVoice3Constants.melBins + // fullMel logical shape = [1, 80, totalMelFrames]. Clamp the valid + // window to the remaining frames after `newMelStart` so a slightly + // off `num_prompt_mel` from the Flow model can never cause an + // out-of-bounds read at `srcBase[newMelStart + f]`. + let totalMelFrames = fullMel.shape.count >= 3 ? 
fullMel.shape[2].intValue : hiftFrames + guard newMelStart >= 0 && newMelStart <= totalMelFrames else { + throw CosyVoice3Error.invalidShape( + "runHiFT: newMelStart=\(newMelStart) out of range [0, \(totalMelFrames)]") + } + let availableFrames = max(0, totalMelFrames - newMelStart) + let validFrames = min(newMelFrames, hiftFrames, availableFrames) + + let melInput = try MLMultiArray( + shape: [1, NSNumber(value: melBins), NSNumber(value: hiftFrames)], + dataType: .float32) + // melInput strides may also be non-compact — use logical indexing. + let melInputStrides = melInput.strides.map { $0.intValue } + let dstBase = melInput.dataPointer.bindMemory(to: Float.self, capacity: melInput.count) + // Zero-fill entire physical extent (handles padded strides). + let totalPhysical = melInputStrides[0] * melInput.shape[0].intValue + dstBase.initialize(repeating: 0, count: totalPhysical) + + let srcStrides = fullMel.strides.map { $0.intValue } + // fullMel logical: [1, 80, 500]; copy new slice → melInput [1, 80, 500]. + // Branch on src dtype so the fp16 ANE-port Flow output doesn't get + // reinterpreted as fp32 (would read past end of buffer → SIGSEGV). + switch fullMel.dataType { + case .float16: + let srcBase = fullMel.dataPointer.bindMemory( + to: Float16.self, capacity: fullMel.count) + for b in 0.. [Float] { + let strides = logits.strides.map { $0.intValue } + // shape = [1, T, V]; row (time) stride is strides[1], vocab stride is strides[2]. + let rowStride = strides[1] + let vocabStride = strides[2] + let ptr = logits.dataPointer.bindMemory(to: Float.self, capacity: logits.count) + let base = (tPre - 1) * rowStride + var out = [Float](repeating: 0, count: vocab) + for i in 0..": {"dtype": "...", "shape": [...], "data_offsets": [start, end]}, ... }` +/// - raw tensor payload (referenced by offsets above) +/// +/// Used for Phase 1 fixture + speech embedding table mmap. 
+public final class SafetensorsFile { + + public enum DType: String, Sendable { + case f16 = "F16" + case bf16 = "BF16" + case f32 = "F32" + case f64 = "F64" + case i8 = "I8" + case i16 = "I16" + case i32 = "I32" + case i64 = "I64" + case u8 = "U8" + case u16 = "U16" + case u32 = "U32" + case u64 = "U64" + case bool = "BOOL" + + public var byteSize: Int { + switch self { + case .f16, .bf16, .i16, .u16: return 2 + case .f32, .i32, .u32: return 4 + case .f64, .i64, .u64: return 8 + case .i8, .u8, .bool: return 1 + } + } + } + + public struct TensorInfo: Sendable { + public let dtype: DType + public let shape: [Int] + public let dataStart: Int // absolute offset in file + public let dataEnd: Int + public var byteCount: Int { dataEnd - dataStart } + } + + private let data: Data + private let payloadStart: Int + public let tensors: [String: TensorInfo] + + public init(url: URL) throws { + let data = try Data(contentsOf: url, options: [.alwaysMapped]) + guard data.count >= 8 else { + throw CosyVoice3Error.invalidSafetensors("file smaller than 8 byte header: \(url.path)") + } + self.data = data + + let headerLen: UInt64 = data.withUnsafeBytes { buf in + var v: UInt64 = 0 + memcpy(&v, buf.baseAddress!, 8) + return UInt64(littleEndian: v) + } + let headerEnd = 8 + Int(headerLen) + guard headerEnd <= data.count else { + throw CosyVoice3Error.invalidSafetensors( + "header length \(headerLen) exceeds file size \(data.count)") + } + let headerData = data.subdata(in: 8.. Data { + guard let info = tensors[name] else { + throw CosyVoice3Error.invalidSafetensors("tensor not found: \(name)") + } + return data.subdata(in: info.dataStart.. 
TensorInfo { + guard let info = tensors[name] else { + throw CosyVoice3Error.invalidSafetensors("tensor not found: \(name)") + } + return info + } + + // MARK: - Typed accessors (copying) + + public func asFloat32(_ name: String) throws -> [Float] { + let info = try self.info(name) + let bytes = try rawBytes(name) + switch info.dtype { + case .f32: + return bytes.withUnsafeBytes { buf -> [Float] in + let count = buf.count / 4 + let ptr = buf.bindMemory(to: Float.self) + return Array(UnsafeBufferPointer(start: ptr.baseAddress, count: count)) + } + case .f64: + return bytes.withUnsafeBytes { buf -> [Float] in + let count = buf.count / 8 + let ptr = buf.bindMemory(to: Double.self) + return (0.. [Int32] { + let info = try self.info(name) + let bytes = try rawBytes(name) + switch info.dtype { + case .i32: + return bytes.withUnsafeBytes { buf -> [Int32] in + let count = buf.count / 4 + let ptr = buf.bindMemory(to: Int32.self) + return Array(UnsafeBufferPointer(start: ptr.baseAddress, count: count)) + } + case .i64: + return bytes.withUnsafeBytes { buf -> [Int32] in + let count = buf.count / 8 + let ptr = buf.bindMemory(to: Int64.self) + return (0.. Int { + let values = try asInt32(name) + guard let first = values.first else { + throw CosyVoice3Error.invalidSafetensors("tensor \(name) is empty") + } + return Int(first) + } +} diff --git a/Sources/FluidAudio/TTS/TtsBackend.swift b/Sources/FluidAudio/TTS/TtsBackend.swift index 7a67049b9..aee95bbcf 100644 --- a/Sources/FluidAudio/TTS/TtsBackend.swift +++ b/Sources/FluidAudio/TTS/TtsBackend.swift @@ -6,6 +6,15 @@ public enum TtsBackend: Sendable { case kokoro /// PocketTTS — flow-matching language model, autoregressive streaming synthesis. case pocketTts + /// CosyVoice3 — Mandarin zero-shot voice cloning via Qwen2 LM + Flow CFM + HiFT. + /// + /// > Note: **Experimental / beta.** End-to-end synthesis is currently + /// > slow (RTFx < 1.0 typical on Apple Silicon). 
Cause is partially + /// > in the Flow CFM stage which must run fp32 on CPU/GPU (fp16 + ANE + /// > produces NaNs through fused `layer_norm`) and partially in HiFT + /// > sinegen ops that fall back to CPU. May be a model issue, may be + /// > recoverable via better conversion — treat as preliminary. + case cosyvoice3 /// laishere/kokoro 7-stage CoreML chain (ALBERT → PostAlbert → Alignment → /// Prosody → Noise → Vocoder → Tail) with per-stage ANE/GPU assignment. case kokoroAne diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3/FrontendParityCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3/FrontendParityCommand.swift new file mode 100644 index 000000000..70520a561 --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3/FrontendParityCommand.swift @@ -0,0 +1,146 @@ +import CoreML +import FluidAudio +import Foundation + +/// Phase 2 text-frontend parity harness. +/// +/// Loads `shipping.safetensors` (expected `lm_input_embeds`, `llm_prompt_speech_ids`) +/// plus its JSON sidecar (`prompt_text`, `tts_text`), tokenizes the text via +/// `Qwen2BpeTokenizer`, assembles via `CosyVoice3TextFrontend`, and compares +/// element-wise against the fixture. 
+/// +/// Usage: +/// ``` +/// fluidaudio tts --backend cosyvoice3-frontend-parity \ +/// --tokenizer-dir .../cosyvoice3_dl/CosyVoice-BlankEN \ +/// --embeddings-file .../build/embeddings/embeddings-fp32.safetensors \ +/// --fixture .../build/frontend/shipping.safetensors \ +/// --tok-fixture .../build/frontend/tokenizer_fixture.json +/// ``` +enum CosyVoice3FrontendParityCLI { + + private static let logger = AppLogger(category: "CosyVoice3FrontendParityCLI") + + static func run( + tokenizerDir: String, + embeddingsFile: String, + fixturePath: String, + tokFixturePath: String + ) async { + let tokURL = URL( + fileURLWithPath: (tokenizerDir as NSString).expandingTildeInPath, + isDirectory: true) + let embURL = URL(fileURLWithPath: (embeddingsFile as NSString).expandingTildeInPath) + let fixURL = URL(fileURLWithPath: (fixturePath as NSString).expandingTildeInPath) + let tokFixURL = URL(fileURLWithPath: (tokFixturePath as NSString).expandingTildeInPath) + let sidecarURL = fixURL.deletingPathExtension().appendingPathExtension("json") + + struct TokFix: Decodable { + let special_tokens: [String: Int32] + } + struct Sidecar: Decodable { + let prompt_text: String + let tts_text: String + } + + do { + let tokFix = try JSONDecoder().decode( + TokFix.self, from: try Data(contentsOf: tokFixURL)) + let sidecar = try JSONDecoder().decode( + Sidecar.self, from: try Data(contentsOf: sidecarURL)) + + let tStart = Date() + let tokenizer = try Qwen2BpeTokenizer.load( + directory: tokURL, specialTokens: tokFix.special_tokens) + let embeddings = try CosyVoice3TextEmbeddings(url: embURL) + logger.info( + "Loaded tokenizer + text_embedding table in \(String(format: "%.2fs", Date().timeIntervalSince(tStart)))" + ) + + let fixture = try CosyVoice3FrontendFixture.load(from: fixURL) + logger.info("Fixture: T_pre=\(fixture.tPre) N_prompt_speech=\(fixture.promptSpeechIds.count)") + + let frontend = CosyVoice3TextFrontend(tokenizer: tokenizer, embeddings: embeddings) + let assembled = try 
frontend.assemble( + promptText: sidecar.prompt_text, + ttsText: sidecar.tts_text, + promptSpeechIds: fixture.promptSpeechIds) + + print("") + print(" swift T_pre : \(assembled.tPre)") + print(" fixture T_pre : \(fixture.tPre)") + + guard assembled.tPre == fixture.tPre else { + print("T_pre mismatch — tokenization diverged.") + exit(1) + } + + // Element-wise comparison: fixture is compact fp32, swift array + // may have padded strides. + let dim = CosyVoice3Constants.embedDim + let strides = assembled.lmInputEmbeds.strides.map { $0.intValue } + let ptr = assembled.lmInputEmbeds.dataPointer.bindMemory( + to: Float.self, capacity: assembled.lmInputEmbeds.count) + var maxAbs: Double = 0 + var maxAt: (t: Int, d: Int) = (0, 0) + var sumAbs: Double = 0 + var rowMax = [Double](repeating: 0, count: assembled.tPre) + let n = assembled.tPre * dim + for t in 0.. rowMax[t] { rowMax[t] = a } + if a > maxAbs { + maxAbs = a + maxAt = (t, d) + } + } + } + let mae = sumAbs / Double(n) + print(" MAE : \(String(format: "%.6e", mae))") + print(" max|Δ| : \(String(format: "%.6e", maxAbs)) at (t=\(maxAt.t), d=\(maxAt.d))") + + // Show the top-5 worst rows to see if divergence is concentrated + // at sos (t=0), task_id (t=1+nText), or specific text/speech rows. + let N_speech = fixture.promptSpeechIds.count + let nText = assembled.tPre - 2 - N_speech + print( + " layout : sos@0 text@1..\(nText) task@\(1 + nText) speech@\(2 + nText)..\(assembled.tPre - 1)" + ) + let ranked = rowMax.enumerated().sorted { $0.element > $1.element }.prefix(5) + print(" top rows:") + for (t, m) in ranked { + let slot: String + if t == 0 { + slot = "sos" + } else if t == 1 + nText { + slot = "task_id" + } else if t < 1 + nText { + slot = "text[\(t - 1)]" + } else { + slot = "speech[\(t - 2 - nText)]" + } + print( + " t=\(t) \(slot.padding(toLength: 12, withPad: " ", startingAt: 0)) max|Δ|=\(String(format: "%.6e", m))" + ) + } + + // Compare Swift's reconstructed token ids for sanity. 
+ print(" swift textToken ids (first 10): \(assembled.textTokenIds.prefix(10).map { $0 })") + print(" swift textToken ids (last 5) : \(assembled.textTokenIds.suffix(5).map { $0 })") + + if maxAbs > 1e-4 { + print("parity tolerance exceeded (max|Δ| > 1e-4)") + exit(1) + } + print("frontend parity OK") + } catch { + logger.error("Frontend parity failed: \(error)") + exit(2) + } + } +} diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3/ParityCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3/ParityCommand.swift new file mode 100644 index 000000000..020a10f0c --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3/ParityCommand.swift @@ -0,0 +1,203 @@ +import CoreML +import FluidAudio +import Foundation + +/// Phase 1 parity harness CLI for the CosyVoice3 Swift port. +/// +/// Usage: +/// ``` +/// fluidaudio tts --backend cosyvoice3-parity \ +/// --fixture .../build/frontend/shipping.safetensors \ +/// --models-dir .../coreml/build \ +/// --reference .../build/wavs/e2e_shipping.wav \ +/// --output .../build/swift_e2e.wav \ +/// --seed 42 +/// ``` +@available(macOS 15, iOS 18, *) +enum CosyVoice3ParityCLI { + + private static let logger = AppLogger(category: "CosyVoice3ParityCLI") + + static func run( + fixturePath: String, + modelsDir: String, + referencePath: String?, + outputPath: String, + seed: UInt64, + cpuOnly: Bool, + replayTokens: Bool + ) async { + let fixtureURL = URL(fileURLWithPath: (fixturePath as NSString).expandingTildeInPath) + let modelsURL = URL( + fileURLWithPath: (modelsDir as NSString).expandingTildeInPath, isDirectory: true) + let outputURL = URL(fileURLWithPath: (outputPath as NSString).expandingTildeInPath) + + let computeUnits: MLComputeUnits = cpuOnly ? 
.cpuOnly : .cpuAndNeuralEngine + let manager = CosyVoice3TtsManager(directory: modelsURL, computeUnits: computeUnits) + + do { + let tLoad = Date() + try await manager.initialize() + logger.info( + "Loaded CosyVoice3 models in \(String(format: "%.2f", Date().timeIntervalSince(tLoad)))s" + ) + + let options = CosyVoice3ParityOptions( + maxNewTokens: nil, seed: seed, replayDecodedTokens: replayTokens) + + let tSynth = Date() + let result = try await manager.synthesizeFromFixture( + fixtureURL: fixtureURL, options: options) + let synthElapsed = Date().timeIntervalSince(tSynth) + let audioSec = Double(result.samples.count) / Double(result.sampleRate) + let rtfx = audioSec / synthElapsed + logger.info( + "Synthesized \(result.samples.count) samples (\(String(format: "%.2fs", audioSec))) in \(String(format: "%.2fs", synthElapsed))" + ) + print( + String( + format: + "RTFX audio=%.3fs synth=%.3fs RTFx=%.3fx tokens=%d", + audioSec, synthElapsed, rtfx, result.generatedTokenCount)) + + try writeWAV(samples: result.samples, sampleRate: result.sampleRate, to: outputURL) + logger.info("Wrote WAV: \(outputURL.path)") + + if let refPath = referencePath { + let refURL = URL( + fileURLWithPath: (refPath as NSString).expandingTildeInPath) + let refSamples = try readWAVMono(url: refURL) + let metrics = compareWaveforms( + swift: result.samples, reference: refSamples) + print("") + print( + " reference samples : \(refSamples.count) swift samples : \(result.samples.count)" + ) + print( + " MAE : \(String(format: "%.6f", metrics.mae))") + print( + " max|Δ| : \(String(format: "%.6f", metrics.maxAbsDiff))") + print(" SNR : \(String(format: "%.2f dB", metrics.snrDb))") + if metrics.maxAbsDiff > 1e-3 { + logger.warning( + "Parity tolerance exceeded: max|Δ|=\(metrics.maxAbsDiff) > 1e-3") + exit(1) + } + } + } catch { + logger.error("CosyVoice3 parity harness failed: \(error)") + exit(2) + } + } + + // MARK: - WAV IO (un-normalized) + + private static func writeWAV(samples: [Float], 
sampleRate: Int, to url: URL) throws { + // Clamp to [-1, 1] to avoid int16 overflow; do NOT rescale to max=1. + let numSamples = samples.count + let byteRate = sampleRate * 2 + let dataSize = numSamples * 2 + var header = Data() + header.append("RIFF".data(using: .ascii)!) + header.appendUInt32LE(UInt32(36 + dataSize)) + header.append("WAVE".data(using: .ascii)!) + header.append("fmt ".data(using: .ascii)!) + header.appendUInt32LE(16) + header.appendUInt16LE(1) // PCM + header.appendUInt16LE(1) // mono + header.appendUInt32LE(UInt32(sampleRate)) + header.appendUInt32LE(UInt32(byteRate)) + header.appendUInt16LE(2) // block align + header.appendUInt16LE(16) // bits/sample + header.append("data".data(using: .ascii)!) + header.appendUInt32LE(UInt32(dataSize)) + + var pcm = Data(capacity: dataSize) + for s in samples { + let clipped = max(-1.0, min(1.0, s)) + let i16 = Int16(clipped * 32_767.0) + var le = i16.littleEndian + Swift.withUnsafeBytes(of: &le) { pcm.append(contentsOf: $0) } + } + try (header + pcm).write(to: url) + } + + private static func readWAVMono(url: URL) throws -> [Float] { + let data = try Data(contentsOf: url) + guard data.count > 44 else { + throw CocoaError(.fileReadCorruptFile) + } + // Find 'data' chunk. + var offset = 12 + var dataStart = -1 + var dataSize = 0 + while offset + 8 <= data.count { + let id = data.subdata(in: offset.. 0 else { throw CocoaError(.fileReadCorruptFile) } + let pcm = data.subdata(in: dataStart.. WaveformMetrics { + let n = min(swift.count, reference.count) + guard n > 0 else { return WaveformMetrics(mae: .infinity, maxAbsDiff: .infinity, snrDb: -.infinity) } + var sumAbs: Double = 0 + var maxAbs: Double = 0 + var sumSigSq: Double = 0 + var sumErrSq: Double = 0 + for i in 0.. maxAbs { maxAbs = a } + sumSigSq += Double(reference[i]) * Double(reference[i]) + sumErrSq += diff * diff + } + let snr = sumErrSq > 0 ? 
10 * log10(sumSigSq / sumErrSq) : .infinity + return WaveformMetrics(mae: sumAbs / Double(n), maxAbsDiff: maxAbs, snrDb: snr) + } +} + +// MARK: - Data helpers + +extension Data { + fileprivate mutating func appendUInt32LE(_ v: UInt32) { + var le = v.littleEndian + Swift.withUnsafeBytes(of: &le) { self.append(contentsOf: $0) } + } + fileprivate mutating func appendUInt16LE(_ v: UInt16) { + var le = v.littleEndian + Swift.withUnsafeBytes(of: &le) { self.append(contentsOf: $0) } + } + fileprivate func readUInt32LE() -> UInt32 { + self.withUnsafeBytes { buf -> UInt32 in + var v: UInt32 = 0 + memcpy(&v, buf.baseAddress!, 4) + return UInt32(littleEndian: v) + } + } +} diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3/TextCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3/TextCommand.swift new file mode 100644 index 000000000..cbf64d92d --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3/TextCommand.swift @@ -0,0 +1,136 @@ +import CoreML +import FluidAudio +import Foundation + +/// Phase 2 text-driven synthesis CLI for the CosyVoice3 Swift port. +/// +/// Drives `CosyVoice3TtsManager.synthesize(text:promptAssets:options:)` end +/// to end: tokenizer + frontend + LLM + Flow + HiFT, writing a 24 kHz WAV. 
+/// +/// Usage: +/// ``` +/// fluidaudio tts --backend cosyvoice3-text \ +/// --text "希望你以后能够做的比我还好用" \ +/// --models-dir .../coreml/build \ +/// --tokenizer-dir .../cosyvoice3_dl/CosyVoice-BlankEN \ +/// --embeddings-file .../build/embeddings/embeddings-runtime-fp32.safetensors \ +/// --special-tokens-file .../build/frontend/tokenizer_fixture.json \ +/// --prompt-assets .../build/frontend/shipping.safetensors \ +/// --output .../build/swift_cv3_text.wav \ +/// --seed 42 +/// ``` +@available(macOS 15, iOS 18, *) +enum CosyVoice3TextCLI { + + private static let logger = AppLogger(category: "CosyVoice3TextCLI") + + static func run( + text: String, + modelsDir: String, + tokenizerDir: String, + embeddingsFile: String, + specialTokensFile: String, + promptAssetsPath: String, + outputPath: String, + seed: UInt64, + maxNewTokens: Int?, + cpuOnly: Bool + ) async { + let modelsURL = URL( + fileURLWithPath: (modelsDir as NSString).expandingTildeInPath, isDirectory: true) + let tokURL = URL( + fileURLWithPath: (tokenizerDir as NSString).expandingTildeInPath, isDirectory: true) + let embURL = URL(fileURLWithPath: (embeddingsFile as NSString).expandingTildeInPath) + let specURL = URL(fileURLWithPath: (specialTokensFile as NSString).expandingTildeInPath) + let promptURL = URL(fileURLWithPath: (promptAssetsPath as NSString).expandingTildeInPath) + let outputURL = URL(fileURLWithPath: (outputPath as NSString).expandingTildeInPath) + + let computeUnits: MLComputeUnits = cpuOnly ? 
.cpuOnly : .cpuAndNeuralEngine + let manager = CosyVoice3TtsManager( + modelsDirectory: modelsURL, + tokenizerDirectory: tokURL, + textEmbeddingsFile: embURL, + specialTokensFile: specURL, + computeUnits: computeUnits) + + do { + let tLoad = Date() + try await manager.initialize() + logger.info( + "Loaded CosyVoice3 models + frontend in \(String(format: "%.2f", Date().timeIntervalSince(tLoad)))s" + ) + + let tPrompt = Date() + let promptAssets = try CosyVoice3PromptAssets.load(from: promptURL) + logger.info( + "Loaded prompt assets in \(String(format: "%.2f", Date().timeIntervalSince(tPrompt)))s — N_speech=\(promptAssets.promptSpeechIds.count), mel_frames=\(promptAssets.promptMelFrames)" + ) + + let options = CosyVoice3SynthesisOptions( + maxNewTokens: maxNewTokens, seed: seed) + + let tSynth = Date() + let result = try await manager.synthesize( + text: text, promptAssets: promptAssets, options: options) + let synthElapsed = Date().timeIntervalSince(tSynth) + let audioSecs = Double(result.samples.count) / Double(result.sampleRate) + let rtfx = synthElapsed > 0 ? audioSecs / synthElapsed : 0 + logger.info( + "Synthesized \(result.samples.count) samples (\(String(format: "%.2fs", audioSecs))) in \(String(format: "%.2fs", synthElapsed)) — RTFx \(String(format: "%.2fx", rtfx))" + ) + logger.info("Generated \(result.generatedTokenCount) speech tokens") + + try FileManager.default.createDirectory( + at: outputURL.deletingLastPathComponent(), + withIntermediateDirectories: true) + try writeWAV(samples: result.samples, sampleRate: result.sampleRate, to: outputURL) + logger.info("Wrote WAV: \(outputURL.path)") + } catch { + logger.error("CosyVoice3 text synthesis failed: \(error)") + exit(2) + } + } + + private static func writeWAV(samples: [Float], sampleRate: Int, to url: URL) throws { + let numSamples = samples.count + let byteRate = sampleRate * 2 + let dataSize = numSamples * 2 + var header = Data() + header.append("RIFF".data(using: .ascii)!) 
+ header.appendUInt32LE(UInt32(36 + dataSize)) + header.append("WAVE".data(using: .ascii)!) + header.append("fmt ".data(using: .ascii)!) + header.appendUInt32LE(16) + header.appendUInt16LE(1) // PCM + header.appendUInt16LE(1) // mono + header.appendUInt32LE(UInt32(sampleRate)) + header.appendUInt32LE(UInt32(byteRate)) + header.appendUInt16LE(2) // block align + header.appendUInt16LE(16) // bits/sample + header.append("data".data(using: .ascii)!) + header.appendUInt32LE(UInt32(dataSize)) + + var pcm = Data(capacity: dataSize) + for s in samples { + let clipped = max(-1.0, min(1.0, s)) + let i16 = Int16(clipped * 32_767.0) + var le = i16.littleEndian + Swift.withUnsafeBytes(of: &le) { pcm.append(contentsOf: $0) } + } + try (header + pcm).write(to: url) + } +} + +// MARK: - Data helpers (file-scoped duplicate of the helpers in +// CosyVoice3ParityCommand.swift; kept here so this file compiles on its own). + +extension Data { + fileprivate mutating func appendUInt32LE(_ v: UInt32) { + var le = v.littleEndian + Swift.withUnsafeBytes(of: &le) { self.append(contentsOf: $0) } + } + fileprivate mutating func appendUInt16LE(_ v: UInt16) { + var le = v.littleEndian + Swift.withUnsafeBytes(of: &le) { self.append(contentsOf: $0) } + } +} diff --git a/Sources/FluidAudioCLI/Commands/CosyVoice3/TokenizerParityCommand.swift b/Sources/FluidAudioCLI/Commands/CosyVoice3/TokenizerParityCommand.swift new file mode 100644 index 000000000..d5550c60c --- /dev/null +++ b/Sources/FluidAudioCLI/Commands/CosyVoice3/TokenizerParityCommand.swift @@ -0,0 +1,70 @@ +import FluidAudio +import Foundation + +/// Phase 2 tokenizer parity harness. +/// +/// Loads the Python-exported tokenizer_fixture.json (special token map + test +/// cases) and asserts the Swift Qwen2BpeTokenizer produces the same ID stream +/// for every case. 
+/// +/// Usage: +/// ``` +/// fluidaudio tts --backend cosyvoice3-tokenizer-parity \ +/// --tokenizer-dir .../cosyvoice3_dl/CosyVoice-BlankEN \ +/// --fixture .../build/frontend/tokenizer_fixture.json +/// ``` +enum CosyVoice3TokenizerParityCLI { + + private static let logger = AppLogger(category: "CosyVoice3TokenizerParityCLI") + + static func run(tokenizerDir: String, fixturePath: String) async { + let tokURL = URL( + fileURLWithPath: (tokenizerDir as NSString).expandingTildeInPath, + isDirectory: true) + let fixURL = URL(fileURLWithPath: (fixturePath as NSString).expandingTildeInPath) + + struct Fixture: Decodable { + let special_tokens: [String: Int32] + let cases: [Case] + struct Case: Decodable { + let text: String + let ids: [Int32] + } + } + + do { + let data = try Data(contentsOf: fixURL) + let fixture = try JSONDecoder().decode(Fixture.self, from: data) + let tokenizer = try Qwen2BpeTokenizer.load( + directory: tokURL, specialTokens: fixture.special_tokens) + + var passed = 0 + var failed = 0 + var firstFail: (String, [Int32], [Int32])? = nil + for tc in fixture.cases { + let got = tokenizer.encode(tc.text) + if got == tc.ids { + passed += 1 + } else { + failed += 1 + if firstFail == nil { + firstFail = (tc.text, tc.ids, got) + } + } + } + + print("cases: \(passed + failed) passed: \(passed) failed: \(failed)") + if let (text, expected, got) = firstFail { + print("") + print("first mismatch:") + print(" text : \(text.debugDescription)") + print(" expected : \(expected)") + print(" got : \(got)") + } + if failed > 0 { exit(1) } + } catch { + logger.error("Tokenizer parity failed: \(error)") + exit(2) + } + } +} diff --git a/Sources/FluidAudioCLI/Commands/TTSCommand.swift b/Sources/FluidAudioCLI/Commands/TTSCommand.swift index 0b1c781d0..132fcc8d0 100644 --- a/Sources/FluidAudioCLI/Commands/TTSCommand.swift +++ b/Sources/FluidAudioCLI/Commands/TTSCommand.swift @@ -137,6 +137,25 @@ public struct TTS { var cloneVoicePath: String? 
= nil var voiceFilePath: String? = nil var saveVoicePath: String? = nil + // CosyVoice3 Phase 1 parity harness args. + var cv3FixturePath: String? = nil + var cv3ModelsDir: String? = nil + var cv3ReferencePath: String? = nil + var cv3Seed: UInt64 = 42 + var cv3CpuOnly: Bool = false + var cv3ReplayTokens: Bool = true + // CosyVoice3 Phase 2 tokenizer parity args. + var cv3TokenizerDir: String? = nil + var cv3TokenizerParityMode: Bool = false + // CosyVoice3 Phase 2 frontend parity args. + var cv3FrontendParityMode: Bool = false + var cv3EmbeddingsFile: String? = nil + var cv3TokFixturePath: String? = nil + // CosyVoice3 Phase 2 text-driven synthesis args. + var cv3TextMode: Bool = false + var cv3SpecialTokensFile: String? = nil + var cv3PromptAssetsPath: String? = nil + var cv3MaxNewTokens: Int? = nil var pocketLanguage: PocketTtsLanguage = .english // PocketTTS deterministic-seed mode (uses session API for fixed RNG). var pocketSeed: UInt64? = nil @@ -194,6 +213,22 @@ public struct TTS { backend = .kokoro case "pocket", "pockettts": backend = .pocketTts + case "cosyvoice3", "cv3", "cosyvoice3-text", "cv3-text": + // Production text-driven synthesis is the default + // user-facing path. The explicit `*-text` aliases + // are kept for backward compatibility with earlier + // documentation. + backend = .cosyvoice3 + cv3TextMode = true + case "cosyvoice3-parity", "cv3-parity": + // Phase 1 fixture parity harness — opt-in dev mode. 
+ backend = .cosyvoice3 + case "cosyvoice3-tokenizer-parity", "cv3-tokenizer": + backend = .cosyvoice3 + cv3TokenizerParityMode = true + case "cosyvoice3-frontend-parity", "cv3-frontend": + backend = .cosyvoice3 + cv3FrontendParityMode = true case "kokoro-ane", "kokoroane", "lai": backend = .kokoroAne default: @@ -201,6 +236,65 @@ public struct TTS { } i += 1 } + case "--fixture": + if i + 1 < arguments.count { + cv3FixturePath = arguments[i + 1] + i += 1 + } + case "--models-dir": + if i + 1 < arguments.count { + cv3ModelsDir = arguments[i + 1] + i += 1 + } + case "--reference": + if i + 1 < arguments.count { + cv3ReferencePath = arguments[i + 1] + i += 1 + } + case "--seed": + if i + 1 < arguments.count { + cv3Seed = UInt64(arguments[i + 1]) ?? 42 + i += 1 + } + case "--cpu-only": + cv3CpuOnly = true + case "--no-replay": + cv3ReplayTokens = false + case "--tokenizer-dir": + if i + 1 < arguments.count { + cv3TokenizerDir = arguments[i + 1] + i += 1 + } + case "--embeddings-file": + if i + 1 < arguments.count { + cv3EmbeddingsFile = arguments[i + 1] + i += 1 + } + case "--tok-fixture": + if i + 1 < arguments.count { + cv3TokFixturePath = arguments[i + 1] + i += 1 + } + case "--special-tokens-file": + if i + 1 < arguments.count { + cv3SpecialTokensFile = arguments[i + 1] + i += 1 + } + case "--prompt-assets": + if i + 1 < arguments.count { + cv3PromptAssetsPath = arguments[i + 1] + i += 1 + } + case "--text": + if i + 1 < arguments.count { + text = arguments[i + 1] + i += 1 + } + case "--max-new-tokens": + if i + 1 < arguments.count { + cv3MaxNewTokens = Int(arguments[i + 1]) + i += 1 + } case "--auto-download": // No-op: downloads are always ensured by the CLI. Accepted // for backward compatibility with documented examples. @@ -267,6 +361,101 @@ public struct TTS { return } + if backend == .cosyvoice3 { + logger.warning( + "CosyVoice3 backend is experimental / beta — synthesis is " + + "slow (RTFx < 1.0 typical). 
Performance may improve in " + + "later releases.") + } + + if backend == .cosyvoice3 && cv3TokenizerParityMode { + guard let tokDir = cv3TokenizerDir, let fixture = cv3FixturePath else { + logger.error( + "cosyvoice3-tokenizer-parity requires --tokenizer-dir <.../CosyVoice-BlankEN> and --fixture " + ) + return + } + await CosyVoice3TokenizerParityCLI.run( + tokenizerDir: tokDir, fixturePath: fixture) + return + } + + if backend == .cosyvoice3 && cv3FrontendParityMode { + guard + let tokDir = cv3TokenizerDir, + let embFile = cv3EmbeddingsFile, + let fixture = cv3FixturePath, + let tokFix = cv3TokFixturePath + else { + logger.error( + "cosyvoice3-frontend-parity requires --tokenizer-dir, --embeddings-file, --fixture , --tok-fixture" + ) + return + } + await CosyVoice3FrontendParityCLI.run( + tokenizerDir: tokDir, + embeddingsFile: embFile, + fixturePath: fixture, + tokFixturePath: tokFix) + return + } + + if backend == .cosyvoice3 && cv3TextMode { + guard + let inputText = text, + let modelsDir = cv3ModelsDir, + let tokDir = cv3TokenizerDir, + let embFile = cv3EmbeddingsFile, + let specFile = cv3SpecialTokensFile, + let promptAssets = cv3PromptAssetsPath + else { + logger.error( + "cosyvoice3-text requires --text , --models-dir, --tokenizer-dir, --embeddings-file, --special-tokens-file, --prompt-assets" + ) + return + } + if #available(macOS 15, iOS 18, *) { + await CosyVoice3TextCLI.run( + text: inputText, + modelsDir: modelsDir, + tokenizerDir: tokDir, + embeddingsFile: embFile, + specialTokensFile: specFile, + promptAssetsPath: promptAssets, + outputPath: output, + seed: cv3Seed, + maxNewTokens: cv3MaxNewTokens, + cpuOnly: cv3CpuOnly) + } else { + logger.error( + "CosyVoice3 requires macOS 15 / iOS 18 (uses CoreML MLState).") + } + return + } + + if backend == .cosyvoice3 { + guard let fixture = cv3FixturePath, let modelsDir = cv3ModelsDir else { + logger.error( + "cosyvoice3-parity requires --fixture and --models-dir " + ) + return + } + if #available(macOS 15, 
iOS 18, *) { + await CosyVoice3ParityCLI.run( + fixturePath: fixture, + modelsDir: modelsDir, + referencePath: cv3ReferencePath, + outputPath: output, + seed: cv3Seed, + cpuOnly: cv3CpuOnly, + replayTokens: cv3ReplayTokens) + } else { + logger.error( + "CosyVoice3 requires macOS 15 / iOS 18 (uses CoreML MLState).") + } + return + } + guard let text = text else { printUsage() return @@ -863,7 +1052,14 @@ public struct TTS { Options: --output, -o Output WAV path (default: output.wav) --voice, -v Voice name (default: af_heart for Kokoro, alba for PocketTTS) - --backend TTS backend: kokoro (default), pocket, or kokoro-ane + --backend TTS backend: kokoro (default), pocket, kokoro-ane, + or cosyvoice3 [BETA — slow, RTFx < 1.0] + CosyVoice3 dev sub-backends: + cosyvoice3-parity Phase 1 fixture parity harness + cosyvoice3-frontend-parity lm_input_embeds parity vs Python + cosyvoice3-tokenizer-parity Qwen2 BPE round-trip + (Production cosyvoice3 backend auto-downloads + assets from HuggingFace on first synthesis.) 
--lexicon, -l Custom pronunciation lexicon file (word=phonemes format, Kokoro only) --benchmark Run a predefined benchmarking suite with multiple sentences --variant Force Kokoro 5s or 15s model (values: 5s,15s) diff --git a/Tests/FluidAudioTests/TTS/CosyVoice3ChineseNormalizerTests.swift b/Tests/FluidAudioTests/TTS/CosyVoice3ChineseNormalizerTests.swift new file mode 100644 index 000000000..e94184c40 --- /dev/null +++ b/Tests/FluidAudioTests/TTS/CosyVoice3ChineseNormalizerTests.swift @@ -0,0 +1,81 @@ +import XCTest + +@testable import FluidAudio + +final class CosyVoice3ChineseNormalizerTests: XCTestCase { + + func testContainsChinese() { + XCTAssertTrue(CosyVoice3ChineseNormalizer.containsChinese("你好")) + XCTAssertTrue(CosyVoice3ChineseNormalizer.containsChinese("hello 世界")) + XCTAssertFalse(CosyVoice3ChineseNormalizer.containsChinese("hello world")) + XCTAssertFalse(CosyVoice3ChineseNormalizer.containsChinese("")) + } + + func testReplaceBlankDropsCjkInteriorSpaces() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceBlank("中 国"), "中国") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceBlank("hello world"), "hello world") + // Mixed: space between ASCII and CJK is dropped (one side non-ASCII). 
+ XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceBlank("hi 你好"), "hi你好") + } + + func testReplaceCornerMark() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceCornerMark("面积 5m²"), + "面积 5m平方") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.replaceCornerMark("体积 2m³"), + "体积 2m立方") + } + + func testRemoveBracket() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.removeBracket("你好(世界)"), + "你好世界") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.removeBracket("【注意】请勿触摸"), + "注意请勿触摸") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.removeBracket("a——b"), + "a b") + } + + func testSpellOutDigitsZh() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.spellOutDigitsZh("2024年"), + "二零二四年") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.spellOutDigitsZh("abc"), + "abc") + } + + func testStripTrailingCommaLikes() { + XCTAssertEqual( + CosyVoice3ChineseNormalizer.stripTrailingCommaLikes("你好,,"), + "你好。") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.stripTrailingCommaLikes("你好、,,"), + "你好。") + XCTAssertEqual( + CosyVoice3ChineseNormalizer.stripTrailingCommaLikes("你好。"), + "你好。") + } + + func testNormalizeEndToEnd() { + let input = "希望你以后能够做的比我还好用. 2024年,," + let out = CosyVoice3ChineseNormalizer.normalize(input) + // Period becomes 。, trailing commas collapse to a single 。, digits + // spelled out per-char, internal spaces between CJK stripped. 
+ XCTAssertEqual(out, "希望你以后能够做的比我还好用。二零二四年。") + } + + func testIsOnlyPunctuation() { + XCTAssertTrue(CosyVoice3ChineseNormalizer.isOnlyPunctuation("")) + XCTAssertTrue(CosyVoice3ChineseNormalizer.isOnlyPunctuation("。,!?")) + XCTAssertTrue(CosyVoice3ChineseNormalizer.isOnlyPunctuation(".,!?")) + XCTAssertFalse(CosyVoice3ChineseNormalizer.isOnlyPunctuation("你好")) + XCTAssertFalse(CosyVoice3ChineseNormalizer.isOnlyPunctuation("abc")) + } +} diff --git a/Tests/FluidAudioTests/TTS/CosyVoice3PromptMelTests.swift b/Tests/FluidAudioTests/TTS/CosyVoice3PromptMelTests.swift new file mode 100644 index 000000000..e904d64ff --- /dev/null +++ b/Tests/FluidAudioTests/TTS/CosyVoice3PromptMelTests.swift @@ -0,0 +1,101 @@ +import XCTest + +@testable import FluidAudio + +final class CosyVoice3PromptMelTests: XCTestCase { + + func testFrameCountMatchesMatchaFormula() throws { + // matcha/cosyvoice3: pad by 720 each side (reflect), center=False. + // For 48000 samples: padded = 48000 + 1440 = 49440. + // frames = (49440 - 1920) / 480 + 1 = 99 + 1 = 100. + let mel = CosyVoice3PromptMel() + let audio = [Float](repeating: 0.01, count: 48_000) + let out = try mel.compute(audio: audio) + XCTAssertEqual(out.frames, 100) + XCTAssertEqual(out.mel.count, 100 * 80) + } + + func testZeroAudioClampsToLogFloor() throws { + // With audio of all zeros, mel values are 0 → clamped to 1e-5 → log = -11.5129... + let mel = CosyVoice3PromptMel() + let audio = [Float](repeating: 0, count: 24_000) + let out = try mel.compute(audio: audio) + let expected: Float = log(Float(1e-5)) + for v in out.mel { + XCTAssertEqual(v, expected, accuracy: 1e-5) + } + } + + func testSinePeakInLowMelBins() throws { + // 200 Hz sine at 24 kHz should light up one of the lowest mel bins + // (fmin=0, the first few triangles cover 0..~200 Hz). + let mel = CosyVoice3PromptMel() + let sr: Float = 24_000 + let f: Float = 200 + let n = 12_000 // 0.5 s + var audio = [Float](repeating: 0, count: n) + for i in 0..0. 
+ let numFreqBins = 1920 / 2 + 1 + for m in 0..<80 { + var sum: Float = 0 + for f in 0..