Skip to content
Open
4 changes: 2 additions & 2 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import PackageDescription
let package = Package(
name: "FluidAudio",
platforms: [
.macOS(.v14),
.iOS(.v17),
.macOS(.v15),
.iOS(.v18),
],
products: [
.library(
Expand Down
48 changes: 48 additions & 0 deletions Sources/FluidAudio/ModelNames.swift
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ public enum Repo: String, CaseIterable, Sendable {
case qwen3AsrInt8 = "FluidInference/qwen3-asr-0.6b-coreml/int8"
case multilingualG2p = "FluidInference/charsiu-g2p-byt5-coreml"
case parakeetTdtCtc110m = "FluidInference/parakeet-tdt-ctc-110m-coreml"
case cosyvoice3 = "FluidInference/CosyVoice3-0.5B-coreml"

/// Repository slug (without owner)
public var name: String {
Expand Down Expand Up @@ -75,6 +76,8 @@ public enum Repo: String, CaseIterable, Sendable {
return "charsiu-g2p-byt5-coreml"
case .parakeetTdtCtc110m:
return "parakeet-tdt-ctc-110m-coreml"
case .cosyvoice3:
return "CosyVoice3-0.5B-coreml"
}
}

Expand Down Expand Up @@ -159,6 +162,8 @@ public enum Repo: String, CaseIterable, Sendable {
return "parakeet-ja"
case .parakeetTdtCtc110m:
return "parakeet-tdt-ctc-110m"
case .cosyvoice3:
return "cosyvoice3"
default:
return name.replacingOccurrences(of: "-coreml", with: "")
}
Expand Down Expand Up @@ -560,6 +565,47 @@ public enum ModelNames {
]
}

/// Artifact names for the CosyVoice3 (Mandarin) TTS pipeline.
///
/// The files are hosted on HuggingFace under
/// `FluidInference/CosyVoice3-0.5B-coreml` (see `Repo.cosyvoice3`); the
/// expected local directory layout is encoded in `CosyVoice3Constants.Files`.
public enum CosyVoice3 {
    /// Extension (with leading dot) of a compiled CoreML bundle.
    private static let compiledSuffix = ".mlmodelc"

    // MARK: Base names (no extension)

    public static let llmPrefill = "LLM-Prefill-T256-M768-fp16"
    public static let llmDecode = "LLM-Decode-M768-fp16-stateful"
    public static let flow = "Flow-N250-fp16"
    public static let hift = "HiFT-T500-fp16"
    public static let speechEmbeddings = "speech_embedding-fp16.safetensors"

    // MARK: Compiled bundle file names

    public static let llmPrefillFile = "\(llmPrefill)\(compiledSuffix)"
    public static let llmDecodeFile = "\(llmDecode)\(compiledSuffix)"
    public static let flowFile = "\(flow)\(compiledSuffix)"
    public static let hiftFile = "\(hift)\(compiledSuffix)"

    /// The four compiled bundles that make up the required model set.
    public static let requiredModels: Set<String> = Set(
        [llmPrefillFile, llmDecodeFile, flowFile, hiftFile]
    )

    /// Sidecar assets that live in subdirectories of the HF repo. They are
    /// deliberately excluded from `requiredModels` and are pulled via
    /// `downloadSubdirectory` / direct file fetch by
    /// `CosyVoice3ResourceDownloader`.
    public enum Sidecar {
        // Subdirectory names.
        public static let embeddingsDir = "embeddings"
        public static let tokenizerDir = "tokenizer"
        public static let voicesDir = "voices"

        // File names inside those subdirectories.
        public static let speechEmbeddings = "speech_embedding-fp16.safetensors"
        public static let runtimeEmbeddings = "embeddings-runtime-fp32.safetensors"
        public static let specialTokens = "special_tokens.json"
        public static let vocab = "vocab.json"
        public static let merges = "merges.txt"
        public static let tokenizerConfig = "tokenizer_config.json"

        /// Identifier of the voice used when the caller does not pick one.
        public static let defaultVoiceId = "cosyvoice3-default-zh"
    }
}

/// Multilingual G2P (CharsiuG2P ByT5) model names
public enum MultilingualG2P {
public static let encoder = "MultilingualG2PEncoder"
Expand Down Expand Up @@ -688,6 +734,8 @@ public enum ModelNames {
return ModelNames.Qwen3ASR.requiredModelsFull
case .multilingualG2p:
return ModelNames.MultilingualG2P.requiredModels
case .cosyvoice3:
return ModelNames.CosyVoice3.requiredModels
}
}
}
186 changes: 186 additions & 0 deletions Sources/FluidAudio/TTS/CosyVoice3/Assets/CosyVoice3ModelStore.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
@preconcurrency import CoreML
import Foundation

/// Actor-based store for the four CosyVoice3 CoreML models.
///
/// Two on-disk layouts are accepted:
///
/// 1. **HuggingFace cache** (flat): `<dir>/<ModelName>.mlmodelc` (or
///    `.mlpackage`) at repo root, with `<dir>/embeddings/speech_embedding-fp16.safetensors`.
///    This is what `CosyVoice3ResourceDownloader` produces.
///
/// 2. **Local mobius build dir**: `<dir>/<subdir>/<ModelName>.mlpackage` as
///    emitted by `models/tts/cosyvoice3/coreml/convert-coreml.py` (with
///    `llm-fp16/`, `flow-fp16-n250/`, `hift-fp16-t500/` subdirs).
///
/// The store probes layout (1) first, then falls back to (2). A `.mlpackage`
/// is compiled on first load and the resulting `.mlmodelc` is moved next to
/// the package so later loads can skip compilation.
public actor CosyVoice3ModelStore {

    private let logger = AppLogger(subsystem: "com.fluidaudio.tts", category: "CosyVoice3ModelStore")

    public nonisolated let directory: URL
    private let computeUnits: MLComputeUnits

    private var loadedModels: CosyVoice3Models?
    private var speechEmbeddingsURL: URL?

    /// Creates a store rooted at `directory`. No file access happens until
    /// `loadIfNeeded()` is called.
    ///
    /// - Parameters:
    ///   - directory: Model directory. Either the flat HuggingFace layout, or
    ///     a local build directory that contains `llm-fp16/`,
    ///     `llm-fp16-stateful/`, `flow-fp16-n250/`, `hift-fp16-t500/`,
    ///     `embeddings/` (the subdirectory names actually probed come from
    ///     `CosyVoice3Constants.Files`).
    ///   - computeUnits: Defaults to `.cpuAndNeuralEngine`. Applied to
    ///     LLM-Prefill + HiFT models only. LLM-Decode (stateful) and Flow
    ///     both force `.cpuAndGPU` regardless (see `loadIfNeeded()`).
    public init(directory: URL, computeUnits: MLComputeUnits = .cpuAndNeuralEngine) {
        self.directory = directory
        self.computeUnits = computeUnits
    }

    /// Load all four CoreML models. Returns immediately once a previous call
    /// has completed successfully.
    ///
    /// All five paths (four models + speech embeddings) are resolved before
    /// any model is loaded, so a missing file fails fast without paying for a
    /// partial load.
    ///
    /// - Note: The "already loaded" check only sees *completed* loads. Calls
    ///   that overlap while a `.mlpackage` is still being compiled (the one
    ///   suspension point, inside `compileAndLoad`) are not coalesced and
    ///   will each perform the load.
    /// - Throws: `CosyVoice3Error.modelFileNotFound` when a model or the
    ///   embeddings file cannot be located, or whatever error CoreML raises
    ///   while compiling / loading a model.
    public func loadIfNeeded() async throws {
        guard loadedModels == nil else { return }

        // Caller-selected compute units: used for LLM-Prefill and HiFT only.
        let config = MLModelConfiguration()
        config.computeUnits = computeUnits

        let loadStart = Date()
        logger.info("Loading CosyVoice3 CoreML models from \(directory.path)...")

        let prefillURL = try resolveModel(
            subdir: CosyVoice3Constants.Files.llmPrefillSubdir,
            baseName: ModelNames.CosyVoice3.llmPrefill)
        let decodeURL = try resolveModel(
            subdir: CosyVoice3Constants.Files.llmDecodeSubdir,
            baseName: ModelNames.CosyVoice3.llmDecode)
        let flowURL = try resolveModel(
            subdir: CosyVoice3Constants.Files.flowSubdir,
            baseName: ModelNames.CosyVoice3.flow)
        let hiftURL = try resolveModel(
            subdir: CosyVoice3Constants.Files.hiftSubdir,
            baseName: ModelNames.CosyVoice3.hift)
        let embeddingsURL = try resolveAsset(
            subdir: CosyVoice3Constants.Files.speechEmbeddingsSubdir,
            file: CosyVoice3Constants.Files.speechEmbeddings)

        let prefill = try await compileAndLoad(prefillURL, configuration: config)
        logger.info("Loaded \(CosyVoice3Constants.Files.llmPrefill)")

        // Stateful decode MUST run on `.cpuAndGPU`:
        //   - ANE refuses to compile the stateful graph (same failure mode
        //     as Flow: `MILCompilerForANE ANECCompile() FAILED`), so
        //     `.cpuAndNE` / `.all` deadlock load
        //   - CPU-only works but is ~2× slower than the GPU path
        // Ignore the user-supplied `computeUnits` for decode.
        let decodeConfig = MLModelConfiguration()
        decodeConfig.computeUnits = .cpuAndGPU
        let decode = try await compileAndLoad(decodeURL, configuration: decodeConfig)
        logger.info("Loaded \(CosyVoice3Constants.Files.llmDecode)")

        // Flow runs on `.cpuAndGPU` (fp16). An ANE-port attempt (BC1S
        // rewrite: Linear→Conv2d(1×1), LayerNorm on axis=1, manual SDPA,
        // pre-baked rotary sin/cos) produced a Flow that *compiled* and
        // ran ~3× faster, but numerically broken: on the parity
        // fixture the ANE graph collapses the mel dynamic range from
        // [-12.5, +5.2] to [-10.1, -0.8] (MAE 2.58 vs PyTorch fp32;
        // plan required <1e-3), yielding HiFT audio at ~40× lower peak
        // amplitude — unintelligible to both CTC-ZH and Qwen3 ASR.
        // Reverted to the cpuAndGPU fp16 baseline. See
        // `coreml/TRIALS_AND_ERRORS.md` "Flow ANE port" for the full
        // journey including the residual 77-op `conv_pos_embed` CPU
        // island that may have been masking the dynamic-range
        // compression introduced elsewhere in the BC1S rewrite.
        // Ignore the user-supplied `computeUnits` for Flow; apply it to
        // the LLM-Prefill + HiFT models only.
        let flowConfig = MLModelConfiguration()
        flowConfig.computeUnits = .cpuAndGPU
        let flow = try await compileAndLoad(flowURL, configuration: flowConfig)
        logger.info("Loaded \(CosyVoice3Constants.Files.flow)")

        let hift = try await compileAndLoad(hiftURL, configuration: config)
        logger.info("Loaded \(CosyVoice3Constants.Files.hift)")

        // Publish state only after every model loaded, so a thrown error
        // leaves the store in its "not initialized" state.
        loadedModels = CosyVoice3Models(prefill: prefill, decode: decode, flow: flow, hift: hift)
        speechEmbeddingsURL = embeddingsURL

        let elapsed = Date().timeIntervalSince(loadStart)
        logger.info("All CosyVoice3 models loaded in \(String(format: "%.2f", elapsed))s")
    }

    /// Returns the loaded model bundle.
    ///
    /// - Throws: `CosyVoice3Error.notInitialized` if `loadIfNeeded()` has not
    ///   completed successfully yet.
    public func models() throws -> CosyVoice3Models {
        guard let models = loadedModels else {
            throw CosyVoice3Error.notInitialized
        }
        return models
    }

    /// Returns the location of the speech-embeddings file resolved during
    /// `loadIfNeeded()`.
    ///
    /// - Throws: `CosyVoice3Error.notInitialized` if `loadIfNeeded()` has not
    ///   completed successfully yet.
    public func speechEmbeddingsFileURL() throws -> URL {
        guard let url = speechEmbeddingsURL else {
            throw CosyVoice3Error.notInitialized
        }
        return url
    }

    // MARK: - Helpers

    /// Resolve a CoreML model accepting either `.mlmodelc` or `.mlpackage`
    /// extensions and both layouts: flat (HF) or subdir (local build).
    ///
    /// - Throws: `CosyVoice3Error.modelFileNotFound` listing every probed path.
    private func resolveModel(subdir: String, baseName: String) throws -> URL {
        let nested = directory.appendingPathComponent(subdir)
        return try firstExisting(among: [
            // HF flat layout prefers the precompiled .mlmodelc.
            directory.appendingPathComponent("\(baseName).mlmodelc"),
            directory.appendingPathComponent("\(baseName).mlpackage"),
            // Local build layout (mobius convert-coreml.py output).
            nested.appendingPathComponent("\(baseName).mlmodelc"),
            nested.appendingPathComponent("\(baseName).mlpackage"),
        ])
    }

    /// Resolve a plain sidecar file (e.g. `speech_embedding-fp16.safetensors`).
    /// Probes `<dir>/<subdir>/<file>` then `<dir>/<file>`.
    ///
    /// - Throws: `CosyVoice3Error.modelFileNotFound` listing every probed path.
    private func resolveAsset(subdir: String, file: String) throws -> URL {
        try firstExisting(among: [
            directory.appendingPathComponent(subdir).appendingPathComponent(file),
            directory.appendingPathComponent(file),
        ])
    }

    /// Returns the first candidate that exists on disk, in the given order.
    ///
    /// - Throws: `CosyVoice3Error.modelFileNotFound` carrying the
    ///   comma-separated list of probed paths when none exists.
    private func firstExisting(among candidates: [URL]) throws -> URL {
        if let hit = candidates.first(where: { FileManager.default.fileExists(atPath: $0.path) }) {
            return hit
        }
        let probed = candidates.map(\.path).joined(separator: ", ")
        throw CosyVoice3Error.modelFileNotFound(probed)
    }

    /// Load a CoreML model from `url`, compiling it first when it is not
    /// already a `.mlmodelc` bundle.
    ///
    /// For a package, the compiled `.mlmodelc` is moved next to the original
    /// so subsequent loads are fast; compilation is skipped entirely when
    /// such a sibling `.mlmodelc` already exists.
    ///
    /// - Throws: Any error raised by CoreML while compiling or loading.
    private func compileAndLoad(
        _ url: URL,
        configuration: MLModelConfiguration
    ) async throws -> MLModel {
        if url.pathExtension == "mlmodelc" {
            return try MLModel(contentsOf: url, configuration: configuration)
        }

        let fileManager = FileManager.default
        let compiledName = url.deletingPathExtension().lastPathComponent + ".mlmodelc"
        let cached = url.deletingLastPathComponent().appendingPathComponent(compiledName)
        if fileManager.fileExists(atPath: cached.path) {
            return try MLModel(contentsOf: cached, configuration: configuration)
        }

        let compiledURL = try await MLModel.compileModel(at: url)

        // Best-effort: clear anything occupying the destination (it may have
        // appeared while compilation was suspended) so the move can succeed.
        try? fileManager.removeItem(at: cached)
        do {
            try fileManager.moveItem(at: compiledURL, to: cached)
        } catch {
            // The move failed (e.g. cross-device or read-only directory), so
            // the compiled bundle is still at its temporary location.
            logger.info("Could not cache compiled model at \(cached.path); loading from \(compiledURL.path)")
            return try MLModel(contentsOf: compiledURL, configuration: configuration)
        }

        // The move succeeded, so `compiledURL` no longer exists. A load
        // failure here must surface as-is rather than being retried from the
        // now-missing temporary path, which would mask the real error.
        return try MLModel(contentsOf: cached, configuration: configuration)
    }
}
Loading