From 27a6203334be735881f500ac630730c1b814d22b Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Fri, 24 Apr 2026 22:55:05 -0400 Subject: [PATCH 01/18] feat(tts/magpie): add NVIDIA Magpie TTS Multilingual 357M Swift port Ports the Magpie TTS Multilingual 357M autoregressive TTS from Python (mobius PR #24) to Swift, covering 8 languages (EN, ES, DE, FR, IT, VI, ZH, HI). Japanese is deferred pending OpenJTalk integration. Highlights: - Encoder-decoder transformer + NanoCodec vocoder, 22 kHz output. - 5 built-in speakers; `|...|` inline-IPA override routes phoneme tokens directly to the tokenizer for fine-grained pronunciation control. - 1-layer local transformer (256d) runs on CPU via Accelerate/BNNS with top-k + temperature sampling and audio-EOS / forbidden-token masking. - 12-layer decoder KV cache rolled statefully across `decoder_step` calls; optional `decoder_prefill` fast path for the speaker context. - Assets (4 CoreML models + constants/ + tokenizer/) auto-fetch from `FluidInference/magpie-tts-multilingual-357m-coreml` on first use. - New CLI: `fluidaudiocli magpie {download,text,parity,tokenizer-parity}`. - Public API: `MagpieTtsManager.downloadAndCreate(languages:)` actor. - Unit tests: IPA override segmentation, KV-cache shape, NeMo tokenizer parity, and NPY v1 fp16/fp32 reader (17 tests, all passing). 
--- README.md | 42 ++- Sources/FluidAudio/ModelNames.swift | 36 ++ .../Magpie/Assets/MagpieConstantsStore.swift | 241 ++++++++++++ .../MagpieLocalTransformerWeights.swift | 162 ++++++++ .../TTS/Magpie/Assets/MagpieModelStore.swift | 182 +++++++++ .../Assets/MagpieResourceDownloader.swift | 195 ++++++++++ .../MagpieLocalTransformer.swift | 298 +++++++++++++++ .../LocalTransformer/MagpieSampler.swift | 157 ++++++++ .../TTS/Magpie/MagpieConstants.swift | 122 ++++++ .../FluidAudio/TTS/Magpie/MagpieError.swift | 43 +++ .../TTS/Magpie/MagpieTtsManager.swift | 133 +++++++ .../FluidAudio/TTS/Magpie/MagpieTypes.swift | 115 ++++++ .../Preprocess/MagpieIpaOverride.swift | 73 ++++ .../Pipeline/Preprocess/MagpieTokenizer.swift | 129 +++++++ .../Tokenizers/MagpieCharTokenizer.swift | 43 +++ .../Tokenizers/MagpieMandarinTokenizer.swift | 149 ++++++++ .../Tokenizers/MagpiePhonemeTokenizer.swift | 173 +++++++++ .../Pipeline/Synthesize/MagpieKvCache.swift | 109 ++++++ .../Pipeline/Synthesize/MagpieNanocodec.swift | 66 ++++ .../Pipeline/Synthesize/MagpiePrefill.swift | 79 ++++ .../Synthesize/MagpieSynthesizer.swift | 304 +++++++++++++++ .../TTS/Magpie/Shared/NpyReader.swift | 289 ++++++++++++++ Sources/FluidAudio/TTS/TtsBackend.swift | 2 + .../Commands/MagpieCommand.swift | 355 ++++++++++++++++++ Sources/FluidAudioCLI/FluidAudioCLI.swift | 3 + .../TTS/Magpie/MagpieConstantsTests.swift | 46 +++ .../TTS/Magpie/MagpieIpaOverrideTests.swift | 53 +++ .../TTS/Magpie/MagpieKvCacheTests.swift | 48 +++ .../TTS/Magpie/MagpieNpyReaderTests.swift | 69 ++++ 29 files changed, 3715 insertions(+), 1 deletion(-) create mode 100644 Sources/FluidAudio/TTS/Magpie/Assets/MagpieConstantsStore.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Assets/MagpieLocalTransformerWeights.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Assets/MagpieModelStore.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Assets/MagpieResourceDownloader.swift create mode 100644 
Sources/FluidAudio/TTS/Magpie/LocalTransformer/MagpieLocalTransformer.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/LocalTransformer/MagpieSampler.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/MagpieConstants.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/MagpieError.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/MagpieTtsManager.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/MagpieTypes.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Pipeline/Preprocess/MagpieIpaOverride.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Pipeline/Preprocess/MagpieTokenizer.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Pipeline/Preprocess/Tokenizers/MagpieCharTokenizer.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Pipeline/Preprocess/Tokenizers/MagpieMandarinTokenizer.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Pipeline/Preprocess/Tokenizers/MagpiePhonemeTokenizer.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Pipeline/Synthesize/MagpieKvCache.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Pipeline/Synthesize/MagpieNanocodec.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Pipeline/Synthesize/MagpiePrefill.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Pipeline/Synthesize/MagpieSynthesizer.swift create mode 100644 Sources/FluidAudio/TTS/Magpie/Shared/NpyReader.swift create mode 100644 Sources/FluidAudioCLI/Commands/MagpieCommand.swift create mode 100644 Tests/FluidAudioTests/TTS/Magpie/MagpieConstantsTests.swift create mode 100644 Tests/FluidAudioTests/TTS/Magpie/MagpieIpaOverrideTests.swift create mode 100644 Tests/FluidAudioTests/TTS/Magpie/MagpieKvCacheTests.swift create mode 100644 Tests/FluidAudioTests/TTS/Magpie/MagpieNpyReaderTests.swift diff --git a/README.md b/README.md index d49573303..3f3aeee52 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Want to convert your own model? 
Check [möbius](https://github.com/FluidInferenc - **Automatic Speech Recognition (ASR)**: [Parakeet TDT v3](Documentation/Models.md#batch-transcription-near-real-time) (0.6b) and other TDT/CTC models for batch transcription supporting 25 European languages, Japanese, and Chinese; [Parakeet EOU](Documentation/Models.md#streaming-transcription-true-real-time) (120m) for streaming ASR with end-of-utterance detection (English only). See all [ASR models](Documentation/Models.md#asr-models). - **Inverse Text Normalization (ITN)**: Post-process ASR output to convert spoken-form to written-form ("two hundred" → "200"). See [text-processing-rs](https://github.com/FluidInference/text-processing-rs) -- **Text-to-Speech (TTS)**: Kokoro (82m) for parallel synthesis with SSML and pronunciation control across 9 languages (EN, ES, FR, HI, IT, JA, PT, ZH); PocketTTS for streaming TTS with voice cloning support (English only) +- **Text-to-Speech (TTS)**: Kokoro (82m) for parallel synthesis with SSML and pronunciation control across 9 languages (EN, ES, FR, HI, IT, JA, PT, ZH); PocketTTS for streaming TTS with voice cloning support (English only); Magpie (357m) autoregressive multilingual TTS with 5 speakers, `|…|` IPA override, and 8-language coverage (EN, ES, DE, FR, IT, VI, ZH, HI) - **Speaker Diarization (Online + Offline)**: Speaker separation and identification across audio streams. Streaming pipeline for real-time processing and offline batch pipeline with advanced clustering. - **Speaker Embedding Extraction**: Generate speaker embeddings for voice comparison and clustering, you can use this for speaker identification - **Voice Activity Detection (VAD)**: Voice activity detection with Silero models @@ -596,6 +596,46 @@ swift run fluidaudiocli tts "Hello from FluidAudio." --auto-download --output ou Dictionary and model assets are cached under `~/.cache/fluidaudio/Models/kokoro`. 
+### Magpie (Multilingual) + +Magpie TTS Multilingual (357M) is NVIDIA's autoregressive encoder-decoder TTS with 8-codebook NanoCodec vocoder output at 22.05 kHz. It exposes 5 built-in speakers and supports 8 languages (English, Spanish, German, French, Italian, Vietnamese, Mandarin, Hindi) with a `|…|` IPA override that routes inline phoneme sequences directly to the tokenizer. Japanese is deferred pending OpenJTalk integration. + +```swift +import FluidAudio + +Task { + let manager = try await MagpieTtsManager.downloadAndCreate( + languages: [.english, .spanish] + ) + let result = try await manager.synthesize( + text: "Hello | ˈ n ɛ m o ʊ | from FluidAudio.", + speaker: .john, + language: .english + ) + let wav = AudioWAV.data(from: result.samples, sampleRate: result.sampleRate) + try wav.write(to: URL(fileURLWithPath: "hello.wav")) +} +``` + +```bash +# Pre-download assets for selected languages +swift run fluidaudiocli magpie download --languages en,es + +# Synthesize with IPA override enabled (default) +swift run fluidaudiocli magpie text --text "Hello | ˈ n ɛ m o ʊ |." \ + --speaker 0 --language en --output hello.wav + +# Classifier-free guidance and sampling controls +swift run fluidaudiocli magpie text --text "Bonjour." --language fr \ + --cfg 1.3 --temperature 0.6 --topk 80 --seed 42 --output bonjour.wav + +# Fixture-driven parity harness (tokenizer / full pipeline) +swift run fluidaudiocli magpie tokenizer-parity --fixture fixture_en.json +swift run fluidaudiocli magpie parity --fixture fixture_en.npz +``` + +Assets (4 CoreML models + `constants/` + per-language tokenizer files) are fetched from [`FluidInference/magpie-tts-multilingual-357m-coreml`](https://huggingface.co/FluidInference/magpie-tts-multilingual-357m-coreml) on first use. The 1-layer local transformer (256d, top-k + temperature sampling, forbidden-token mask) runs on CPU via Accelerate/BNNS; the 12-layer decoder KV cache is rolled stateful across steps. 
+ ## Continuous Integration - `tests.yml`: Default build matrix covering SwiftPM tests and an iOS archive smoke test. diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 69264524c..02f57132d 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -29,6 +29,7 @@ public enum Repo: String, CaseIterable, Sendable { case multilingualG2p = "FluidInference/charsiu-g2p-byt5-coreml" case parakeetTdtCtc110m = "FluidInference/parakeet-tdt-ctc-110m-coreml" case cohereTranscribeCoreml = "FluidInference/cohere-transcribe-03-2026-coreml/q8" + case magpieTts = "FluidInference/magpie-tts-multilingual-357m-coreml" /// Repository slug (without owner) public var name: String { @@ -81,6 +82,8 @@ public enum Repo: String, CaseIterable, Sendable { return "parakeet-tdt-ctc-110m-coreml" case .cohereTranscribeCoreml: return "cohere-transcribe-03-2026-coreml/q8" + case .magpieTts: + return "magpie-tts-multilingual-357m-coreml" } } @@ -171,6 +174,8 @@ public enum Repo: String, CaseIterable, Sendable { return "parakeet-tdt-ctc-110m" case .cohereTranscribeCoreml: return "cohere-transcribe/q8" + case .magpieTts: + return "magpie-tts" default: return name.replacingOccurrences(of: "-coreml", with: "") } @@ -591,6 +596,35 @@ public enum ModelNames { ] } + /// Magpie TTS Multilingual 357M model names. + /// + /// Four CoreML models + a `constants/` directory + a `tokenizer/` directory of + /// per-language lookup data. The `decoder_prefill` model is optional; when + /// absent the prefill runs step-by-step through `decoder_step`. 
+ public enum Magpie { + public static let textEncoder = "text_encoder" + public static let decoderPrefill = "decoder_prefill" + public static let decoderStep = "decoder_step" + public static let nanocodecDecoder = "nanocodec_decoder" + + public static let textEncoderFile = textEncoder + ".mlmodelc" + public static let decoderPrefillFile = decoderPrefill + ".mlmodelc" + public static let decoderStepFile = decoderStep + ".mlmodelc" + public static let nanocodecDecoderFile = nanocodecDecoder + ".mlmodelc" + + public static let constantsDir = "constants" + public static let tokenizerDir = "tokenizer" + + /// Files required for English synthesis. Other languages append their own + /// lookup files on top (see `MagpieResourceDownloader`). + public static let requiredModels: Set = [ + textEncoderFile, + decoderStepFile, + nanocodecDecoderFile, + constantsDir, + ] + } + /// Multilingual G2P (CharsiuG2P ByT5) model names public enum MultilingualG2P { public static let encoder = "MultilingualG2PEncoder" @@ -760,6 +794,8 @@ public enum ModelNames { return ModelNames.MultilingualG2P.requiredModels case .cohereTranscribeCoreml: return ModelNames.CohereTranscribe.requiredModels + case .magpieTts: + return ModelNames.Magpie.requiredModels } } } diff --git a/Sources/FluidAudio/TTS/Magpie/Assets/MagpieConstantsStore.swift b/Sources/FluidAudio/TTS/Magpie/Assets/MagpieConstantsStore.swift new file mode 100644 index 000000000..fa7ad3cc0 --- /dev/null +++ b/Sources/FluidAudio/TTS/Magpie/Assets/MagpieConstantsStore.swift @@ -0,0 +1,241 @@ +import Foundation + +/// Decoded shape / hyperparameter metadata from `constants/constants.json`. +/// +/// The field names mirror the Python exporter +/// (`mobius/.../export_constants.py`). Unknown keys are ignored so the exporter +/// can add fields without breaking Swift. All fields have safe defaults matching +/// the published 357M checkpoint so the Swift port remains usable if a key is +/// dropped in a future rebuild. 
+public struct MagpieModelConfig: Sendable, Decodable { + public let dModel: Int + public let numDecoderLayers: Int + public let numHeads: Int + public let headDim: Int + public let numCodebooks: Int + public let numCodesPerCodebook: Int + public let maxCacheLength: Int + public let maxTextLength: Int + public let audioBosId: Int32 + public let audioEosId: Int32 + public let speakerContextLength: Int + + enum CodingKeys: String, CodingKey { + case dModel = "d_model" + case numDecoderLayers = "num_decoder_layers" + case numHeads = "num_heads" + case headDim = "head_dim" + case numCodebooks = "num_codebooks" + case numCodesPerCodebook = "num_codes_per_codebook" + case maxCacheLength = "max_cache_length" + case maxTextLength = "max_text_length" + case audioBosId = "audio_bos_id" + case audioEosId = "audio_eos_id" + case speakerContextLength = "speaker_context_length" + } + + public init(from decoder: Decoder) throws { + let c = try decoder.container(keyedBy: CodingKeys.self) + dModel = (try? c.decode(Int.self, forKey: .dModel)) ?? MagpieConstants.dModel + numDecoderLayers = + (try? c.decode(Int.self, forKey: .numDecoderLayers)) ?? MagpieConstants.numDecoderLayers + numHeads = (try? c.decode(Int.self, forKey: .numHeads)) ?? MagpieConstants.numHeads + headDim = (try? c.decode(Int.self, forKey: .headDim)) ?? MagpieConstants.headDim + numCodebooks = + (try? c.decode(Int.self, forKey: .numCodebooks)) ?? MagpieConstants.numCodebooks + numCodesPerCodebook = + (try? c.decode(Int.self, forKey: .numCodesPerCodebook)) + ?? MagpieConstants.numCodesPerCodebook + maxCacheLength = + (try? c.decode(Int.self, forKey: .maxCacheLength)) ?? MagpieConstants.maxCacheLength + maxTextLength = + (try? c.decode(Int.self, forKey: .maxTextLength)) ?? MagpieConstants.maxTextLength + audioBosId = (try? c.decode(Int32.self, forKey: .audioBosId)) ?? MagpieConstants.audioBosId + audioEosId = (try? c.decode(Int32.self, forKey: .audioEosId)) ?? 
MagpieConstants.audioEosId + speakerContextLength = + (try? c.decode(Int.self, forKey: .speakerContextLength)) + ?? MagpieConstants.speakerContextLength + } + + public init( + dModel: Int = MagpieConstants.dModel, + numDecoderLayers: Int = MagpieConstants.numDecoderLayers, + numHeads: Int = MagpieConstants.numHeads, + headDim: Int = MagpieConstants.headDim, + numCodebooks: Int = MagpieConstants.numCodebooks, + numCodesPerCodebook: Int = MagpieConstants.numCodesPerCodebook, + maxCacheLength: Int = MagpieConstants.maxCacheLength, + maxTextLength: Int = MagpieConstants.maxTextLength, + audioBosId: Int32 = MagpieConstants.audioBosId, + audioEosId: Int32 = MagpieConstants.audioEosId, + speakerContextLength: Int = MagpieConstants.speakerContextLength + ) { + self.dModel = dModel + self.numDecoderLayers = numDecoderLayers + self.numHeads = numHeads + self.headDim = headDim + self.numCodebooks = numCodebooks + self.numCodesPerCodebook = numCodesPerCodebook + self.maxCacheLength = maxCacheLength + self.maxTextLength = maxTextLength + self.audioBosId = audioBosId + self.audioEosId = audioEosId + self.speakerContextLength = speakerContextLength + } +} + +/// Decoded metadata from `constants/speaker_info.json`. +public struct MagpieSpeakerInfo: Sendable, Decodable { + public let contextLength: Int + public let dim: Int + public let names: [String] + + enum CodingKeys: String, CodingKey { + case contextLength = "context_length" + case dim = "dim" + case names = "names" + case T = "T" + case D = "D" + } + + public init(from decoder: Decoder) throws { + let c = try decoder.container(keyedBy: CodingKeys.self) + contextLength = + (try? c.decode(Int.self, forKey: .contextLength)) + ?? (try? c.decode(Int.self, forKey: .T)) + ?? MagpieConstants.speakerContextLength + dim = + (try? c.decode(Int.self, forKey: .dim)) + ?? (try? c.decode(Int.self, forKey: .D)) + ?? MagpieConstants.dModel + let decodedNames = (try? c.decode([String].self, forKey: .names)) ?? 
[] + if decodedNames.isEmpty { + names = MagpieSpeakerInfo.defaultNames + } else { + names = decodedNames + } + } + + /// Direct initializer used by the fallback path when `speaker_info.json` + /// is missing. Keeps us out of synthesizing fake Decoder instances. + public init( + contextLength: Int = MagpieConstants.speakerContextLength, + dim: Int = MagpieConstants.dModel, + names: [String] = MagpieSpeakerInfo.defaultNames + ) { + self.contextLength = contextLength + self.dim = dim + self.names = names + } + + public static let defaultNames: [String] = ["John", "Sofia", "Aria", "Jason", "Leo"] +} + +/// Loaded constants: config, speaker info, per-speaker embeddings (fp32), per-codebook +/// audio embeddings (fp32). All arrays are stored row-major. +public struct MagpieConstantsBundle: Sendable { + public let config: MagpieModelConfig + public let speakers: MagpieSpeakerInfo + /// Shape: [numSpeakers][contextLength × dModel]. Row-major. + public let speakerEmbeddings: [[Float]] + /// Shape: [numCodebooks][numCodesPerCodebook × dModel]. Row-major. + public let audioEmbeddings: [[Float]] + /// Text tokenizer EOS id (from `tokenizer_metadata.json`; 0 if absent). + public let textEosId: Int32 +} + +/// Loads Magpie constants from a directory (typically `/constants/`). +public enum MagpieConstantsLoader { + + private static let logger = AppLogger(category: "MagpieConstantsLoader") + + public static func load(from constantsDir: URL) throws -> MagpieConstantsBundle { + let config = try loadConfig(from: constantsDir) + let speakers = try loadSpeakerInfo(from: constantsDir) + + var speakerEmbeddings: [[Float]] = [] + speakerEmbeddings.reserveCapacity(MagpieConstants.numSpeakers) + for idx in 0.. Int32 { + let url = dir.appendingPathComponent(MagpieConstants.Files.tokenizerMetadataJson) + guard FileManager.default.fileExists(atPath: url.path), + let data = try? Data(contentsOf: url), + let json = try? JSONSerialization.jsonObject(with: data) as? 
[String: Any] + else { + return 0 + } + if let eos = json["eos_token_id"] as? Int { + return Int32(eos) + } + if let eos = json["text_eos_id"] as? Int { + return Int32(eos) + } + return 0 + } + + private static func loadConfig(from dir: URL) throws -> MagpieModelConfig { + let url = dir.appendingPathComponent(MagpieConstants.Files.constantsJson) + guard FileManager.default.fileExists(atPath: url.path) else { + logger.warning("constants.json missing; falling back to built-in defaults") + return MagpieModelConfig() + } + do { + let data = try Data(contentsOf: url) + return try JSONDecoder().decode(MagpieModelConfig.self, from: data) + } catch { + throw MagpieError.invalidConstants("constants.json: \(error)") + } + } + + private static func loadSpeakerInfo(from dir: URL) throws -> MagpieSpeakerInfo { + let url = dir.appendingPathComponent(MagpieConstants.Files.speakerInfoJson) + guard FileManager.default.fileExists(atPath: url.path) else { + logger.warning("speaker_info.json missing; falling back to built-in defaults") + return MagpieSpeakerInfo() + } + do { + let data = try Data(contentsOf: url) + return try JSONDecoder().decode(MagpieSpeakerInfo.self, from: data) + } catch { + throw MagpieError.invalidConstants("speaker_info.json: \(error)") + } + } +} diff --git a/Sources/FluidAudio/TTS/Magpie/Assets/MagpieLocalTransformerWeights.swift b/Sources/FluidAudio/TTS/Magpie/Assets/MagpieLocalTransformerWeights.swift new file mode 100644 index 000000000..f5cc371a5 --- /dev/null +++ b/Sources/FluidAudio/TTS/Magpie/Assets/MagpieLocalTransformerWeights.swift @@ -0,0 +1,162 @@ +import Foundation + +/// Weights for the Swift-side 1-layer Local Transformer that samples the 8 +/// codebook tokens per frame. +/// +/// Shapes match the NumPy reference in `mobius/models/tts/magpie/coreml/generate_coreml.py` +/// (fn `local_transformer_forward`). All arrays are kept row-major fp32 so the +/// Accelerate + BNNS forward pass can consume them directly. 
+public struct MagpieLocalTransformerWeights: Sendable { + // Input projection: (localDim, dModel) weight + (localDim,) bias. + public let inProjWeight: [Float] + public let inProjBias: [Float] + /// Positional embedding slots: (maxPositions, localDim). + public let posEmbedding: [Float] + /// RMSNorm / LayerNorm weights: (localDim,) each. + public let norm1Weight: [Float] + public let norm2Weight: [Float] + /// Self-attention QKV weight: (3*localDim, localDim). + public let saQkvWeight: [Float] + /// Self-attention output weight: (localDim, localDim). + public let saOWeight: [Float] + /// FFN conv kernel=1: (ffnDim, localDim) then (localDim, ffnDim). + public let ffnConv1Weight: [Float] + public let ffnConv2Weight: [Float] + /// Per-codebook output heads: 8× (numCodesPerCodebook, localDim) + (numCodesPerCodebook,). + public let outProjWeights: [[Float]] + public let outProjBiases: [[Float]] + + // Cached dimensions for convenience. + public let localDim: Int + public let dModel: Int + public let ffnDim: Int + public let maxPositions: Int + public let numCodebooks: Int + public let numCodesPerCodebook: Int +} + +public enum MagpieLocalTransformerLoader { + + private static let logger = AppLogger(category: "MagpieLocalTransformerLoader") + + /// Loads all `local_transformer/*.npy` files from `constantsDir`. 
+ public static func load( + from constantsDir: URL, + config: MagpieModelConfig + ) throws -> MagpieLocalTransformerWeights { + let ltDir = constantsDir.appendingPathComponent(MagpieConstants.Files.localTransformerDir) + guard FileManager.default.fileExists(atPath: ltDir.path) else { + throw MagpieError.modelFileNotFound(MagpieConstants.Files.localTransformerDir) + } + + let localDim = MagpieConstants.localTransformerDim + let ffnDim = MagpieConstants.localTransformerFfnDim + let maxPositions = MagpieConstants.localTransformerMaxPositions + let dModel = config.dModel + let numCodebooks = config.numCodebooks + let numCodesPerCodebook = config.numCodesPerCodebook + + func loadNpy(_ name: String, expecting shape: [Int]) throws -> [Float] { + let url = ltDir.appendingPathComponent(name) + guard FileManager.default.fileExists(atPath: url.path) else { + throw MagpieError.modelFileNotFound("\(MagpieConstants.Files.localTransformerDir)/\(name)") + } + let array = try NpyReader.read(from: url) + try array.assertShape(shape, label: name) + return array.data + } + + let inProjWeight = try loadNpy( + MagpieConstants.Files.LocalTransformer.inProjWeight, + expecting: [localDim, dModel]) + let inProjBias = try loadNpy( + MagpieConstants.Files.LocalTransformer.inProjBias, + expecting: [localDim]) + let posEmbedding = try loadNpy( + MagpieConstants.Files.LocalTransformer.posEmb, + expecting: [maxPositions, localDim]) + let norm1Weight = try loadNpy( + MagpieConstants.Files.LocalTransformer.norm1Weight, + expecting: [localDim]) + let norm2Weight = try loadNpy( + MagpieConstants.Files.LocalTransformer.norm2Weight, + expecting: [localDim]) + let saQkvWeight = try loadNpy( + MagpieConstants.Files.LocalTransformer.saQkvWeight, + expecting: [3 * localDim, localDim]) + let saOWeight = try loadNpy( + MagpieConstants.Files.LocalTransformer.saOWeight, + expecting: [localDim, localDim]) + // Conv1d kernel=1 is effectively (out, in) matmul; the exporter keeps + // the trailing kernel dim so 
we accept either [out, in] or [out, in, 1]. + let ffnConv1Weight = try loadFlexible( + name: MagpieConstants.Files.LocalTransformer.ffnConv1Weight, + directory: ltDir, + primary: [ffnDim, localDim], + alternate: [ffnDim, localDim, 1]) + let ffnConv2Weight = try loadFlexible( + name: MagpieConstants.Files.LocalTransformer.ffnConv2Weight, + directory: ltDir, + primary: [localDim, ffnDim], + alternate: [localDim, ffnDim, 1]) + + var outProjWeights: [[Float]] = [] + var outProjBiases: [[Float]] = [] + outProjWeights.reserveCapacity(numCodebooks) + outProjBiases.reserveCapacity(numCodebooks) + for cb in 0.. [Float] { + let url = directory.appendingPathComponent(name) + guard FileManager.default.fileExists(atPath: url.path) else { + throw MagpieError.modelFileNotFound( + "\(MagpieConstants.Files.localTransformerDir)/\(name)") + } + let array = try NpyReader.read(from: url) + if array.shape == primary || array.shape == alternate { + return array.data + } + throw MagpieError.invalidNpyFile( + path: name, + reason: "expected shape \(primary) or \(alternate), got \(array.shape)") + } +} diff --git a/Sources/FluidAudio/TTS/Magpie/Assets/MagpieModelStore.swift b/Sources/FluidAudio/TTS/Magpie/Assets/MagpieModelStore.swift new file mode 100644 index 000000000..8c5c2a777 --- /dev/null +++ b/Sources/FluidAudio/TTS/Magpie/Assets/MagpieModelStore.swift @@ -0,0 +1,182 @@ +@preconcurrency import CoreML +import Foundation + +/// Actor-based store for Magpie CoreML models + constants + LocalTransformer weights. +/// +/// Manages loading of 3 required models (text_encoder, decoder_step, nanocodec_decoder) +/// and 1 optional model (decoder_prefill). Also holds the pre-loaded +/// `MagpieConstantsBundle` and `MagpieLocalTransformerWeights` so the synthesizer +/// can hit all assets from a single entry point. +public actor MagpieModelStore { + + private let logger = AppLogger(category: "MagpieModelStore") + + private var textEncoderModel: MLModel? 
+ private var decoderPrefillModel: MLModel? // optional fast path + private var decoderStepModel: MLModel? + private var nanocodecDecoderModel: MLModel? + + private var constantsBundle: MagpieConstantsBundle? + private var localTransformerWeights: MagpieLocalTransformerWeights? + + private var repoDirectory: URL? + + private let directory: URL? + private let computeUnits: MLComputeUnits + private let preferredLanguages: Set + + /// - Parameters: + /// - directory: Optional override for the base cache directory. + /// - computeUnits: CoreML compute preference for all models. + /// - preferredLanguages: Set of languages whose tokenizer data should be fetched. + public init( + directory: URL? = nil, + computeUnits: MLComputeUnits = .cpuAndNeuralEngine, + preferredLanguages: Set = [.english] + ) { + self.directory = directory + self.computeUnits = computeUnits + self.preferredLanguages = preferredLanguages + } + + /// Download (if missing) and load all Magpie CoreML models + constants. + public func loadIfNeeded() async throws { + if textEncoderModel != nil { + return + } + + let repoDir = try await MagpieResourceDownloader.ensureAssets( + languages: preferredLanguages, + directory: directory, + includePrefill: true + ) + self.repoDirectory = repoDir + + logger.info("Loading Magpie CoreML models from \(repoDir.path)…") + + let config = MLModelConfiguration() + config.computeUnits = computeUnits + + let loadStart = Date() + + textEncoderModel = try loadModel( + repoDir: repoDir, + fileName: ModelNames.Magpie.textEncoderFile, + config: config, + required: true) + + decoderStepModel = try loadModel( + repoDir: repoDir, + fileName: ModelNames.Magpie.decoderStepFile, + config: config, + required: true) + + nanocodecDecoderModel = try loadModel( + repoDir: repoDir, + fileName: ModelNames.Magpie.nanocodecDecoderFile, + config: config, + required: true) + + decoderPrefillModel = try loadModel( + repoDir: repoDir, + fileName: ModelNames.Magpie.decoderPrefillFile, + config: 
config, + required: false) + + let elapsed = Date().timeIntervalSince(loadStart) + logger.info( + "Magpie models loaded in \(String(format: "%.2f", elapsed))s (prefill \(decoderPrefillModel == nil ? "absent" : "present"))" + ) + + // Load constants + local transformer weights. + let constantsDir = MagpieResourceDownloader.constantsDirectory(in: repoDir) + let bundle = try MagpieConstantsLoader.load(from: constantsDir) + constantsBundle = bundle + localTransformerWeights = try MagpieLocalTransformerLoader.load( + from: constantsDir, config: bundle.config) + } + + public func textEncoder() throws -> MLModel { + guard let model = textEncoderModel else { + throw MagpieError.notInitialized + } + return model + } + + public func decoderStep() throws -> MLModel { + guard let model = decoderStepModel else { + throw MagpieError.notInitialized + } + return model + } + + public func nanocodecDecoder() throws -> MLModel { + guard let model = nanocodecDecoderModel else { + throw MagpieError.notInitialized + } + return model + } + + public func decoderPrefill() -> MLModel? { + decoderPrefillModel + } + + public func constants() throws -> MagpieConstantsBundle { + guard let bundle = constantsBundle else { + throw MagpieError.notInitialized + } + return bundle + } + + public func localTransformer() throws -> MagpieLocalTransformerWeights { + guard let weights = localTransformerWeights else { + throw MagpieError.notInitialized + } + return weights + } + + public func repoDir() throws -> URL { + guard let dir = repoDirectory else { + throw MagpieError.notInitialized + } + return dir + } + + /// Release all loaded models + constants. Resource downloads on disk are kept. 
+ public func unload() { + textEncoderModel = nil + decoderPrefillModel = nil + decoderStepModel = nil + nanocodecDecoderModel = nil + constantsBundle = nil + localTransformerWeights = nil + } + + // MARK: - Helpers + + private func loadModel( + repoDir: URL, fileName: String, config: MLModelConfiguration, required: Bool + ) throws -> MLModel? { + let modelURL = repoDir.appendingPathComponent(fileName) + guard FileManager.default.fileExists(atPath: modelURL.path) else { + if required { + throw MagpieError.modelFileNotFound(fileName) + } else { + logger.notice("Optional model \(fileName) not present; skipping") + return nil + } + } + do { + let model = try MLModel(contentsOf: modelURL, configuration: config) + logger.info("Loaded \(fileName)") + return model + } catch { + if required { + throw MagpieError.corruptedModel(fileName, underlying: "\(error)") + } else { + logger.warning("Failed to load optional \(fileName): \(error)") + return nil + } + } + } +} diff --git a/Sources/FluidAudio/TTS/Magpie/Assets/MagpieResourceDownloader.swift b/Sources/FluidAudio/TTS/Magpie/Assets/MagpieResourceDownloader.swift new file mode 100644 index 000000000..1462c28a4 --- /dev/null +++ b/Sources/FluidAudio/TTS/Magpie/Assets/MagpieResourceDownloader.swift @@ -0,0 +1,195 @@ +import Foundation + +/// Downloads Magpie TTS models, constants, and per-language tokenizer data from HuggingFace. 
+///
+/// The HF repo (`FluidInference/magpie-tts-multilingual-357m-coreml`) ships:
+/// - 3 required CoreML models + 1 optional prefill model at the repo root
+/// - `constants/` with model config, speaker embeddings, audio codebook tables, and
+///   the local-transformer weights (downloaded as one subtree)
+/// - `tokenizer/` with per-language lookup data (lazy per language)
+public enum MagpieResourceDownloader {
+
+    private static let logger = AppLogger(category: "MagpieResourceDownloader")
+
+    /// Ensure the CoreML models + `constants/` directory are present locally, and
+    /// ensure tokenizer data for each requested language is present.
+    ///
+    /// - Parameters:
+    ///   - languages: Languages whose tokenizer lookup files must be available locally.
+    ///   - directory: Override for the models root; defaults to the shared cache root.
+    ///   - includePrefill: Also fetch the optional `decoder_prefill` model (best effort —
+    ///     a failure here is logged and synthesis falls back to step-by-step prefill).
+    ///   - progressHandler: Forwarded to the repo download for progress reporting.
+    /// - Returns: The resolved repo directory (i.e. the root containing the `.mlmodelc` files).
+    /// - Throws: `MagpieError` for cache/tokenizer failures, or any error from the repo download.
+    public static func ensureAssets(
+        languages: Set<MagpieLanguage> = [.english],
+        directory: URL? = nil,
+        includePrefill: Bool = true,
+        progressHandler: DownloadUtils.ProgressHandler? = nil
+    ) async throws -> URL {
+        let modelsRoot = try directory ?? defaultCacheRoot()
+        let repoDir = modelsRoot.appendingPathComponent(Repo.magpieTts.folderName)
+
+        // Only hit the network when at least one required model bundle is absent;
+        // the repo download fetches everything (models + constants/) in one pass.
+        let rootModelsPresent = ModelNames.Magpie.requiredModels.allSatisfy { entry in
+            FileManager.default.fileExists(atPath: repoDir.appendingPathComponent(entry).path)
+        }
+
+        if !rootModelsPresent {
+            logger.info("Downloading Magpie TTS models from HuggingFace…")
+            try await DownloadUtils.downloadRepo(
+                .magpieTts, to: modelsRoot, progressHandler: progressHandler)
+        } else {
+            logger.info("Magpie TTS models found in cache")
+        }
+
+        if includePrefill {
+            let prefillURL = repoDir.appendingPathComponent(ModelNames.Magpie.decoderPrefillFile)
+            if !FileManager.default.fileExists(atPath: prefillURL.path) {
+                logger.info("Fetching optional decoder_prefill model")
+                do {
+                    try await DownloadUtils.downloadSubdirectory(
+                        .magpieTts,
+                        subdirectory: ModelNames.Magpie.decoderPrefillFile,
+                        to: repoDir
+                    )
+                } catch {
+                    // The prefill model is an optimization, not a requirement, so a
+                    // failed fetch is deliberately swallowed after logging.
+                    logger.warning(
+                        "decoder_prefill unavailable; falling back to step-by-step prefill: \(error)"
+                    )
+                }
+            }
+        }
+
+        for language in languages {
+            try await ensureTokenizer(for: language, repoDirectory: repoDir)
+        }
+
+        return repoDir
+    }
+
+    /// Ensure tokenizer data for `language` exists locally, downloading any missing
+    /// files individually from the repo's `tokenizer/` subtree.
+    ///
+    /// NOTE(review): `MagpieTokenizerFiles.files(for:)` currently returns at least one
+    /// file for every language, so the empty-list early return below is defensive —
+    /// it only takes effect if a future language uses pure byte-level encoding.
+    ///
+    /// - Throws: `MagpieError.downloadFailed` when the HF URL cannot be resolved,
+    ///   `MagpieError.tokenizerDataMissing` when a file cannot be fetched or written.
+    public static func ensureTokenizer(
+        for language: MagpieLanguage, repoDirectory: URL
+    ) async throws {
+        let files = MagpieTokenizerFiles.files(for: language)
+        if files.isEmpty { return }
+
+        let tokenizerDir = repoDirectory.appendingPathComponent(ModelNames.Magpie.tokenizerDir)
+        if !FileManager.default.fileExists(atPath: tokenizerDir.path) {
+            try FileManager.default.createDirectory(
+                at: tokenizerDir, withIntermediateDirectories: true)
+        }
+
+        for file in files {
+            let localURL = tokenizerDir.appendingPathComponent(file)
+            // Already cached — nothing to do for this file.
+            if FileManager.default.fileExists(atPath: localURL.path) { continue }
+
+            let remotePath = "\(ModelNames.Magpie.tokenizerDir)/\(file)"
+            logger.info("Downloading Magpie tokenizer file: \(remotePath)")
+            let remoteURL: URL
+            do {
+                remoteURL = try ModelRegistry.resolveModel(Repo.magpieTts.remotePath, remotePath)
+            } catch {
+                throw MagpieError.downloadFailed(
+                    "failed to resolve HF URL for \(remotePath): \(error)")
+            }
+
+            do {
+                let data = try await AssetDownloader.fetchData(
+                    from: remoteURL,
+                    description: "magpie tokenizer \(file)",
+                    logger: logger
+                )
+                // Atomic write so a partially-downloaded file never poisons the cache.
+                try data.write(to: localURL, options: [.atomic])
+            } catch {
+                // Wrap any fetch/write failure in a typed error so callers can see
+                // exactly which language + file is missing. (A previous comment here
+                // claimed 404s were logged and skipped; the code has always rethrown.)
+                throw MagpieError.tokenizerDataMissing(
+                    language: language.rawValue, file: file)
+            }
+        }
+    }
+
+    /// Return the directory that holds the compiled `.mlmodelc` bundles (for loading).
+    public static func modelDirectory(in repoDirectory: URL) -> URL {
+        repoDirectory
+    }
+
+    /// Return the directory that holds constants (JSON + npy + local_transformer/).
+    public static func constantsDirectory(in repoDirectory: URL) -> URL {
+        repoDirectory.appendingPathComponent(ModelNames.Magpie.constantsDir)
+    }
+
+    /// Return the directory that holds per-language tokenizer lookups.
+    public static func tokenizerDirectory(in repoDirectory: URL) -> URL {
+        repoDirectory.appendingPathComponent(ModelNames.Magpie.tokenizerDir)
+    }
+
+    /// Resolve (and create if needed) the default on-disk cache root for models.
+    /// macOS uses `~/.cache/fluidaudio/Models`; other platforms use the user caches
+    /// directory with the same `fluidaudio/Models` suffix.
+    /// - Throws: `MagpieError.downloadFailed` if no caches directory can be located.
+    private static func defaultCacheRoot() throws -> URL {
+        let base: URL
+        #if os(macOS)
+        // XDG-style dot-cache under the home directory on macOS.
+        base = FileManager.default.homeDirectoryForCurrentUser
+            .appendingPathComponent(".cache")
+        #else
+        guard
+            let first = FileManager.default.urls(for: .cachesDirectory, in: .userDomainMask).first
+        else {
+            throw MagpieError.downloadFailed("failed to locate caches directory")
+        }
+        base = first
+        #endif
+        let root = base.appendingPathComponent("fluidaudio").appendingPathComponent("Models")
+        if !FileManager.default.fileExists(atPath: root.path) {
+            try FileManager.default.createDirectory(at: root, withIntermediateDirectories: true)
+        }
+        return root
+    }
+}
+
+/// Authoritative list of per-language tokenizer files. The emitters in
+/// `mobius/models/tts/magpie/export_tokenizers.py` produce these names; the Swift
+/// tokenizers consume them.
+public enum MagpieTokenizerFiles {
+    /// Tokenizer filenames emitted by
+    /// `mobius/models/tts/magpie/coreml/export_tokenizers.py`. The naming convention
+    /// is `{tokenizer_name}_{suffix}.json` where `tokenizer_name` follows the NeMo
+    /// AggregatedTTSTokenizer names (e.g. `english_phoneme`, `french_chartokenizer`).
+    ///
+    /// These names must match the HF repo's `tokenizer/` subtree byte-for-byte —
+    /// `MagpieResourceDownloader.ensureTokenizer` uses them as remote paths.
+    public static func files(for language: MagpieLanguage) -> [String] {
+        let base = tokenizerName(for: language)
+        switch language {
+        case .english, .spanish, .italian, .vietnamese:
+            // IPA G2P: token2id + phoneme_dict.
+            return ["\(base)_token2id.json", "\(base)_phoneme_dict.json"]
+        case .german:
+            // IPA G2P with heteronym fallback.
+            return [
+                "\(base)_token2id.json",
+                "\(base)_phoneme_dict.json",
+                "\(base)_heteronyms.json",
+            ]
+        case .french, .hindi:
+            // Char-based tokenizers: only token2id lookup.
+            return ["\(base)_token2id.json"]
+        case .mandarin:
+            // pypinyin (phrase + char) + tone / letter / token2id maps.
+            // NOTE(review): the last three names are deliberately literal (prefixed
+            // `mandarin_` rather than `\(base)_`) — presumably matching the exporter's
+            // output; verify against export_tokenizers.py if filenames ever change.
+            return [
+                "\(base)_token2id.json",
+                "\(base)_pinyin_dict.json",
+                "\(base)_tone_dict.json",
+                "\(base)_ascii_letter_dict.json",
+                "mandarin_pypinyin_char_dict.json",
+                "mandarin_pypinyin_phrase_dict.json",
+                "mandarin_jieba_dict.json",
+            ]
+        }
+    }
+
+    /// NeMo tokenizer name for the given language (matches the Python map in
+    /// `generate_coreml._tokenize_text`).
+    public static func tokenizerName(for language: MagpieLanguage) -> String {
+        switch language {
+        case .english: return "english_phoneme"
+        case .spanish: return "spanish_phoneme"
+        case .german: return "german_phoneme"
+        case .italian: return "italian_phoneme"
+        case .vietnamese: return "vietnamese_phoneme"
+        case .mandarin: return "mandarin_phoneme"
+        case .french: return "french_chartokenizer"
+        case .hindi: return "hindi_chartokenizer"
+        }
+    }
+}
diff --git a/Sources/FluidAudio/TTS/Magpie/LocalTransformer/MagpieLocalTransformer.swift b/Sources/FluidAudio/TTS/Magpie/LocalTransformer/MagpieLocalTransformer.swift
new file mode 100644
index 000000000..d0e1040d2
--- /dev/null
+++ b/Sources/FluidAudio/TTS/Magpie/LocalTransformer/MagpieLocalTransformer.swift
@@ -0,0 +1,298 @@
+import Accelerate
+import Foundation
+
+/// Swift-side 1-layer Local Transformer forward pass.
+///
+/// Mirrors `local_transformer_forward` in
+/// `mobius/models/tts/magpie/coreml/generate_coreml.py` (lines 108–155):
+/// pre-norm causal self-attention → pre-norm FFN with tanh-GELU. Single attention
+/// head, localDim=256. Uses BLAS (`cblas_sgemm`) for every matmul so the AR loop
+/// stays cache-resident.
+/// +/// The transformer is stateless across frames — each call to +/// `MagpieLocalTransformerSampler.sample(...)` rebuilds the sequence from the +/// current decoder hidden state and the 8 tokens sampled so far. +public struct MagpieLocalTransformer: Sendable { + + public let weights: MagpieLocalTransformerWeights + + public init(weights: MagpieLocalTransformerWeights) { + self.weights = weights + } + + /// Forward pass for a sequence of length `T` (T ≤ numCodebooks+2). + /// + /// - Parameter sequence: `[T * localDim]` row-major fp32 (input sequence + /// including positional embeddings yet to be added — this routine adds them). + /// Caller must supply `T` explicitly to avoid ambiguity on partial buffers. + /// - Returns: `[T * localDim]` row-major output. + public func forward(sequence: [Float], length T: Int) -> [Float] { + precondition(sequence.count >= T * weights.localDim, "sequence buffer too small") + precondition(T <= weights.maxPositions, "sequence length exceeds maxPositions") + + let D = weights.localDim + let ffnD = weights.ffnDim + + // x = sequence[:T*D] + posEmbedding[:T*D] + var x = Swift.Array(sequence.prefix(T * D)) + addPositional(into: &x, length: T) + + // ── Pre-norm causal self-attention ── + var xNorm = layerNorm(x, length: T, weight: weights.norm1Weight) + + // QKV = xNorm @ sa_qkv_weight.T (T,D) × (3D,D)ᵀ → (T, 3D) + var qkv = Swift.Array(repeating: 0, count: T * 3 * D) + matmulTransB( + a: xNorm, aRows: T, aCols: D, + b: weights.saQkvWeight, bRows: 3 * D, bCols: D, + out: &qkv) + + // Split QKV into Q, K, V (each T × D) + var q = Swift.Array(repeating: 0, count: T * D) + var k = Swift.Array(repeating: 0, count: T * D) + var v = Swift.Array(repeating: 0, count: T * D) + for t in 0...size) + memcpy(&k[dstOff], Swift.Array(qkv[(srcOff + D)..<(srcOff + 2 * D)]), D * MemoryLayout.size) + memcpy(&v[dstOff], Swift.Array(qkv[(srcOff + 2 * D)..<(srcOff + 3 * D)]), D * MemoryLayout.size) + } + + // attn = Q @ Kᵀ * scale (T × T) + var attn = 
Swift.Array(repeating: 0, count: T * T) + matmulTransB( + a: q, aRows: T, aCols: D, + b: k, bRows: T, bCols: D, + out: &attn) + let scale = Float(1.0 / sqrt(Double(D))) + var scaleVar = scale + vDSP_vsmul(attn, 1, &scaleVar, &attn, 1, vDSP_Length(T * T)) + + // Causal mask + softmax + for t in 0.. t (future). Then softmax over [0, t]. + var maxVal: Float = -.infinity + for j in 0...t { + if attn[t * T + j] > maxVal { maxVal = attn[t * T + j] } + } + var denom: Float = 0 + for j in 0.. 0 { + let invDenom = 1.0 / denom + for j in 0...t { + attn[t * T + j] *= invDenom + } + } + } + + // saOut = attn @ V (T × T) × (T × D) → (T × D) + var saOut = Swift.Array(repeating: 0, count: T * D) + matmul( + a: attn, aRows: T, aCols: T, + b: v, bRows: T, bCols: D, + out: &saOut) + + // saOut = saOut @ sa_o_weight.T (T, D) × (D, D)ᵀ → (T, D) + var saProj = Swift.Array(repeating: 0, count: T * D) + matmulTransB( + a: saOut, aRows: T, aCols: D, + b: weights.saOWeight, bRows: D, bCols: D, + out: &saProj) + + // x += saProj + vDSP_vadd(x, 1, saProj, 1, &x, 1, vDSP_Length(T * D)) + + // ── Pre-norm FFN ── + xNorm = layerNorm(x, length: T, weight: weights.norm2Weight) + + // h = gelu(xNorm @ ffn_conv1_weight.T) → (T, ffnD) + var h = Swift.Array(repeating: 0, count: T * ffnD) + matmulTransB( + a: xNorm, aRows: T, aCols: D, + b: weights.ffnConv1Weight, bRows: ffnD, bCols: D, + out: &h) + applyGeluTanh(into: &h) + + // x += h @ ffn_conv2_weight.T → (T, D) + var ffnOut = Swift.Array(repeating: 0, count: T * D) + matmulTransB( + a: h, aRows: T, aCols: ffnD, + b: weights.ffnConv2Weight, bRows: D, bCols: ffnD, + out: &ffnOut) + vDSP_vadd(x, 1, ffnOut, 1, &x, 1, vDSP_Length(T * D)) + + return x + } + + /// Project a (dModel,) decoder hidden state through the input projection + /// → (localDim,). Used by the sampler to seed the LT sequence. 
+ public func projectInput(hidden: [Float]) -> [Float] { + precondition(hidden.count == weights.dModel) + let D = weights.localDim + var out = weights.inProjBias // copy bias + // out += inProjWeight @ hidden (localDim, dModel) × (dModel,) → (localDim,) + inProjWeightApply(hidden: hidden, accumulate: &out) + _ = D + return out + } + + /// Compute logits for codebook `cb`: last-timestep out_proj head. + public func codebookLogits(lastHidden: [Float], codebook: Int) -> [Float] { + precondition(lastHidden.count == weights.localDim) + let numCodes = weights.numCodesPerCodebook + var logits = weights.outProjBiases[codebook] // copy bias (numCodes,) + // logits += outProjWeights[codebook] @ lastHidden (numCodes, localDim) × (localDim,) + let w = weights.outProjWeights[codebook] + w.withUnsafeBufferPointer { wPtr in + lastHidden.withUnsafeBufferPointer { hPtr in + logits.withUnsafeMutableBufferPointer { outPtr in + cblas_sgemv( + CblasRowMajor, CblasNoTrans, + Int32(numCodes), Int32(weights.localDim), + 1.0, + wPtr.baseAddress, Int32(weights.localDim), + hPtr.baseAddress, 1, + 1.0, + outPtr.baseAddress, 1) + } + } + } + return logits + } + + // MARK: - Private helpers + + private func addPositional(into buffer: inout [Float], length T: Int) { + let D = weights.localDim + let count = T * D + var tmp = buffer + weights.posEmbedding.withUnsafeBufferPointer { posPtr in + tmp.withUnsafeMutableBufferPointer { dstPtr in + // Only use first T rows of posEmbedding. 
+ vDSP_vadd( + dstPtr.baseAddress!, 1, + posPtr.baseAddress!, 1, + dstPtr.baseAddress!, 1, + vDSP_Length(count)) + } + } + buffer = tmp + } + + private func layerNorm(_ x: [Float], length T: Int, weight: [Float]) -> [Float] { + let D = weights.localDim + var out = Swift.Array(repeating: 0, count: T * D) + let eps: Float = 1e-5 + for t in 0..(repeating: 0, count: D) + vDSP_vsadd(row, 1, &negMean, ¢ered, 1, vDSP_Length(D)) + var variance: Float = 0 + var sqr = Swift.Array(repeating: 0, count: D) + vDSP_vsq(centered, 1, &sqr, 1, vDSP_Length(D)) + vDSP_meanv(sqr, 1, &variance, vDSP_Length(D)) + let invStd = 1.0 / sqrt(variance + eps) + var invStdVar = invStd + var normed = Swift.Array(repeating: 0, count: D) + vDSP_vsmul(centered, 1, &invStdVar, &normed, 1, vDSP_Length(D)) + // Multiply by weight elementwise. + vDSP_vmul(normed, 1, weight, 1, &normed, 1, vDSP_Length(D)) + for i in 0..