diff --git a/CHANGELOG.md b/CHANGELOG.md
index f84eec4..983b7fb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,10 @@
- **`./agui` subpath export** — `agui-stub.ts` wired into `tsdown.config.ts` and `package.json` exports, matching `./a2a`, `./mcp`, `./vector` pattern.
- **Complete AG-UI type exports** — All event types (reasoning, step, thinking, raw, custom, chunk), `AGUIBuildOpts`, `matchesAGUIFixture`, `AGUIReasoningEncryptedValueSubtype`, `AGUIMessageRole` now exported from package root.
- **`"warn"` log level** — New log level between `"silent"` and `"info"` with proper hierarchy. `AGUIMockOptions.logLevel` option added; AG-UI mock defaults to `"warn"` instead of `"silent"`.
+- **Non-speech audio generation** — Mock support for ElevenLabs sound effects (`/v1/sound-generation`) and music (`/v1/music/*`), fal.ai queue-based audio (`/fal/queue/submit/*`, `/fal/queue/requests/*`, `/fal/run/*`), Gemini HTTP audio via `generateContent`/`streamGenerateContent` with `inlineData` audio parts, and Gemini Live WebSocket audio. Convenience methods: `onAudio()`, `onSoundEffect()`, `onMusic()`, `onFalAudio()`. (PR #140, closes #118)
+- **AudioResponse broadened** — `audio` field now supports both `string` (base64) and `{ b64Json, contentType }` object form.
+- **Gemini audio recording** — Record and replay Gemini audio responses (both streaming SSE and non-streaming JSON).
+- **Router audio-gen filtering** — Bidirectional endpoint filtering for `audio-gen` and `fal-audio` endpoint types.
### Fixed
diff --git a/docs/index.html b/docs/index.html
index 536f024..cf1b9e1 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -1547,8 +1547,8 @@
Chaos Testing
🎨
Multimedia APIs
- Image generation, text-to-speech, audio transcription, and video generation —
- mock every multimedia endpoint with fixtures.
+ Image generation, text-to-speech, audio transcription, non-speech audio generation,
+ and video generation — mock every multimedia endpoint with fixtures.
@@ -1680,7 +1680,7 @@ How aimock compares
| Multi-provider support |
- 12 providers ✓ |
+ 13 providers ✓ |
manual |
12 providers |
OpenAI only |
@@ -1732,6 +1732,15 @@ How aimock compares
✗ |
✗ |
+
+ | Non-speech audio |
+ Built-in ✓ |
+ ✗ |
+ ✗ |
+ ✗ |
+ ✗ |
+ ✗ |
+
| Video generation |
Built-in ✓ |
diff --git a/scripts/update-competitive-matrix.ts b/scripts/update-competitive-matrix.ts
index 0f80e6d..6283b40 100644
--- a/scripts/update-competitive-matrix.ts
+++ b/scripts/update-competitive-matrix.ts
@@ -91,6 +91,18 @@ const FEATURE_RULES: FeatureRule[] = [
"transcription api",
],
},
+ {
+ rowLabel: "Non-speech audio",
+ keywords: [
+ "sound-generation",
+ "sound effect",
+ "music generation",
+ "elevenlabs",
+ "fal.ai",
+ "audio generation",
+ "non-speech audio",
+ ],
+ },
{
rowLabel: "Video generation",
keywords: ["sora", "/v1/videos", "video generation", "generate.*video"],
@@ -236,6 +248,7 @@ function countProviders(text: string): number {
["openai"],
["claude", "anthropic"],
["gemini", "google.*ai"],
+ ["gemini.*interactions"],
["bedrock", "aws"],
["azure"],
["vertex"],
diff --git a/src/__tests__/competitive-matrix.test.ts b/src/__tests__/competitive-matrix.test.ts
index bbbcd3d..9a9367e 100644
--- a/src/__tests__/competitive-matrix.test.ts
+++ b/src/__tests__/competitive-matrix.test.ts
@@ -56,6 +56,18 @@ const FEATURE_RULES: FeatureRule[] = [
rowLabel: "Image generation",
keywords: ["dall-e", "dalle", "/v1/images", "image generation", "imagen", "generate.*image"],
},
+ {
+ rowLabel: "Non-speech audio",
+ keywords: [
+ "sound-generation",
+ "sound effect",
+ "music generation",
+ "elevenlabs",
+ "fal.ai",
+ "audio generation",
+ "non-speech audio",
+ ],
+ },
{
rowLabel: "Video generation",
keywords: ["sora", "/v1/videos", "video generation", "generate.*video"],
diff --git a/src/__tests__/elevenlabs-audio.test.ts b/src/__tests__/elevenlabs-audio.test.ts
new file mode 100644
index 0000000..c76ae85
--- /dev/null
+++ b/src/__tests__/elevenlabs-audio.test.ts
@@ -0,0 +1,229 @@
+import { describe, test, expect, afterEach } from "vitest";
+import { LLMock } from "../llmock.js";
+
+describe("ElevenLabs sound generation", () => {
+ let mock: LLMock;
+
+ afterEach(async () => {
+ await mock?.stop();
+ });
+
+ test("sound generation with string-form audio returns binary", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "castle door opening", endpoint: "audio-gen" },
+ response: { audio: "SGVsbG8=", format: "mp3" },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1/sound-generation`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", Authorization: "Bearer test" },
+ body: JSON.stringify({ text: "castle door opening" }),
+ });
+
+ expect(res.status).toBe(200);
+ expect(res.headers.get("content-type")).toBe("audio/mpeg");
+ const buffer = await res.arrayBuffer();
+ expect(buffer.byteLength).toBeGreaterThan(0);
+ // "SGVsbG8=" decodes to "Hello" (5 bytes)
+ expect(buffer.byteLength).toBe(5);
+ });
+
+ test("sound generation with object-form audio", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "explosion", endpoint: "audio-gen" },
+ response: { audio: { b64Json: "SGVsbG8=", contentType: "audio/wav" } },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1/sound-generation`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", Authorization: "Bearer test" },
+ body: JSON.stringify({ text: "explosion" }),
+ });
+
+ expect(res.status).toBe(200);
+ expect(res.headers.get("content-type")).toBe("audio/wav");
+ const buffer = await res.arrayBuffer();
+ expect(buffer.byteLength).toBe(5);
+ });
+
+ test("missing text field returns 400", async () => {
+ mock = new LLMock({ port: 0 });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1/sound-generation`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", Authorization: "Bearer test" },
+ body: JSON.stringify({}),
+ });
+
+ expect(res.status).toBe(400);
+ const data = await res.json();
+ expect(data.error.message).toContain("text");
+ });
+
+ test("no matching fixture returns 404", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "specific sound", endpoint: "audio-gen" },
+ response: { audio: "SGVsbG8=" },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1/sound-generation`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", Authorization: "Bearer test" },
+ body: JSON.stringify({ text: "completely different sound" }),
+ });
+
+ expect(res.status).toBe(404);
+ });
+
+ test("error fixture returns error status", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "rate limited", endpoint: "audio-gen" },
+ response: { error: { message: "rate limit", type: "rate_limit_error" }, status: 429 },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1/sound-generation`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", Authorization: "Bearer test" },
+ body: JSON.stringify({ text: "rate limited" }),
+ });
+
+ expect(res.status).toBe(429);
+ const data = await res.json();
+ expect(data.error.message).toBe("rate limit");
+ });
+});
+
+describe("ElevenLabs music", () => {
+ let mock: LLMock;
+
+ afterEach(async () => {
+ await mock?.stop();
+ });
+
+ test("music compose returns binary audio with song-id header", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "upbeat piano", endpoint: "audio-gen" },
+ response: { audio: "SGVsbG8=", format: "mp3" },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1/music`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", Authorization: "Bearer test" },
+ body: JSON.stringify({ prompt: "upbeat piano" }),
+ });
+
+ expect(res.status).toBe(200);
+ expect(res.headers.get("content-type")).toBe("audio/mpeg");
+ expect(res.headers.get("song-id")).toBeTruthy();
+ expect(res.headers.get("song-id")).toMatch(/^mock-song-/);
+ const buffer = await res.arrayBuffer();
+ expect(buffer.byteLength).toBe(5);
+ });
+
+ test("music stream returns binary audio", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "ambient drone", endpoint: "audio-gen" },
+ response: { audio: "SGVsbG8=" },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1/music/stream`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", Authorization: "Bearer test" },
+ body: JSON.stringify({ prompt: "ambient drone" }),
+ });
+
+ expect(res.status).toBe(200);
+ const buffer = await res.arrayBuffer();
+ expect(buffer.byteLength).toBe(5);
+ });
+
+ test("music plan returns JSON text", async () => {
+ mock = new LLMock({ port: 0 });
+ const compositionPlan = JSON.stringify({ sections: ["intro", "verse", "chorus"] });
+ mock.addFixture({
+ match: { userMessage: "jazz song", endpoint: "audio-gen" },
+ response: { content: compositionPlan },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1/music/plan`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", Authorization: "Bearer test" },
+ body: JSON.stringify({ prompt: "jazz song" }),
+ });
+
+ expect(res.status).toBe(200);
+ expect(res.headers.get("content-type")).toBe("application/json");
+ const data = await res.json();
+ expect(data.sections).toEqual(["intro", "verse", "chorus"]);
+ });
+
+ test("missing prompt returns 400 for music", async () => {
+ mock = new LLMock({ port: 0 });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1/music`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", Authorization: "Bearer test" },
+ body: JSON.stringify({}),
+ });
+
+ expect(res.status).toBe(400);
+ const data = await res.json();
+ expect(data.error.message).toContain("prompt");
+ });
+});
+
+describe("ElevenLabs convenience methods", () => {
+ let mock: LLMock;
+
+ afterEach(async () => {
+ await mock?.stop();
+ });
+
+ test("onSoundEffect creates fixture with correct endpoint", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onSoundEffect("door", { audio: "SGVsbG8=" });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1/sound-generation`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", Authorization: "Bearer test" },
+ body: JSON.stringify({ text: "door" }),
+ });
+
+ expect(res.status).toBe(200);
+ const buffer = await res.arrayBuffer();
+ expect(buffer.byteLength).toBe(5);
+ });
+
+ test("onMusic creates fixture with correct endpoint", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onMusic("piano", { audio: "SGVsbG8=" });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1/music`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", Authorization: "Bearer test" },
+ body: JSON.stringify({ prompt: "piano" }),
+ });
+
+ expect(res.status).toBe(200);
+ expect(res.headers.get("song-id")).toBeTruthy();
+ const buffer = await res.arrayBuffer();
+ expect(buffer.byteLength).toBe(5);
+ });
+});
diff --git a/src/__tests__/fal-audio.test.ts b/src/__tests__/fal-audio.test.ts
new file mode 100644
index 0000000..cc33aeb
--- /dev/null
+++ b/src/__tests__/fal-audio.test.ts
@@ -0,0 +1,272 @@
+import { describe, test, expect, afterEach } from "vitest";
+import { LLMock } from "../llmock.js";
+
+describe("fal.ai audio queue", () => {
+ let mock: LLMock;
+
+ afterEach(async () => {
+ await mock?.stop();
+ });
+
+ test("queue submit returns queue envelope", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onFalAudio("drum loop", { audio: "SGVsbG8=", format: "mp3" });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ prompt: "drum loop" }),
+ });
+ expect(res.status).toBe(200);
+ const data = await res.json();
+ expect(data.request_id).toBeDefined();
+ expect(typeof data.request_id).toBe("string");
+ expect(data.response_url).toContain(data.request_id);
+ expect(data.status_url).toContain(data.request_id);
+ expect(data.cancel_url).toContain(data.request_id);
+ expect(data.queue_position).toBe(0);
+ });
+
+ test("queue status returns COMPLETED", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onFalAudio("drum loop", { audio: "SGVsbG8=", format: "mp3" });
+ await mock.start();
+
+ // Submit first
+ const submit = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ prompt: "drum loop" }),
+ });
+ const envelope = await submit.json();
+
+ // Check status
+ const status = await fetch(`${mock.url}/fal/queue/requests/${envelope.request_id}/status`);
+ expect(status.status).toBe(200);
+ const statusData = await status.json();
+ expect(statusData.status).toBe("COMPLETED");
+ expect(statusData.request_id).toBe(envelope.request_id);
+ expect(statusData.response_url).toBeDefined();
+ });
+
+ test("queue result returns audio file object", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onFalAudio("drum loop", { audio: "SGVsbG8=", format: "mp3" });
+ await mock.start();
+
+ // Submit
+ const submit = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ prompt: "drum loop" }),
+ });
+ const envelope = await submit.json();
+
+ // Get result
+ const result = await fetch(`${mock.url}/fal/queue/requests/${envelope.request_id}`);
+ expect(result.status).toBe(200);
+ const data = await result.json();
+ expect(data.audio).toBeDefined();
+ expect(data.audio.url).toContain("generated_audio.mp3");
+ expect(data.audio.content_type).toBe("audio/mpeg");
+ expect(data.audio.file_name).toBe("generated_audio.mp3");
+ expect(typeof data.audio.file_size).toBe("number");
+ expect(data.audio.file_size).toBeGreaterThan(0);
+ });
+
+ test("full queue lifecycle: submit -> status -> result", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onFalAudio("synth pad", { audio: "AAAA", format: "wav" });
+ await mock.start();
+
+ // Submit
+ const submit = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ prompt: "synth pad" }),
+ });
+ expect(submit.status).toBe(200);
+ const envelope = await submit.json();
+ const requestId = envelope.request_id;
+
+ // Status
+ const status = await fetch(`${mock.url}/fal/queue/requests/${requestId}/status`);
+ expect(status.status).toBe(200);
+ const statusData = await status.json();
+ expect(statusData.status).toBe("COMPLETED");
+
+ // Result
+ const result = await fetch(`${mock.url}/fal/queue/requests/${requestId}`);
+ expect(result.status).toBe(200);
+ const resultData = await result.json();
+ expect(resultData.audio.url).toContain("generated_audio.wav");
+ expect(resultData.audio.content_type).toBe("audio/wav");
+ });
+
+ test("synchronous run returns result directly", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onFalAudio("drum loop", { audio: "SGVsbG8=", format: "mp3" });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/fal/run/fal-ai/stable-audio`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ prompt: "drum loop" }),
+ });
+ expect(res.status).toBe(200);
+ const data = await res.json();
+ // Synchronous run returns result directly, no queue envelope
+ expect(data.audio).toBeDefined();
+ expect(data.audio.url).toContain("generated_audio.mp3");
+ expect(data.audio.content_type).toBe("audio/mpeg");
+ expect(data.request_id).toBeUndefined(); // no queue envelope
+ });
+
+ test("cancel returns ALREADY_COMPLETED", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onFalAudio("drum loop", { audio: "SGVsbG8=" });
+ await mock.start();
+
+ // Submit
+ const submit = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ prompt: "drum loop" }),
+ });
+ const envelope = await submit.json();
+
+ // Cancel
+ const cancel = await fetch(`${mock.url}/fal/queue/requests/${envelope.request_id}/cancel`, {
+ method: "PUT",
+ });
+ expect(cancel.status).toBe(400);
+ const cancelData = await cancel.json();
+ expect(cancelData.status).toBe("ALREADY_COMPLETED");
+ });
+
+ test("unknown request_id returns 404", async () => {
+ mock = new LLMock({ port: 0 });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/fal/queue/requests/nonexistent/status`);
+ expect(res.status).toBe(404);
+ });
+
+ test("object-form audio response with contentType", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onFalAudio("speech", {
+ audio: { b64Json: "SGVsbG8=", contentType: "audio/wav" },
+ });
+ await mock.start();
+
+ // Submit
+ const submit = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ prompt: "speech" }),
+ });
+ const envelope = await submit.json();
+
+ // Get result
+ const result = await fetch(`${mock.url}/fal/queue/requests/${envelope.request_id}`);
+ const data = await result.json();
+ expect(data.audio.content_type).toBe("audio/wav");
+ });
+
+ test("onFalAudio convenience method registers fixture correctly", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onFalAudio("loop", { audio: "SGVsbG8=" });
+ await mock.start();
+
+ const submit = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ prompt: "loop" }),
+ });
+ expect(submit.status).toBe(200);
+ const envelope = await submit.json();
+ expect(envelope.request_id).toBeDefined();
+
+ // Verify result is retrievable
+ const result = await fetch(`${mock.url}/fal/queue/requests/${envelope.request_id}`);
+ expect(result.status).toBe(200);
+ const data = await result.json();
+ expect(data.audio).toBeDefined();
+ });
+
+ test("no matching fixture returns 404", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onFalAudio("specific prompt", { audio: "SGVsbG8=" });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ prompt: "completely different" }),
+ });
+ expect(res.status).toBe(404);
+ const data = await res.json();
+ expect(data.error.message).toContain("No fixture matched");
+ });
+
+ test("error fixture returns error status", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "quota", endpoint: "fal-audio" },
+ response: { error: { message: "quota exceeded", type: "rate_limit" }, status: 429 },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({ prompt: "quota" }),
+ });
+ expect(res.status).toBe(429);
+ const data = await res.json();
+ expect(data.error.message).toBe("quota exceeded");
+ });
+
+ test("X-Test-Id isolation for fal queue jobs", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onFalAudio("drum", { audio: "SGVsbG8=" });
+ await mock.start();
+
+ // Submit with test-id A
+ const submitA = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", "X-Test-Id": "testA" },
+ body: JSON.stringify({ prompt: "drum" }),
+ });
+ const envelopeA = await submitA.json();
+
+ // Submit with test-id B
+ const submitB = await fetch(`${mock.url}/fal/queue/submit/fal-ai/stable-audio`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json", "X-Test-Id": "testB" },
+ body: JSON.stringify({ prompt: "drum" }),
+ });
+ const envelopeB = await submitB.json();
+
+ // A's request_id should not be visible to B
+ const crossLookup = await fetch(
+ `${mock.url}/fal/queue/requests/${envelopeA.request_id}/status`,
+ { headers: { "X-Test-Id": "testB" } },
+ );
+ expect(crossLookup.status).toBe(404);
+
+ // A's request_id should be visible to A
+ const sameLookup = await fetch(
+ `${mock.url}/fal/queue/requests/${envelopeA.request_id}/status`,
+ { headers: { "X-Test-Id": "testA" } },
+ );
+ expect(sameLookup.status).toBe(200);
+
+ // B's request_id should be visible to B
+ const bLookup = await fetch(`${mock.url}/fal/queue/requests/${envelopeB.request_id}/status`, {
+ headers: { "X-Test-Id": "testB" },
+ });
+ expect(bLookup.status).toBe(200);
+ });
+});
diff --git a/src/__tests__/gemini-audio-record.test.ts b/src/__tests__/gemini-audio-record.test.ts
new file mode 100644
index 0000000..880a465
--- /dev/null
+++ b/src/__tests__/gemini-audio-record.test.ts
@@ -0,0 +1,391 @@
+import { describe, it, expect } from "vitest";
+import * as http from "node:http";
+import * as fs from "node:fs";
+import * as os from "node:os";
+import * as path from "node:path";
+import { proxyAndRecord } from "../recorder.js";
+import type { Fixture, RecordConfig, ChatCompletionRequest } from "../types.js";
+import { Logger } from "../logger.js";
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+function createUpstream(
+ handler: (req: http.IncomingMessage, res: http.ServerResponse) => void,
+): Promise<{ server: http.Server; url: string }> {
+ return new Promise((resolve) => {
+ const server = http.createServer(handler);
+ server.listen(0, "127.0.0.1", () => {
+ const addr = server.address() as { port: number };
+ resolve({ server, url: `http://127.0.0.1:${addr.port}` });
+ });
+ });
+}
+
+function closeServer(server: http.Server): Promise<void> {
+ return new Promise((resolve) => server.close(() => resolve()));
+}
+
+function createMockReqRes(
+ urlPath: string,
+  headers: Record<string, string> = {},
+): { req: http.IncomingMessage; res: http.ServerResponse; getResponse: () => Promise<string> } {
+ const chunks: Buffer[] = [];
+ let statusCode = 200;
+
+ const req = {
+ method: "POST",
+ url: urlPath,
+ headers: { "content-type": "application/json", ...headers },
+ } as unknown as http.IncomingMessage;
+
+ const res = {
+ statusCode,
+ headersSent: false,
+ destroyed: false,
+ writableEnded: false,
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
+    writeHead(status: number, hdrs?: Record<string, string>) {
+ statusCode = status;
+ res.statusCode = status;
+ (res as unknown as { headersSent: boolean }).headersSent = true;
+ },
+ write(data: string | Buffer) {
+ chunks.push(Buffer.isBuffer(data) ? data : Buffer.from(data));
+ return true;
+ },
+ end(data?: string | Buffer) {
+ if (data) chunks.push(Buffer.isBuffer(data) ? data : Buffer.from(data));
+ (res as unknown as { writableEnded: boolean }).writableEnded = true;
+ },
+ setHeader() {},
+ flushHeaders() {},
+ on() {
+ return res;
+ },
+ } as unknown as http.ServerResponse;
+
+ return {
+ req,
+ res,
+ getResponse: async () => Buffer.concat(chunks).toString(),
+ };
+}
+
+function makeTmpDir(): string {
+ return fs.mkdtempSync(path.join(os.tmpdir(), "aimock-gemini-audio-"));
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+describe("Gemini audio recording: non-streaming JSON", () => {
+ it("records non-streaming Gemini audio response as AudioResponse", async () => {
+ const fixturePath = makeTmpDir();
+ const { server, url } = await createUpstream((_req, res) => {
+ res.writeHead(200, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ candidates: [
+ {
+ content: {
+ role: "model",
+ parts: [{ inlineData: { mimeType: "audio/mp3", data: "SGVsbG8=" } }],
+ },
+ finishReason: "STOP",
+ },
+ ],
+ }),
+ );
+ });
+
+ try {
+ const fixtures: Fixture[] = [];
+ const record: RecordConfig = { providers: { gemini: url }, fixturePath };
+ const logger = new Logger("silent");
+ const request: ChatCompletionRequest = {
+ model: "gemini-2.0-flash",
+ messages: [{ role: "user", content: "say hello" }],
+ };
+
+ const { req, res } = createMockReqRes("/v1beta/models/gemini-2.0-flash:generateContent");
+ await proxyAndRecord(
+ req,
+ res,
+ request,
+ "gemini",
+ "/v1beta/models/gemini-2.0-flash:generateContent",
+ fixtures,
+ { record, logger },
+ );
+
+ expect(fixtures).toHaveLength(1);
+ const response = fixtures[0].response as {
+ audio?: { b64Json: string; contentType?: string };
+ };
+ expect(response.audio).toBeDefined();
+ expect(response.audio!.b64Json).toBe("SGVsbG8=");
+ expect(response.audio!.contentType).toBe("audio/mp3");
+ } finally {
+ await closeServer(server);
+ fs.rmSync(fixturePath, { recursive: true, force: true });
+ }
+ });
+});
+
+describe("Gemini audio recording: streaming SSE", () => {
+ it("records streaming Gemini SSE audio response as AudioResponse", async () => {
+ const fixturePath = makeTmpDir();
+ const chunk1 = JSON.stringify({
+ candidates: [
+ {
+ content: {
+ parts: [{ inlineData: { mimeType: "audio/mp3", data: "AAAA" } }],
+ },
+ },
+ ],
+ });
+ const chunk2 = JSON.stringify({
+ candidates: [
+ {
+ content: {
+ parts: [{ inlineData: { mimeType: "audio/mp3", data: "BBBB" } }],
+ },
+ },
+ ],
+ });
+ const sseBody = `data: ${chunk1}\n\ndata: ${chunk2}\n\n`;
+
+ const { server, url } = await createUpstream((_req, res) => {
+ res.writeHead(200, { "Content-Type": "text/event-stream" });
+ res.end(sseBody);
+ });
+
+ try {
+ const fixtures: Fixture[] = [];
+ const record: RecordConfig = { providers: { gemini: url }, fixturePath };
+ const logger = new Logger("silent");
+ const request: ChatCompletionRequest = {
+ model: "gemini-2.0-flash",
+ messages: [{ role: "user", content: "stream audio" }],
+ stream: true,
+ };
+
+ const { req, res } = createMockReqRes(
+ "/v1beta/models/gemini-2.0-flash:streamGenerateContent",
+ );
+ await proxyAndRecord(
+ req,
+ res,
+ request,
+ "gemini",
+ "/v1beta/models/gemini-2.0-flash:streamGenerateContent",
+ fixtures,
+ { record, logger },
+ );
+
+ expect(fixtures).toHaveLength(1);
+ const response = fixtures[0].response as {
+ audio?: { b64Json: string; contentType?: string };
+ };
+ expect(response.audio).toBeDefined();
+ expect(response.audio!.b64Json).toBe("AAAABBBB");
+ expect(response.audio!.contentType).toBe("audio/mp3");
+ } finally {
+ await closeServer(server);
+ fs.rmSync(fixturePath, { recursive: true, force: true });
+ }
+ });
+});
+
+describe("Gemini audio recording: audio priority over text", () => {
+ it("audio parts take priority over text parts in non-streaming", async () => {
+ const fixturePath = makeTmpDir();
+ const { server, url } = await createUpstream((_req, res) => {
+ res.writeHead(200, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ candidates: [
+ {
+ content: {
+ role: "model",
+ parts: [
+ { text: "Here is the audio" },
+ { inlineData: { mimeType: "audio/wav", data: "UklGRg==" } },
+ ],
+ },
+ finishReason: "STOP",
+ },
+ ],
+ }),
+ );
+ });
+
+ try {
+ const fixtures: Fixture[] = [];
+ const record: RecordConfig = { providers: { gemini: url }, fixturePath };
+ const logger = new Logger("silent");
+ const request: ChatCompletionRequest = {
+ model: "gemini-2.0-flash",
+ messages: [{ role: "user", content: "audio with text" }],
+ };
+
+ const { req, res } = createMockReqRes("/v1beta/models/gemini-2.0-flash:generateContent");
+ await proxyAndRecord(
+ req,
+ res,
+ request,
+ "gemini",
+ "/v1beta/models/gemini-2.0-flash:generateContent",
+ fixtures,
+ { record, logger },
+ );
+
+ expect(fixtures).toHaveLength(1);
+ const response = fixtures[0].response as {
+ audio?: { b64Json: string; contentType?: string };
+ content?: string;
+ };
+ // Audio should take priority — no content field
+ expect(response.audio).toBeDefined();
+ expect(response.audio!.b64Json).toBe("UklGRg==");
+ expect(response.audio!.contentType).toBe("audio/wav");
+ expect(response.content).toBeUndefined();
+ } finally {
+ await closeServer(server);
+ fs.rmSync(fixturePath, { recursive: true, force: true });
+ }
+ });
+});
+
+describe("Gemini audio recording: replay after record", () => {
+ it("recorded fixture matches on replay", async () => {
+ const fixturePath = makeTmpDir();
+ const { server, url } = await createUpstream((_req, res) => {
+ res.writeHead(200, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ candidates: [
+ {
+ content: {
+ role: "model",
+ parts: [{ inlineData: { mimeType: "audio/mp3", data: "dGVzdA==" } }],
+ },
+ finishReason: "STOP",
+ },
+ ],
+ }),
+ );
+ });
+
+ try {
+ const fixtures: Fixture[] = [];
+ const record: RecordConfig = { providers: { gemini: url }, fixturePath };
+ const logger = new Logger("silent");
+ const request: ChatCompletionRequest = {
+ model: "gemini-2.0-flash",
+ messages: [{ role: "user", content: "replay test" }],
+ };
+
+ // First call: record
+ const { req: req1, res: res1 } = createMockReqRes(
+ "/v1beta/models/gemini-2.0-flash:generateContent",
+ );
+ await proxyAndRecord(
+ req1,
+ res1,
+ request,
+ "gemini",
+ "/v1beta/models/gemini-2.0-flash:generateContent",
+ fixtures,
+ { record, logger },
+ );
+
+ expect(fixtures).toHaveLength(1);
+ const recorded = fixtures[0];
+ expect(recorded.match.userMessage).toBe("replay test");
+
+ // The fixture is now in memory — verify its shape is correct for replay
+ const response = recorded.response as {
+ audio?: { b64Json: string; contentType?: string };
+ };
+ expect(response.audio).toBeDefined();
+ expect(response.audio!.b64Json).toBe("dGVzdA==");
+ expect(response.audio!.contentType).toBe("audio/mp3");
+
+ // Verify fixture was written to disk
+ const files = fs.readdirSync(fixturePath).filter((f) => f.endsWith(".json"));
+ expect(files.length).toBeGreaterThanOrEqual(1);
+
+ const diskFixture = JSON.parse(
+ fs.readFileSync(path.join(fixturePath, files[0]), "utf-8"),
+ ) as { fixtures: Array<{ response: { audio?: { b64Json: string; contentType?: string } } }> };
+ expect(diskFixture.fixtures[0].response.audio).toBeDefined();
+ expect(diskFixture.fixtures[0].response.audio!.b64Json).toBe("dGVzdA==");
+ } finally {
+ await closeServer(server);
+ fs.rmSync(fixturePath, { recursive: true, force: true });
+ }
+ });
+});
+
+describe("Gemini audio recording: non-audio inlineData is ignored", () => {
+ it("image inlineData does not produce AudioResponse", async () => {
+ const fixturePath = makeTmpDir();
+ const { server, url } = await createUpstream((_req, res) => {
+ res.writeHead(200, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ candidates: [
+ {
+ content: {
+ role: "model",
+ parts: [
+ { text: "Here is an image" },
+ { inlineData: { mimeType: "image/png", data: "iVBORw0KGgo=" } },
+ ],
+ },
+ finishReason: "STOP",
+ },
+ ],
+ }),
+ );
+ });
+
+ try {
+ const fixtures: Fixture[] = [];
+ const record: RecordConfig = { providers: { gemini: url }, fixturePath };
+ const logger = new Logger("silent");
+ const request: ChatCompletionRequest = {
+ model: "gemini-2.0-flash",
+ messages: [{ role: "user", content: "show image" }],
+ };
+
+ const { req, res } = createMockReqRes("/v1beta/models/gemini-2.0-flash:generateContent");
+ await proxyAndRecord(
+ req,
+ res,
+ request,
+ "gemini",
+ "/v1beta/models/gemini-2.0-flash:generateContent",
+ fixtures,
+ { record, logger },
+ );
+
+ expect(fixtures).toHaveLength(1);
+ const response = fixtures[0].response as {
+ audio?: unknown;
+ content?: string;
+ };
+ // Should NOT be an AudioResponse — image/png is not audio/
+ expect(response.audio).toBeUndefined();
+ // Should fall through to text extraction
+ expect(response.content).toBe("Here is an image");
+ } finally {
+ await closeServer(server);
+ fs.rmSync(fixturePath, { recursive: true, force: true });
+ }
+ });
+});
diff --git a/src/__tests__/gemini-audio.test.ts b/src/__tests__/gemini-audio.test.ts
new file mode 100644
index 0000000..8b5c03b
--- /dev/null
+++ b/src/__tests__/gemini-audio.test.ts
@@ -0,0 +1,176 @@
+import { describe, test, expect, afterEach } from "vitest";
+import { LLMock } from "../llmock.js";
+
+describe("Gemini audio responses", () => {
+ let mock: LLMock;
+
+ afterEach(async () => {
+ await mock?.stop();
+ });
+
+ test("non-streaming generateContent with string-form audio", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "piano loop" },
+ response: { audio: "SGVsbG8=", format: "mp3" },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1beta/models/lyria-3:generateContent`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ contents: [{ role: "user", parts: [{ text: "piano loop" }] }],
+ }),
+ });
+ expect(res.status).toBe(200);
+ const data = await res.json();
+ expect(data.candidates[0].content.parts[0].inlineData).toEqual({
+ mimeType: "audio/mpeg",
+ data: "SGVsbG8=",
+ });
+ expect(data.candidates[0].finishReason).toBe("STOP");
+ expect(data.usageMetadata).toBeDefined();
+ });
+
+ test("streaming streamGenerateContent with string-form audio", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "piano loop" },
+ response: { audio: "SGVsbG8=", format: "mp3" },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1beta/models/lyria-3:streamGenerateContent`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ contents: [{ role: "user", parts: [{ text: "piano loop" }] }],
+ }),
+ });
+ expect(res.status).toBe(200);
+ expect(res.headers.get("content-type")).toBe("text/event-stream");
+
+ const text = await res.text();
+ // Parse SSE data lines
+ const chunks = text
+ .split("\n\n")
+ .filter((line) => line.startsWith("data: "))
+ .map((line) => JSON.parse(line.replace("data: ", "")));
+
+ expect(chunks.length).toBeGreaterThanOrEqual(1);
+ const chunk = chunks[0];
+ expect(chunk.candidates[0].content.parts[0].inlineData).toEqual({
+ mimeType: "audio/mpeg",
+ data: "SGVsbG8=",
+ });
+ expect(chunk.candidates[0].finishReason).toBe("STOP");
+ });
+
+ test("non-streaming with object-form audio", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "wav audio" },
+ response: { audio: { b64Json: "SGVsbG8=", contentType: "audio/wav" } },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1beta/models/lyria-3:generateContent`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ contents: [{ role: "user", parts: [{ text: "wav audio" }] }],
+ }),
+ });
+ const data = await res.json();
+ expect(data.candidates[0].content.parts[0].inlineData).toEqual({
+ mimeType: "audio/wav",
+ data: "SGVsbG8=",
+ });
+ });
+
+ test("object-form audio without contentType defaults to audio/mpeg", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "default format" },
+ response: { audio: { b64Json: "SGVsbG8=" } },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1beta/models/lyria-3:generateContent`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ contents: [{ role: "user", parts: [{ text: "default format" }] }],
+ }),
+ });
+ const data = await res.json();
+ expect(data.candidates[0].content.parts[0].inlineData.mimeType).toBe("audio/mpeg");
+ });
+
+ test("string-form audio with format opus", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "opus audio" },
+ response: { audio: "SGVsbG8=", format: "opus" },
+ });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1beta/models/lyria-3:generateContent`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ contents: [{ role: "user", parts: [{ text: "opus audio" }] }],
+ }),
+ });
+ const data = await res.json();
+ expect(data.candidates[0].content.parts[0].inlineData.mimeType).toBe("audio/opus");
+ });
+
+ test("Vertex AI path works too", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.addFixture({
+ match: { userMessage: "piano loop" },
+ response: { audio: "SGVsbG8=", format: "mp3" },
+ });
+ await mock.start();
+
+ const res = await fetch(
+ `${mock.url}/v1/projects/proj/locations/us-central1/publishers/google/models/lyria-3:generateContent`,
+ {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ contents: [{ role: "user", parts: [{ text: "piano loop" }] }],
+ }),
+ },
+ );
+ expect(res.status).toBe(200);
+ const data = await res.json();
+ expect(data.candidates[0].content.parts[0].inlineData).toEqual({
+ mimeType: "audio/mpeg",
+ data: "SGVsbG8=",
+ });
+ });
+
+ test("onAudio() convenience method works via Gemini", async () => {
+ mock = new LLMock({ port: 0 });
+ mock.onAudio("piano loop", { audio: "SGVsbG8=" });
+ await mock.start();
+
+ const res = await fetch(`${mock.url}/v1beta/models/lyria-3:generateContent`, {
+ method: "POST",
+ headers: { "Content-Type": "application/json" },
+ body: JSON.stringify({
+ contents: [{ role: "user", parts: [{ text: "piano loop" }] }],
+ }),
+ });
+ expect(res.status).toBe(200);
+ const data = await res.json();
+ // onAudio without format defaults to mp3
+ expect(data.candidates[0].content.parts[0].inlineData).toEqual({
+ mimeType: "audio/mpeg",
+ data: "SGVsbG8=",
+ });
+ });
+});
diff --git a/src/__tests__/multimedia-types.test.ts b/src/__tests__/multimedia-types.test.ts
index 1217ba2..bf70328 100644
--- a/src/__tests__/multimedia-types.test.ts
+++ b/src/__tests__/multimedia-types.test.ts
@@ -26,11 +26,36 @@ describe("multimedia type guards", () => {
expect(isImageResponse(r)).toBe(false);
});
- test("isAudioResponse detects audio", () => {
+ test("isAudioResponse detects audio (string form)", () => {
const r: FixtureResponse = { audio: "AAAA", format: "mp3" };
expect(isAudioResponse(r)).toBe(true);
});
+ test("isAudioResponse detects audio (object form with contentType)", () => {
+ const r: FixtureResponse = { audio: { b64Json: "abc", contentType: "audio/mp3" } };
+ expect(isAudioResponse(r)).toBe(true);
+ });
+
+ test("isAudioResponse detects audio (object form without contentType)", () => {
+ const r: FixtureResponse = { audio: { b64Json: "abc" } };
+ expect(isAudioResponse(r)).toBe(true);
+ });
+
+ test("isAudioResponse accepts empty b64Json (validation is in fixture-loader)", () => {
+ const r: FixtureResponse = { audio: { b64Json: "" } };
+ expect(isAudioResponse(r)).toBe(true);
+ });
+
+ test("isAudioResponse rejects numeric audio", () => {
+ const r = { audio: 123 } as unknown as FixtureResponse;
+ expect(isAudioResponse(r)).toBe(false);
+ });
+
+ test("isAudioResponse rejects object without b64Json", () => {
+ const r = { audio: { foo: "bar" } } as unknown as FixtureResponse;
+ expect(isAudioResponse(r)).toBe(false);
+ });
+
test("isAudioResponse rejects text response", () => {
const r: FixtureResponse = { content: "hello" };
expect(isAudioResponse(r)).toBe(false);
diff --git a/src/__tests__/ws-gemini-live-audio.test.ts b/src/__tests__/ws-gemini-live-audio.test.ts
new file mode 100644
index 0000000..669a198
--- /dev/null
+++ b/src/__tests__/ws-gemini-live-audio.test.ts
@@ -0,0 +1,188 @@
+import { describe, it, expect, afterEach } from "vitest";
+import { createServer, type ServerInstance } from "../server.js";
+import type { Fixture } from "../types.js";
+import { connectWebSocket } from "./ws-test-client.js";
+
+// --- helpers ---
+
+const GEMINI_WS_PATH =
+ "/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent";
+
+function setupMsg(model = "gemini-2.0-flash-exp"): string {
+ return JSON.stringify({
+ setup: { model },
+ });
+}
+
+function clientContentMsg(text: string): string {
+ return JSON.stringify({
+ clientContent: {
+ turns: [{ role: "user", parts: [{ text }] }],
+ turnComplete: true,
+ },
+ });
+}
+
+// --- tests ---
+
+let instance: ServerInstance | null = null;
+
+afterEach(async () => {
+ if (instance) {
+    await new Promise<void>((resolve) => {
+ instance!.server.close(() => resolve());
+ });
+ instance = null;
+ }
+});
+
+describe("WebSocket Gemini Live — audio responses", () => {
+ it("returns audio inlineData for string-form AudioResponse", async () => {
+ const audioFixture: Fixture = {
+ match: { userMessage: "play-audio-string" },
+ response: { audio: "SGVsbG8=", format: "mp3" },
+ };
+ instance = await createServer([audioFixture]);
+ const ws = await connectWebSocket(instance.url, GEMINI_WS_PATH);
+
+ ws.send(setupMsg());
+ await ws.waitForMessages(1); // setupComplete
+
+ ws.send(clientContentMsg("play-audio-string"));
+
+ const raw = await ws.waitForMessages(2); // setupComplete + audio response
+ const msg = JSON.parse(raw[1]);
+
+ expect(msg.serverContent).toBeDefined();
+ expect(msg.serverContent.modelTurn.parts).toHaveLength(1);
+ expect(msg.serverContent.modelTurn.parts[0].inlineData).toEqual({
+ mimeType: "audio/mpeg",
+ data: "SGVsbG8=",
+ });
+ expect(msg.serverContent.turnComplete).toBe(true);
+
+ ws.close();
+ });
+
+ it("returns audio inlineData for object-form AudioResponse with contentType", async () => {
+ const audioFixture: Fixture = {
+ match: { userMessage: "play-audio-object" },
+ response: { audio: { b64Json: "SGVsbG8=", contentType: "audio/wav" } },
+ };
+ instance = await createServer([audioFixture]);
+ const ws = await connectWebSocket(instance.url, GEMINI_WS_PATH);
+
+ ws.send(setupMsg());
+ await ws.waitForMessages(1); // setupComplete
+
+ ws.send(clientContentMsg("play-audio-object"));
+
+ const raw = await ws.waitForMessages(2);
+ const msg = JSON.parse(raw[1]);
+
+ expect(msg.serverContent).toBeDefined();
+ expect(msg.serverContent.modelTurn.parts[0].inlineData).toEqual({
+ mimeType: "audio/wav",
+ data: "SGVsbG8=",
+ });
+ expect(msg.serverContent.turnComplete).toBe(true);
+
+ ws.close();
+ });
+
+ it("defaults to audio/mpeg when object-form AudioResponse omits contentType", async () => {
+ const audioFixture: Fixture = {
+ match: { userMessage: "play-audio-no-ct" },
+ response: { audio: { b64Json: "SGVsbG8=" } },
+ };
+ instance = await createServer([audioFixture]);
+ const ws = await connectWebSocket(instance.url, GEMINI_WS_PATH);
+
+ ws.send(setupMsg());
+ await ws.waitForMessages(1); // setupComplete
+
+ ws.send(clientContentMsg("play-audio-no-ct"));
+
+ const raw = await ws.waitForMessages(2);
+ const msg = JSON.parse(raw[1]);
+
+ expect(msg.serverContent.modelTurn.parts[0].inlineData.mimeType).toBe("audio/mpeg");
+ expect(msg.serverContent.modelTurn.parts[0].inlineData.data).toBe("SGVsbG8=");
+ expect(msg.serverContent.turnComplete).toBe(true);
+
+ ws.close();
+ });
+
+ it("sends audio as a single frame with turnComplete: true (not chunked)", async () => {
+ const audioFixture: Fixture = {
+ match: { userMessage: "play-audio-single" },
+ response: { audio: "QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVo=", format: "opus" },
+ };
+ instance = await createServer([audioFixture]);
+ const ws = await connectWebSocket(instance.url, GEMINI_WS_PATH);
+
+ ws.send(setupMsg());
+ await ws.waitForMessages(1); // setupComplete
+
+ ws.send(clientContentMsg("play-audio-single"));
+
+ const raw = await ws.waitForMessages(2); // setupComplete + exactly 1 audio frame
+ // Only 2 messages total — setupComplete and the single audio response
+ expect(raw).toHaveLength(2);
+
+ const msg = JSON.parse(raw[1]);
+ expect(msg.serverContent).toBeDefined();
+ expect(msg.serverContent.turnComplete).toBe(true);
+ expect(msg.serverContent.modelTurn.parts[0].inlineData).toEqual({
+ mimeType: "audio/opus",
+ data: "QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVo=",
+ });
+
+    // Give any spurious extra frame a moment to arrive before closing (none is asserted; the length check above covers single-frame behavior)
+ await new Promise((r) => setTimeout(r, 100));
+
+ ws.close();
+ });
+
+ it("client-sent inlineData parts do not crash the handler", async () => {
+ // Fixture matches on the text part, ignoring the inlineData part
+ const textFixture: Fixture = {
+ match: { userMessage: "transcribe-this" },
+ response: { content: "I heard your audio" },
+ };
+ instance = await createServer([textFixture]);
+ const ws = await connectWebSocket(instance.url, GEMINI_WS_PATH);
+
+ ws.send(setupMsg());
+ await ws.waitForMessages(1); // setupComplete
+
+ // Send clientContent with both an inlineData part (simulating audio input)
+ // and a text part for fixture matching
+ ws.send(
+ JSON.stringify({
+ clientContent: {
+ turns: [
+ {
+ role: "user",
+ parts: [
+ { inlineData: { mimeType: "audio/pcm", data: "dGVzdC1hdWRpbw==" } },
+ { text: "transcribe-this" },
+ ],
+ },
+ ],
+ turnComplete: true,
+ },
+ }),
+ );
+
+ const raw = await ws.waitForMessages(2);
+ const msg = JSON.parse(raw[1]);
+
+ // The handler should process the text part and return a text response
+ expect(msg.serverContent).toBeDefined();
+ expect(msg.serverContent.modelTurn.parts[0].text).toBe("I heard your audio");
+ expect(msg.serverContent.turnComplete).toBe(true);
+
+ ws.close();
+ });
+});
diff --git a/src/elevenlabs-audio.ts b/src/elevenlabs-audio.ts
new file mode 100644
index 0000000..49710fd
--- /dev/null
+++ b/src/elevenlabs-audio.ts
@@ -0,0 +1,268 @@
+import type http from "node:http";
+import type { ChatCompletionRequest, Fixture, HandlerDefaults } from "./types.js";
+import {
+ isAudioResponse,
+ isTextResponse,
+ isErrorResponse,
+ FORMAT_TO_CONTENT_TYPE,
+ getTestId,
+} from "./helpers.js";
+import { matchFixture } from "./router.js";
+import { writeErrorResponse } from "./sse-writer.js";
+import { proxyAndRecord } from "./recorder.js";
+import type { Journal } from "./journal.js";
+
+export async function handleElevenLabsAudio(
+ req: http.IncomingMessage,
+ res: http.ServerResponse,
+ body: string,
+ fixtures: Fixture[],
+ defaults: HandlerDefaults,
+ journal: Journal,
+ subType: string, // "sound-generation" | "music" | "stream" | "plan"
+): Promise<void> {
+ const path = req.url ?? "/v1/sound-generation";
+ const method = req.method ?? "POST";
+
+ // Parse JSON body
+  let parsed: Record<string, unknown>;
+ try {
+    parsed = JSON.parse(body) as Record<string, unknown>;
+ } catch {
+ journal.add({
+ method,
+ path,
+ headers: {},
+ body: null,
+ response: { status: 400, fixture: null },
+ });
+ writeErrorResponse(
+ res,
+ 400,
+ JSON.stringify({
+ error: { message: "Malformed JSON", type: "invalid_request_error", code: "invalid_json" },
+ }),
+ );
+ return;
+ }
+
+ // Extract prompt text based on subType
+ let promptText: string | undefined;
+ if (subType === "sound-generation") {
+ if (typeof parsed.text === "string" && parsed.text) {
+ promptText = parsed.text;
+ }
+ } else {
+ // music, music-stream, music-plan all use "prompt" (or composition_plan fallback)
+ if (typeof parsed.prompt === "string" && parsed.prompt) {
+ promptText = parsed.prompt;
+ } else if (parsed.composition_plan != null) {
+ promptText =
+ typeof parsed.composition_plan === "string"
+ ? parsed.composition_plan
+ : JSON.stringify(parsed.composition_plan);
+ }
+ }
+
+ // Build synthetic ChatCompletionRequest for fixture matching (needed for journal even on validation failure)
+ const syntheticReq: ChatCompletionRequest = {
+ model:
+ (parsed.model_id as string) ??
+ (subType === "sound-generation" ? "eleven_text_to_sound_v2" : "music_v1"),
+ messages: [{ role: "user", content: promptText ?? "" }],
+ _endpointType: "audio-gen",
+ };
+
+ // Validate required field
+ if (!promptText) {
+ const field = subType === "sound-generation" ? "text" : "prompt";
+ journal.add({
+ method,
+ path,
+ headers: {},
+ body: syntheticReq,
+ response: { status: 400, fixture: null },
+ });
+ writeErrorResponse(
+ res,
+ 400,
+ JSON.stringify({
+ error: {
+ message: `Missing required parameter: '${field}'`,
+ type: "invalid_request_error",
+ },
+ }),
+ );
+ return;
+ }
+
+ // Match fixture
+ const testId = getTestId(req);
+ const matchCounts = journal.getFixtureMatchCountsForTest(testId);
+ const fixture = matchFixture(fixtures, syntheticReq, matchCounts, defaults.requestTransform);
+
+ if (fixture) {
+ journal.incrementFixtureMatchCount(fixture, fixtures, testId);
+ }
+
+ // No fixture match
+ if (!fixture) {
+ if (defaults.record) {
+ const proxied = await proxyAndRecord(
+ req,
+ res,
+ syntheticReq,
+ "elevenlabs",
+ req.url ?? "/v1/sound-generation",
+ fixtures,
+ defaults,
+ body,
+ );
+ if (proxied) {
+ journal.add({
+ method,
+ path,
+ headers: {},
+ body: syntheticReq,
+ response: { status: res.statusCode ?? 200, fixture: null },
+ });
+ return;
+ }
+ }
+
+ const strictStatus = defaults.strict ? 503 : 404;
+ const strictMessage = defaults.strict
+ ? "Strict mode: no fixture matched"
+ : "No fixture matched";
+ journal.add({
+ method,
+ path,
+ headers: {},
+ body: syntheticReq,
+ response: { status: strictStatus, fixture: null },
+ });
+ writeErrorResponse(
+ res,
+ strictStatus,
+ JSON.stringify({
+ error: { message: strictMessage, type: "invalid_request_error", code: "no_fixture_match" },
+ }),
+ );
+ return;
+ }
+
+ const response = fixture.response;
+
+ // Error fixture
+ if (isErrorResponse(response)) {
+ const status = response.status ?? 500;
+ journal.add({
+ method,
+ path,
+ headers: {},
+ body: syntheticReq,
+ response: { status, fixture },
+ });
+ writeErrorResponse(res, status, JSON.stringify(response));
+ return;
+ }
+
+ // plan returns JSON text, not audio
+ if (subType === "plan") {
+ if (!isTextResponse(response)) {
+ journal.add({
+ method,
+ path,
+ headers: {},
+ body: syntheticReq,
+ response: { status: 500, fixture },
+ });
+ writeErrorResponse(
+ res,
+ 500,
+ JSON.stringify({
+ error: {
+ message: "Fixture response is not a text type for plan endpoint",
+ type: "server_error",
+ },
+ }),
+ );
+ return;
+ }
+ journal.add({
+ method,
+ path,
+ headers: {},
+ body: syntheticReq,
+ response: { status: 200, fixture },
+ });
+ res.writeHead(200, { "Content-Type": "application/json" });
+ res.end(response.content);
+ return;
+ }
+
+ // All other subTypes expect audio
+ if (!isAudioResponse(response)) {
+ journal.add({
+ method,
+ path,
+ headers: {},
+ body: syntheticReq,
+ response: { status: 500, fixture },
+ });
+ writeErrorResponse(
+ res,
+ 500,
+ JSON.stringify({
+ error: { message: "Fixture response is not an audio type", type: "server_error" },
+ }),
+ );
+ return;
+ }
+
+ // Decode audio bytes and determine content type
+ let audioBytes: Buffer;
+ let contentType: string;
+
+ if (typeof response.audio === "string") {
+ audioBytes = Buffer.from(response.audio, "base64");
+ const format = response.format ?? "mp3";
+ contentType = FORMAT_TO_CONTENT_TYPE[format] ?? "audio/mpeg";
+ } else {
+ audioBytes = Buffer.from(response.audio.b64Json, "base64");
+ contentType = response.audio.contentType ?? "audio/mpeg";
+ }
+
+ // Music endpoints get a song-id header
+ if (subType === "music" || subType === "stream") {
+ res.setHeader("song-id", "mock-song-" + Date.now());
+ }
+
+ // Stream uses chunked transfer encoding
+ if (subType === "stream") {
+ journal.add({
+ method,
+ path,
+ headers: {},
+ body: syntheticReq,
+ response: { status: 200, fixture },
+ });
+ res.writeHead(200, {
+ "Content-Type": contentType,
+ "Transfer-Encoding": "chunked",
+ });
+ res.end(audioBytes);
+ return;
+ }
+
+ // Standard binary response for sound-generation and music
+ journal.add({
+ method,
+ path,
+ headers: {},
+ body: syntheticReq,
+ response: { status: 200, fixture },
+ });
+ res.writeHead(200, { "Content-Type": contentType });
+ res.end(audioBytes);
+}
diff --git a/src/fal-audio.ts b/src/fal-audio.ts
new file mode 100644
index 0000000..88b74d5
--- /dev/null
+++ b/src/fal-audio.ts
@@ -0,0 +1,622 @@
+import type http from "node:http";
+import crypto from "node:crypto";
+import type { AudioResponse, ChatCompletionRequest, Fixture, HandlerDefaults } from "./types.js";
+import { isAudioResponse, isErrorResponse, FORMAT_TO_CONTENT_TYPE, getTestId } from "./helpers.js";
+import { matchFixture } from "./router.js";
+import { proxyAndRecord } from "./recorder.js";
+import type { Journal } from "./journal.js";
+
+// ─── FalJobMap with TTL and size bound ───────────────────────────────────
+
+const FAL_JOB_MAX_ENTRIES = 10_000;
+const FAL_JOB_TTL_MS = 3_600_000; // 1 hour
+
+interface FalJob {
+ requestId: string;
+ modelId: string;
+ status: "IN_QUEUE" | "IN_PROGRESS" | "COMPLETED";
+  result: Record<string, unknown> | null;
+ createdAt: number;
+}
+
+interface FalJobEntry {
+ job: FalJob;
+ createdAt: number;
+}
+
+/**
+ * A Map wrapper for fal.ai queue jobs that enforces a maximum size and per-entry TTL.
+ * Entries older than FAL_JOB_TTL_MS are lazily evicted on `get`.
+ * When the map exceeds FAL_JOB_MAX_ENTRIES on `set`, the oldest entries
+ * are removed to stay within bounds.
+ */
+export class FalJobMap {
+  private readonly entries = new Map<string, FalJobEntry>();
+
+ get(key: string): FalJob | undefined {
+ const entry = this.entries.get(key);
+ if (!entry) return undefined;
+ if (Date.now() - entry.createdAt > FAL_JOB_TTL_MS) {
+ this.entries.delete(key);
+ return undefined;
+ }
+ return entry.job;
+ }
+
+ set(key: string, job: FalJob): void {
+ this.entries.set(key, { job, createdAt: Date.now() });
+ // Evict oldest entries if over capacity
+ if (this.entries.size > FAL_JOB_MAX_ENTRIES) {
+ const excess = this.entries.size - FAL_JOB_MAX_ENTRIES;
+ const iter = this.entries.keys();
+ for (let i = 0; i < excess; i++) {
+ const next = iter.next();
+ if (!next.done) this.entries.delete(next.value);
+ }
+ }
+ }
+
+ delete(key: string): boolean {
+ return this.entries.delete(key);
+ }
+
+ clear(): void {
+ this.entries.clear();
+ }
+
+ get size(): number {
+ return this.entries.size;
+ }
+}
+
+// Module-level singleton — exported so server.ts can clear it during reset
+export const falJobs = new FalJobMap();
+
+// ─── Audio response translation ──────────────────────────────────────────
+
+function audioToFalFile(response: AudioResponse): Record<string, unknown> {
+ let contentType: string;
+ let data: string;
+
+ if (typeof response.audio === "string") {
+ data = response.audio;
+ contentType = FORMAT_TO_CONTENT_TYPE[response.format ?? "mp3"] ?? "audio/mpeg";
+ } else {
+ data = response.audio.b64Json;
+ contentType = response.audio.contentType ?? "audio/mpeg";
+ }
+
+ const ext =
+ response.format ??
+ (contentType !== "audio/mpeg"
+ ? (Object.entries(FORMAT_TO_CONTENT_TYPE).find(([, v]) => v === contentType)?.[0] ?? "mp3")
+ : "mp3");
+
+ const fileSize =
+ Math.ceil((data.length * 3) / 4) - (data.endsWith("==") ? 2 : data.endsWith("=") ? 1 : 0);
+
+ return {
+ audio: {
+ url: `https://mock.fal.media/files/generated_audio.${ext}`,
+ content_type: contentType,
+ file_name: `generated_audio.${ext}`,
+ file_size: fileSize,
+ },
+ };
+}
+
+// ─── Route patterns ──────────────────────────────────────────────────────
+
+const QUEUE_SUBMIT_RE = /^\/fal\/queue\/submit\/(.+)$/;
+const QUEUE_STATUS_RE = /^\/fal\/queue\/requests\/([^/]+)\/status$/;
+const QUEUE_RESULT_RE = /^\/fal\/queue\/requests\/([^/]+)$/;
+const QUEUE_CANCEL_RE = /^\/fal\/queue\/requests\/([^/]+)\/cancel$/;
+const SYNC_RUN_RE = /^\/fal\/run\/(.+)$/;
+
+// ─── Handler ─────────────────────────────────────────────────────────────
+
+export async function handleFalQueue(
+ req: http.IncomingMessage,
+ res: http.ServerResponse,
+ body: string,
+ pathname: string,
+ fixtures: Fixture[],
+ defaults: HandlerDefaults,
+ journal: Journal,
+): Promise<void> {
+ const testId = getTestId(req);
+ const matchCounts = journal.getFixtureMatchCountsForTest(testId);
+
+ // ── Queue Submit ───────────────────────────────────────────────────
+ const submitMatch = QUEUE_SUBMIT_RE.exec(pathname);
+ if (submitMatch && req.method === "POST") {
+ const modelId = submitMatch[1];
+ return handleQueueSubmit(
+ req,
+ res,
+ body,
+ pathname,
+ modelId,
+ testId,
+ fixtures,
+ defaults,
+ matchCounts,
+ journal,
+ );
+ }
+
+ // ── Queue Status ───────────────────────────────────────────────────
+ const statusMatch = QUEUE_STATUS_RE.exec(pathname);
+ if (statusMatch) {
+ const requestId = statusMatch[1];
+ return handleQueueStatus(req, res, pathname, requestId, testId, journal);
+ }
+
+ // ── Queue Cancel ───────────────────────────────────────────────────
+ const cancelMatch = QUEUE_CANCEL_RE.exec(pathname);
+ if (cancelMatch) {
+ const requestId = cancelMatch[1];
+ return handleQueueCancel(req, res, pathname, requestId, testId, journal);
+ }
+
+ // ── Queue Result ───────────────────────────────────────────────────
+ const resultMatch = QUEUE_RESULT_RE.exec(pathname);
+ if (resultMatch) {
+ const requestId = resultMatch[1];
+ return handleQueueResult(req, res, pathname, requestId, testId, journal);
+ }
+
+ // ── Synchronous Run ────────────────────────────────────────────────
+ const runMatch = SYNC_RUN_RE.exec(pathname);
+ if (runMatch && req.method === "POST") {
+ const modelId = runMatch[1];
+ return handleSyncRun(
+ req,
+ res,
+ body,
+ pathname,
+ modelId,
+ fixtures,
+ defaults,
+ matchCounts,
+ journal,
+ );
+ }
+
+ // Unknown fal path
+ const errorBody = { error: { message: "Unknown fal.ai endpoint", type: "not_found" } };
+ journal.add({
+ method: req.method ?? "GET",
+ path: pathname,
+ headers: {},
+ body: null,
+ response: { status: 404, fixture: null },
+ });
+ res.writeHead(404, { "Content-Type": "application/json" });
+ res.end(JSON.stringify(errorBody));
+}
+
+// ─── Sub-handlers ────────────────────────────────────────────────────────
+
+async function handleQueueSubmit(
+ req: http.IncomingMessage,
+ res: http.ServerResponse,
+ body: string,
+ pathname: string,
+ modelId: string,
+ testId: string,
+ fixtures: Fixture[],
+ defaults: HandlerDefaults,
+  matchCounts: Map<string, number>,
+ journal: Journal,
+): Promise<void> {
+  let parsed: Record<string, unknown> = {};
+ if (body.trim()) {
+ try {
+      parsed = JSON.parse(body) as Record<string, unknown>;
+ } catch {
+ journal.add({
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: {},
+ body: null,
+ response: { status: 400, fixture: null },
+ });
+ res.writeHead(400, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ error: { message: "Malformed JSON", type: "invalid_request_error" },
+ }),
+ );
+ return;
+ }
+ }
+
+ const prompt =
+ (typeof parsed.prompt === "string" ? parsed.prompt : null) ??
+ (typeof parsed.text === "string" ? parsed.text : null) ??
+ "";
+
+ const syntheticReq: ChatCompletionRequest = {
+ model: modelId,
+ messages: [{ role: "user", content: prompt }],
+ _endpointType: "fal-audio",
+ };
+
+ const fixture = matchFixture(fixtures, syntheticReq, matchCounts, defaults.requestTransform);
+
+ if (!fixture) {
+ if (defaults.record) {
+ const proxied = await proxyAndRecord(
+ req,
+ res,
+ syntheticReq,
+ "fal",
+ pathname,
+ fixtures,
+ defaults,
+ body,
+ );
+ if (proxied) {
+ journal.add({
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: {},
+ body: syntheticReq,
+ response: { status: res.statusCode ?? 200, fixture: null },
+ });
+ return;
+ }
+ }
+
+ const strictStatus = defaults.strict ? 503 : 404;
+ const strictMessage = defaults.strict
+ ? "Strict mode: no fixture matched"
+ : "No fixture matched";
+ journal.add({
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: {},
+ body: syntheticReq,
+ response: { status: strictStatus, fixture: null },
+ });
+ res.writeHead(strictStatus, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ error: {
+ message: strictMessage,
+ type: "invalid_request_error",
+ code: "no_fixture_match",
+ },
+ }),
+ );
+ return;
+ }
+
+ journal.incrementFixtureMatchCount(fixture, fixtures, testId);
+ const response = fixture.response;
+
+ if (isErrorResponse(response)) {
+ const status = response.status ?? 500;
+ journal.add({
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: {},
+ body: syntheticReq,
+ response: { status, fixture },
+ });
+ res.writeHead(status, { "Content-Type": "application/json" });
+ res.end(JSON.stringify(response));
+ return;
+ }
+
+ if (!isAudioResponse(response)) {
+ journal.add({
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: {},
+ body: syntheticReq,
+ response: { status: 500, fixture },
+ });
+ res.writeHead(500, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ error: { message: "Fixture response is not an audio type", type: "server_error" },
+ }),
+ );
+ return;
+ }
+
+ const requestId = crypto.randomUUID();
+ const result = audioToFalFile(response);
+
+ const job: FalJob = {
+ requestId,
+ modelId,
+ status: "COMPLETED",
+ result,
+ createdAt: Date.now(),
+ };
+
+ const stateKey = `${testId}:${requestId}`;
+ falJobs.set(stateKey, job);
+
+ journal.add({
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: {},
+ body: syntheticReq,
+ response: { status: 200, fixture },
+ });
+ res.writeHead(200, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ request_id: requestId,
+ response_url: `https://queue.fal.run/${modelId}/requests/${requestId}/response`,
+ status_url: `https://queue.fal.run/${modelId}/requests/${requestId}/status`,
+ cancel_url: `https://queue.fal.run/${modelId}/requests/${requestId}/cancel`,
+ queue_position: 0,
+ }),
+ );
+}
+
+function handleQueueStatus(
+ req: http.IncomingMessage,
+ res: http.ServerResponse,
+ pathname: string,
+ requestId: string,
+ testId: string,
+ journal: Journal,
+): void {
+ const stateKey = `${testId}:${requestId}`;
+ const job = falJobs.get(stateKey);
+
+ if (!job) {
+ journal.add({
+ method: req.method ?? "GET",
+ path: pathname,
+ headers: {},
+ body: null,
+ response: { status: 404, fixture: null },
+ });
+ res.writeHead(404, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ error: { message: `Request ${requestId} not found`, type: "not_found" },
+ }),
+ );
+ return;
+ }
+
+ journal.add({
+ method: req.method ?? "GET",
+ path: pathname,
+ headers: {},
+ body: null,
+ response: { status: 200, fixture: null },
+ });
+ res.writeHead(200, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ status: job.status,
+ request_id: job.requestId,
+ response_url: `https://queue.fal.run/${job.modelId}/requests/${requestId}/response`,
+ }),
+ );
+}
+
+function handleQueueResult(
+ req: http.IncomingMessage,
+ res: http.ServerResponse,
+ pathname: string,
+ requestId: string,
+ testId: string,
+ journal: Journal,
+): void {
+ const stateKey = `${testId}:${requestId}`;
+ const job = falJobs.get(stateKey);
+
+ if (!job) {
+ journal.add({
+ method: req.method ?? "GET",
+ path: pathname,
+ headers: {},
+ body: null,
+ response: { status: 404, fixture: null },
+ });
+ res.writeHead(404, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ error: { message: `Request ${requestId} not found`, type: "not_found" },
+ }),
+ );
+ return;
+ }
+
+ journal.add({
+ method: req.method ?? "GET",
+ path: pathname,
+ headers: {},
+ body: null,
+ response: { status: 200, fixture: null },
+ });
+ res.writeHead(200, { "Content-Type": "application/json" });
+ res.end(JSON.stringify(job.result));
+}
+
+function handleQueueCancel(
+ req: http.IncomingMessage,
+ res: http.ServerResponse,
+ pathname: string,
+ requestId: string,
+ testId: string,
+ journal: Journal,
+): void {
+ const stateKey = `${testId}:${requestId}`;
+ const job = falJobs.get(stateKey);
+
+ if (!job) {
+ journal.add({
+ method: req.method ?? "DELETE",
+ path: pathname,
+ headers: {},
+ body: null,
+ response: { status: 404, fixture: null },
+ });
+ res.writeHead(404, { "Content-Type": "application/json" });
+ res.end(JSON.stringify({ status: "NOT_FOUND" }));
+ return;
+ }
+
+ // Since we complete immediately, cancellation always returns ALREADY_COMPLETED
+ journal.add({
+ method: req.method ?? "DELETE",
+ path: pathname,
+ headers: {},
+ body: null,
+ response: { status: 400, fixture: null },
+ });
+ res.writeHead(400, { "Content-Type": "application/json" });
+ res.end(JSON.stringify({ status: "ALREADY_COMPLETED" }));
+}
+
+async function handleSyncRun(
+ req: http.IncomingMessage,
+ res: http.ServerResponse,
+ body: string,
+ pathname: string,
+ modelId: string,
+ fixtures: Fixture[],
+ defaults: HandlerDefaults,
+  matchCounts: Map<string, number>,
+ journal: Journal,
+): Promise<void> {
+  let parsed: Record<string, unknown> = {};
+ if (body.trim()) {
+ try {
+      parsed = JSON.parse(body) as Record<string, unknown>;
+ } catch {
+ journal.add({
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: {},
+ body: null,
+ response: { status: 400, fixture: null },
+ });
+ res.writeHead(400, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ error: { message: "Malformed JSON", type: "invalid_request_error" },
+ }),
+ );
+ return;
+ }
+ }
+
+ const prompt =
+ (typeof parsed.prompt === "string" ? parsed.prompt : null) ??
+ (typeof parsed.text === "string" ? parsed.text : null) ??
+ "";
+
+ const syntheticReq: ChatCompletionRequest = {
+ model: modelId,
+ messages: [{ role: "user", content: prompt }],
+ _endpointType: "fal-audio",
+ };
+
+ const fixture = matchFixture(fixtures, syntheticReq, matchCounts, defaults.requestTransform);
+
+ if (!fixture) {
+ if (defaults.record) {
+ const proxied = await proxyAndRecord(
+ req,
+ res,
+ syntheticReq,
+ "fal",
+ pathname,
+ fixtures,
+ defaults,
+ body,
+ );
+ if (proxied) {
+ journal.add({
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: {},
+ body: syntheticReq,
+ response: { status: res.statusCode ?? 200, fixture: null },
+ });
+ return;
+ }
+ }
+
+ const strictStatus = defaults.strict ? 503 : 404;
+ const strictMessage = defaults.strict
+ ? "Strict mode: no fixture matched"
+ : "No fixture matched";
+ journal.add({
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: {},
+ body: syntheticReq,
+ response: { status: strictStatus, fixture: null },
+ });
+ res.writeHead(strictStatus, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ error: {
+ message: strictMessage,
+ type: "invalid_request_error",
+ code: "no_fixture_match",
+ },
+ }),
+ );
+ return;
+ }
+
+ journal.incrementFixtureMatchCount(fixture, fixtures, getTestId(req));
+ const response = fixture.response;
+
+ if (isErrorResponse(response)) {
+ const status = response.status ?? 500;
+ journal.add({
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: {},
+ body: syntheticReq,
+ response: { status, fixture },
+ });
+ res.writeHead(status, { "Content-Type": "application/json" });
+ res.end(JSON.stringify(response));
+ return;
+ }
+
+ if (!isAudioResponse(response)) {
+ journal.add({
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: {},
+ body: syntheticReq,
+ response: { status: 500, fixture },
+ });
+ res.writeHead(500, { "Content-Type": "application/json" });
+ res.end(
+ JSON.stringify({
+ error: { message: "Fixture response is not an audio type", type: "server_error" },
+ }),
+ );
+ return;
+ }
+
+ const result = audioToFalFile(response);
+
+ journal.add({
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: {},
+ body: syntheticReq,
+ response: { status: 200, fixture },
+ });
+ res.writeHead(200, { "Content-Type": "application/json" });
+ res.end(JSON.stringify(result));
+}
diff --git a/src/fixture-loader.ts b/src/fixture-loader.ts
index 0d7a210..d2b944b 100644
--- a/src/fixture-loader.ts
+++ b/src/fixture-loader.ts
@@ -381,6 +381,25 @@ export function validateFixtures(fixtures: Fixture[]): ValidationResult[] {
}
}
+ // Audio response checks — validate object-form audio
+ if (isAudioResponse(response) && typeof response.audio === "object") {
+ const audioObj = response.audio;
+ if (typeof audioObj.b64Json !== "string" || audioObj.b64Json === "") {
+ results.push({
+ severity: "error",
+ fixtureIndex: i,
+ message: "audio.b64Json must be a non-empty string",
+ });
+ }
+ if (audioObj.contentType !== undefined && typeof audioObj.contentType !== "string") {
+ results.push({
+ severity: "error",
+ fixtureIndex: i,
+ message: `audio.contentType must be a string, got ${typeof audioObj.contentType}`,
+ });
+ }
+ }
+
// Validate ResponseOverrides fields
if (
isTextResponse(response) ||
diff --git a/src/gemini.ts b/src/gemini.ts
index 94e6d5b..8729c64 100644
--- a/src/gemini.ts
+++ b/src/gemini.ts
@@ -8,6 +8,7 @@
import type * as http from "node:http";
import type {
+ AudioResponse,
ChatCompletionRequest,
ChatMessage,
Fixture,
@@ -23,7 +24,9 @@ import {
isToolCallResponse,
isContentWithToolCallsResponse,
isErrorResponse,
+ isAudioResponse,
extractOverrides,
+ formatToMime,
generateToolCallId,
flattenHeaders,
getTestId,
@@ -43,6 +46,7 @@ interface GeminiPart {
thought?: boolean;
functionCall?: { name: string; args: Record; id?: string };
functionResponse?: { name: string; response: unknown };
+ inlineData?: { mimeType: string; data: string };
}
interface GeminiContent {
@@ -449,6 +453,48 @@ function buildGeminiContentWithToolCallsResponse(
};
}
+// ─── Audio response builders ────────────────────────────────────────────────
+
+function resolveAudioInlineData(audio: AudioResponse): { mimeType: string; data: string } {
+ if (typeof audio.audio === "string") {
+ return { mimeType: formatToMime(audio.format ?? "mp3"), data: audio.audio };
+ }
+ return {
+ mimeType: audio.audio.contentType ?? "audio/mpeg",
+ data: audio.audio.b64Json,
+ };
+}
+
+function buildGeminiAudioResponse(audio: AudioResponse): GeminiResponseChunk {
+ const inlineData = resolveAudioInlineData(audio);
+ return {
+ candidates: [
+ {
+ content: { role: "model", parts: [{ inlineData }] },
+ finishReason: "STOP",
+ index: 0,
+ },
+ ],
+ usageMetadata: { promptTokenCount: 0, candidatesTokenCount: 0, totalTokenCount: 0 },
+ };
+}
+
+function buildGeminiAudioStreamChunks(audio: AudioResponse): GeminiResponseChunk[] {
+ const inlineData = resolveAudioInlineData(audio);
+ return [
+ {
+ candidates: [
+ {
+ content: { role: "model", parts: [{ inlineData }] },
+ finishReason: "STOP",
+ index: 0,
+ },
+ ],
+ usageMetadata: { promptTokenCount: 0, candidatesTokenCount: 0, totalTokenCount: 0 },
+ },
+ ];
+}
+
// ─── SSE writer for Gemini streaming ────────────────────────────────────────
interface GeminiStreamOptions {
@@ -655,6 +701,38 @@ export async function handleGemini(
return;
}
+ // Audio response
+ if (isAudioResponse(response)) {
+ const journalEntry = journal.add({
+ method: req.method ?? "POST",
+ path,
+ headers: flattenHeaders(req.headers),
+ body: completionReq,
+ response: { status: 200, fixture },
+ });
+ if (!streaming) {
+ const body = buildGeminiAudioResponse(response);
+ res.writeHead(200, { "Content-Type": "application/json" });
+ res.end(JSON.stringify(body));
+ } else {
+ const chunks = buildGeminiAudioStreamChunks(response);
+ const interruption = createInterruptionSignal(fixture);
+ const completed = await writeGeminiSSEStream(res, chunks, {
+ latency,
+ streamingProfile: fixture.streamingProfile,
+ signal: interruption?.signal,
+ onChunkSent: interruption?.tick,
+ });
+ if (!completed) {
+ if (!res.writableEnded) res.destroy();
+ journalEntry.response.interrupted = true;
+ journalEntry.response.interruptReason = interruption?.reason();
+ }
+ interruption?.cleanup();
+ }
+ return;
+ }
+
// Content + tool calls response (must be checked before isTextResponse / isToolCallResponse)
if (isContentWithToolCallsResponse(response)) {
if (response.webSearches?.length) {
diff --git a/src/helpers.ts b/src/helpers.ts
index 2b654a6..3836026 100644
--- a/src/helpers.ts
+++ b/src/helpers.ts
@@ -91,7 +91,30 @@ export function isImageResponse(r: FixtureResponse): r is ImageResponse {
}
export function isAudioResponse(r: FixtureResponse): r is AudioResponse {
- return "audio" in r && typeof (r as AudioResponse).audio === "string";
+ if (!("audio" in r)) return false;
+ const a = (r as AudioResponse).audio;
+ return typeof a === "string" || (typeof a === "object" && a !== null && "b64Json" in a);
+}
+
+/**
+ * Map audio format shorthand to MIME content types.
+ * Shared between speech, ElevenLabs, and fal audio handlers.
+ */
+export const FORMAT_TO_CONTENT_TYPE: Record = {
+ mp3: "audio/mpeg",
+ opus: "audio/opus",
+ aac: "audio/aac",
+ flac: "audio/flac",
+ wav: "audio/wav",
+ pcm: "audio/pcm",
+};
+
+/**
+ * Resolve a format string (e.g. "mp3", "opus") to its MIME content type.
+ * Falls back to "application/octet-stream" for unknown formats.
+ */
+export function formatToMime(format: string): string {
+ return FORMAT_TO_CONTENT_TYPE[format] ?? "application/octet-stream";
}
export function isTranscriptionResponse(r: FixtureResponse): r is TranscriptionResponse {
diff --git a/src/index.ts b/src/index.ts
index 6db5715..d4c13a5 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -90,6 +90,8 @@ export { handleImages } from "./images.js";
export { handleSpeech } from "./speech.js";
export { handleTranscription } from "./transcription.js";
export { handleVideoCreate, handleVideoStatus, VideoStateMap } from "./video.js";
+export { handleElevenLabsAudio } from "./elevenlabs-audio.js";
+export { handleFalQueue } from "./fal-audio.js";
// Helpers
export {
@@ -116,6 +118,8 @@ export {
isVideoResponse,
generateDeterministicEmbedding,
buildEmbeddingResponse,
+ FORMAT_TO_CONTENT_TYPE,
+ formatToMime,
} from "./helpers.js";
export type { EmbeddingAPIResponse } from "./helpers.js";
diff --git a/src/llmock.ts b/src/llmock.ts
index 519dabd..031e228 100644
--- a/src/llmock.ts
+++ b/src/llmock.ts
@@ -27,6 +27,7 @@ import { Journal } from "./journal.js";
import type { SearchFixture, SearchResult } from "./search.js";
import type { RerankFixture, RerankResult } from "./rerank.js";
import type { ModerationFixture, ModerationResult } from "./moderation.js";
+import { falJobs } from "./fal-audio.js";
export class LLMock {
private fixtures: Fixture[] = [];
@@ -167,6 +168,31 @@ export class LLMock {
});
}
+ onAudio(input: string | RegExp, response: AudioResponse): this {
+ return this.addFixture({ match: { userMessage: input }, response });
+ }
+
+ onSoundEffect(text: string | RegExp, response: AudioResponse): this {
+ return this.addFixture({
+ match: { userMessage: text, endpoint: "audio-gen" },
+ response,
+ });
+ }
+
+ onMusic(prompt: string | RegExp, response: AudioResponse): this {
+ return this.addFixture({
+ match: { userMessage: prompt, endpoint: "audio-gen" },
+ response,
+ });
+ }
+
+ onFalAudio(prompt: string | RegExp, response: AudioResponse, model?: string): this {
+ return this.addFixture({
+ match: { userMessage: prompt, endpoint: "fal-audio", ...(model ? { model } : {}) },
+ response,
+ });
+ }
+
// ---- Service mock convenience methods ----
onSearch(pattern: string | RegExp, results: SearchResult[]): this {
@@ -213,11 +239,9 @@ export class LLMock {
fixture.match.predicate = (req) => {
const result = original(req);
if (result) {
- // Defer splice so it doesn't mutate the array while matchFixture iterates it
- queueMicrotask(() => {
- const idx = this.fixtures.indexOf(fixture);
- if (idx !== -1) this.fixtures.splice(idx, 1);
- });
+ // Remove synchronously on first match to prevent race conditions
+ const idx = this.fixtures.indexOf(fixture);
+ if (idx !== -1) this.fixtures.splice(idx, 1);
}
return result;
};
@@ -293,6 +317,7 @@ export class LLMock {
this.searchFixtures.length = 0;
this.rerankFixtures.length = 0;
this.moderationFixtures.length = 0;
+ falJobs.clear();
if (this.serverInstance) {
this.serverInstance.journal.clear();
this.serverInstance.videoStates.clear();
diff --git a/src/recorder.ts b/src/recorder.ts
index 0e4d56b..f484e64 100644
--- a/src/recorder.ts
+++ b/src/recorder.ts
@@ -170,23 +170,37 @@ export async function proxyAndRecord(
if (collapsed.droppedChunks && collapsed.droppedChunks > 0) {
defaults.logger.warn(`${collapsed.droppedChunks} chunk(s) dropped during stream collapse`);
}
- if (collapsed.content === "" && (!collapsed.toolCalls || collapsed.toolCalls.length === 0)) {
+ // Audio from streamed inlineData (e.g. Gemini SSE with audio parts)
+ if (collapsed.audioB64) {
+ fixtureResponse = {
+ audio: {
+ b64Json: collapsed.audioB64,
+ contentType: collapsed.audioMimeType ?? "audio/mpeg",
+ },
+ };
+ } else if (
+ collapsed.content === "" &&
+ (!collapsed.toolCalls || collapsed.toolCalls.length === 0)
+ ) {
defaults.logger.warn("Stream collapse produced empty content — fixture may be incomplete");
- }
- const reasoningSpread = collapsed.reasoning ? { reasoning: collapsed.reasoning } : {};
- if (collapsed.toolCalls && collapsed.toolCalls.length > 0) {
- if (collapsed.content) {
- // Both content and toolCalls present — save as ContentWithToolCallsResponse
- fixtureResponse = {
- content: collapsed.content,
- toolCalls: collapsed.toolCalls,
- ...reasoningSpread,
- };
+ const reasoningSpread = collapsed.reasoning ? { reasoning: collapsed.reasoning } : {};
+ fixtureResponse = { content: collapsed.content ?? "", ...reasoningSpread };
+ } else {
+ const reasoningSpread = collapsed.reasoning ? { reasoning: collapsed.reasoning } : {};
+ if (collapsed.toolCalls && collapsed.toolCalls.length > 0) {
+ if (collapsed.content) {
+ // Both content and toolCalls present — save as ContentWithToolCallsResponse
+ fixtureResponse = {
+ content: collapsed.content,
+ toolCalls: collapsed.toolCalls,
+ ...reasoningSpread,
+ };
+ } else {
+ fixtureResponse = { toolCalls: collapsed.toolCalls, ...reasoningSpread };
+ }
} else {
- fixtureResponse = { toolCalls: collapsed.toolCalls, ...reasoningSpread };
+ fixtureResponse = { content: collapsed.content ?? "", ...reasoningSpread };
}
- } else {
- fixtureResponse = { content: collapsed.content ?? "", ...reasoningSpread };
}
} else {
// Non-streaming — try to parse as JSON
@@ -223,7 +237,10 @@ export async function proxyAndRecord(
const fixtureMatch = buildFixtureMatch(matchRequest);
// Build and save the fixture
- const fixture: Fixture = { match: fixtureMatch, response: fixtureResponse };
+ const fixture: Fixture = {
+ match: fixtureMatch,
+ response: fixtureResponse,
+ };
// Check if the match is empty (all undefined values) — warn but still save to disk
const matchValues = Object.values(fixtureMatch);
@@ -453,6 +470,10 @@ function buildFixtureResponse(
}
if (typeof first.embedding === "string" && encodingFormat === "base64") {
const buf = Buffer.from(first.embedding, "base64");
+ if (buf.byteLength % 4 !== 0) {
+ // Malformed embedding — return a zero-dimension embedding fixture
+ return { embedding: [] };
+ }
const aligned = new Uint8Array(buf).buffer; // Always offset 0
const floats = new Float32Array(aligned, 0, buf.byteLength / 4);
return { embedding: Array.from(floats) };
@@ -656,6 +677,24 @@ function buildFixtureResponse(
const content = candidate.content as Record | undefined;
if (content && Array.isArray(content.parts)) {
const parts = content.parts as Array>;
+
+ // Audio inlineData parts take priority over text
+ const audioParts = parts.filter(
+ (p: Record) =>
+ p.inlineData &&
+ typeof (p.inlineData as Record).mimeType === "string" &&
+ ((p.inlineData as Record).mimeType as string).startsWith("audio/"),
+ );
+ if (audioParts.length > 0) {
+ const inlineData = audioParts[0].inlineData as Record;
+ return {
+ audio: {
+ b64Json: String(inlineData.data ?? ""),
+ contentType: String(inlineData.mimeType),
+ },
+ };
+ }
+
const fnCallParts = parts.filter((p) => p.functionCall);
const textParts = parts.filter((p) => typeof p.text === "string" && !p.thought);
const thoughtParts = parts.filter((p) => p.thought === true && typeof p.text === "string");
@@ -863,7 +902,15 @@ function buildFixtureResponse(
/**
* Derive fixture match criteria from the original request.
*/
-type EndpointType = "chat" | "image" | "speech" | "transcription" | "video" | "embedding";
+type EndpointType =
+ | "chat"
+ | "image"
+ | "speech"
+ | "transcription"
+ | "video"
+ | "embedding"
+ | "audio-gen"
+ | "fal-audio";
function buildFixtureMatch(request: ChatCompletionRequest): {
userMessage?: string;
diff --git a/src/router.ts b/src/router.ts
index 844d97b..ebfdbb2 100644
--- a/src/router.ts
+++ b/src/router.ts
@@ -62,6 +62,8 @@ export function matchFixture(
const compatible =
(reqEndpoint === "image" && isImageResponse(r)) ||
(reqEndpoint === "speech" && isAudioResponse(r)) ||
+ (reqEndpoint === "audio-gen" && isAudioResponse(r)) ||
+ (reqEndpoint === "fal-audio" && isAudioResponse(r)) ||
(reqEndpoint === "transcription" && isTranscriptionResponse(r)) ||
(reqEndpoint === "video" && isVideoResponse(r));
if (!compatible) continue;
diff --git a/src/server.ts b/src/server.ts
index c809afd..65e7885 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -25,6 +25,7 @@ import {
isToolCallResponse,
isContentWithToolCallsResponse,
isErrorResponse,
+ isAudioResponse,
flattenHeaders,
getTestId,
} from "./helpers.js";
@@ -39,6 +40,8 @@ import { handleImages } from "./images.js";
import { handleSpeech } from "./speech.js";
import { handleTranscription } from "./transcription.js";
import { handleVideoCreate, handleVideoStatus, VideoStateMap } from "./video.js";
+import { handleElevenLabsAudio } from "./elevenlabs-audio.js";
+import { handleFalQueue, falJobs } from "./fal-audio.js";
import { handleOllama, handleOllamaGenerate } from "./ollama.js";
import { handleCohere } from "./cohere.js";
import { handleSearch, type SearchFixture } from "./search.js";
@@ -78,6 +81,11 @@ const TRANSCRIPTIONS_PATH = "/v1/audio/transcriptions";
const VIDEOS_PATH = "/v1/videos";
const VIDEOS_STATUS_RE = /^\/v1\/videos\/([^/]+)$/;
const GEMINI_PREDICT_RE = /^\/v1beta\/models\/([^:]+):predict$/;
+const ELEVENLABS_SOUND_GENERATION_PATH = "/v1/sound-generation";
+const ELEVENLABS_MUSIC_RE = /^\/v1\/music(?:\/(.+))?$/;
+const FAL_QUEUE_SUBMIT_RE = /^\/fal\/queue\/submit\/(.+)$/;
+const FAL_QUEUE_REQUESTS_RE = /^\/fal\/queue\/requests\/(.+)$/;
+const FAL_RUN_RE = /^\/fal\/run\/(.+)$/;
const DEFAULT_CHUNK_SIZE = 20;
// OpenAI-compatible endpoint suffixes for path prefix normalization.
@@ -294,6 +302,7 @@ async function handleControlAPI(
fixtures.length = 0;
journal.clear();
videoStates.clear();
+ falJobs.clear();
if (defaults.registry) {
defaults.registry.setGauge("aimock_fixtures_loaded", {}, fixtures.length);
}
@@ -572,6 +581,29 @@ async function handleCompletions(
return;
}
+ // Audio responses are not supported on the chat completions endpoint
+ if (isAudioResponse(response)) {
+ journal.add({
+ method: req.method ?? "POST",
+ path: req.url ?? COMPLETIONS_PATH,
+ headers: flattenHeaders(req.headers),
+ body,
+ response: { status: 422, fixture },
+ });
+ writeErrorResponse(
+ res,
+ 422,
+ JSON.stringify({
+ error: {
+ message:
+ "Audio responses are not supported on the chat completions endpoint. Use Gemini generateContent or a dedicated audio endpoint.",
+ type: "invalid_request_error",
+ },
+ }),
+ );
+ return;
+ }
+
// Content + tool calls response
if (isContentWithToolCallsResponse(response)) {
if (response.webSearches?.length) {
@@ -1607,6 +1639,199 @@ export async function createServer(
return;
}
+ // POST /v1/sound-generation — ElevenLabs Sound Generation API
+ if (pathname === ELEVENLABS_SOUND_GENERATION_PATH && req.method === "POST") {
+ setCorsHeaders(res);
+ try {
+ const raw = await readBody(req);
+ const chaosResult = applyChaos(
+ res,
+ null,
+ defaults.chaos,
+ req.headers,
+ journal,
+ {
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: flattenHeaders(req.headers),
+ body: { model: "", messages: [] },
+ },
+ defaults.registry,
+ defaults.logger,
+ );
+ if (chaosResult) return;
+ await handleElevenLabsAudio(req, res, raw, fixtures, defaults, journal, "sound-generation");
+ } catch (err: unknown) {
+ const msg = err instanceof Error ? err.message : "Internal error";
+ if (!res.headersSent) {
+ writeErrorResponse(
+ res,
+ 500,
+ JSON.stringify({ error: { message: msg, type: "server_error" } }),
+ );
+ } else if (!res.writableEnded) {
+ res.destroy();
+ }
+ }
+ return;
+ }
+
+ // POST /v1/music/(generation|variation|remix|extend) — ElevenLabs Music API
+ const musicMatch = pathname.match(ELEVENLABS_MUSIC_RE);
+ if (musicMatch && req.method === "POST") {
+ setCorsHeaders(res);
+ const musicSubType = musicMatch[1] ?? "music";
+ try {
+ const raw = await readBody(req);
+ const chaosResult = applyChaos(
+ res,
+ null,
+ defaults.chaos,
+ req.headers,
+ journal,
+ {
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: flattenHeaders(req.headers),
+ body: { model: "", messages: [] },
+ },
+ defaults.registry,
+ defaults.logger,
+ );
+ if (chaosResult) return;
+ await handleElevenLabsAudio(req, res, raw, fixtures, defaults, journal, musicSubType);
+ } catch (err: unknown) {
+ const msg = err instanceof Error ? err.message : "Internal error";
+ if (!res.headersSent) {
+ writeErrorResponse(
+ res,
+ 500,
+ JSON.stringify({ error: { message: msg, type: "server_error" } }),
+ );
+ } else if (!res.writableEnded) {
+ res.destroy();
+ }
+ }
+ return;
+ }
+
+ // POST /fal/queue/submit/{model} — fal.ai Queue Submit
+ const falQueueSubmitMatch = pathname.match(FAL_QUEUE_SUBMIT_RE);
+ if (falQueueSubmitMatch && req.method === "POST") {
+ setCorsHeaders(res);
+ try {
+ const raw = await readBody(req);
+ const chaosResult = applyChaos(
+ res,
+ null,
+ defaults.chaos,
+ req.headers,
+ journal,
+ {
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: flattenHeaders(req.headers),
+ body: { model: "", messages: [] },
+ },
+ defaults.registry,
+ defaults.logger,
+ );
+ if (chaosResult) return;
+ await handleFalQueue(req, res, raw, pathname, fixtures, defaults, journal);
+ } catch (err: unknown) {
+ const msg = err instanceof Error ? err.message : "Internal error";
+ if (!res.headersSent) {
+ writeErrorResponse(
+ res,
+ 500,
+ JSON.stringify({ error: { message: msg, type: "server_error" } }),
+ );
+ } else if (!res.writableEnded) {
+ res.destroy();
+ }
+ }
+ return;
+ }
+
+ // GET /fal/queue/requests/{requestId} — fal.ai Queue Status/Result
+ const falQueueRequestsMatch = pathname.match(FAL_QUEUE_REQUESTS_RE);
+ if (
+ falQueueRequestsMatch &&
+ (req.method === "GET" || req.method === "POST" || req.method === "PUT")
+ ) {
+ setCorsHeaders(res);
+ try {
+ const raw = req.method === "POST" ? await readBody(req) : "{}";
+ const chaosResult = applyChaos(
+ res,
+ null,
+ defaults.chaos,
+ req.headers,
+ journal,
+ {
+ method: req.method ?? "GET",
+ path: pathname,
+ headers: flattenHeaders(req.headers),
+ body: { model: "", messages: [] },
+ },
+ defaults.registry,
+ defaults.logger,
+ );
+ if (chaosResult) return;
+ await handleFalQueue(req, res, raw, pathname, fixtures, defaults, journal);
+ } catch (err: unknown) {
+ const msg = err instanceof Error ? err.message : "Internal error";
+ if (!res.headersSent) {
+ writeErrorResponse(
+ res,
+ 500,
+ JSON.stringify({ error: { message: msg, type: "server_error" } }),
+ );
+ } else if (!res.writableEnded) {
+ res.destroy();
+ }
+ }
+ return;
+ }
+
+ // POST /fal/run/{model} — fal.ai Synchronous Run
+ const falRunMatch = pathname.match(FAL_RUN_RE);
+ if (falRunMatch && req.method === "POST") {
+ setCorsHeaders(res);
+ try {
+ const raw = await readBody(req);
+ const chaosResult = applyChaos(
+ res,
+ null,
+ defaults.chaos,
+ req.headers,
+ journal,
+ {
+ method: req.method ?? "POST",
+ path: pathname,
+ headers: flattenHeaders(req.headers),
+ body: { model: "", messages: [] },
+ },
+ defaults.registry,
+ defaults.logger,
+ );
+ if (chaosResult) return;
+ await handleFalQueue(req, res, raw, pathname, fixtures, defaults, journal);
+ } catch (err: unknown) {
+ const msg = err instanceof Error ? err.message : "Internal error";
+ if (!res.headersSent) {
+ writeErrorResponse(
+ res,
+ 500,
+ JSON.stringify({ error: { message: msg, type: "server_error" } }),
+ );
+ } else if (!res.writableEnded) {
+ res.destroy();
+ }
+ }
+ return;
+ }
+
// POST /v1/chat/completions — Chat Completions API
if (pathname !== COMPLETIONS_PATH) {
handleNotFound(res, "Not found");
diff --git a/src/speech.ts b/src/speech.ts
index 980d179..f3664bd 100644
--- a/src/speech.ts
+++ b/src/speech.ts
@@ -1,6 +1,12 @@
import type * as http from "node:http";
import type { ChatCompletionRequest, Fixture, HandlerDefaults } from "./types.js";
-import { isAudioResponse, isErrorResponse, flattenHeaders, getTestId } from "./helpers.js";
+import {
+ isAudioResponse,
+ isErrorResponse,
+ flattenHeaders,
+ getTestId,
+ FORMAT_TO_CONTENT_TYPE,
+} from "./helpers.js";
import { matchFixture } from "./router.js";
import { writeErrorResponse } from "./sse-writer.js";
import type { Journal } from "./journal.js";
@@ -16,15 +22,6 @@ interface SpeechRequest {
[key: string]: unknown;
}
-const FORMAT_TO_CONTENT_TYPE: Record = {
- mp3: "audio/mpeg",
- opus: "audio/opus",
- aac: "audio/aac",
- flac: "audio/flac",
- wav: "audio/wav",
- pcm: "audio/pcm",
-};
-
export async function handleSpeech(
req: http.IncomingMessage,
res: http.ServerResponse,
@@ -190,6 +187,29 @@ export async function handleSpeech(
return;
}
+ // Object-form audio is not supported for the speech endpoint — reject early
+ if (typeof response.audio !== "string") {
+ journal.add({
+ method,
+ path,
+ headers: flattenHeaders(req.headers),
+ body: syntheticReq,
+ response: { status: 500, fixture },
+ });
+ writeErrorResponse(
+ res,
+ 500,
+ JSON.stringify({
+ error: {
+ message:
+ "Object-form audio not supported for speech endpoint. Use string-form: { audio: '' }",
+ type: "server_error",
+ },
+ }),
+ );
+ return;
+ }
+
journal.add({
method,
path,
diff --git a/src/stream-collapse.ts b/src/stream-collapse.ts
index 5634dd6..505b248 100644
--- a/src/stream-collapse.ts
+++ b/src/stream-collapse.ts
@@ -22,6 +22,8 @@ export interface CollapseResult {
toolCalls?: ToolCall[];
droppedChunks?: number;
truncated?: boolean;
+ audioB64?: string;
+ audioMimeType?: string;
}
// ---------------------------------------------------------------------------
@@ -259,7 +261,10 @@ export function collapseAnthropicSSE(body: string): CollapseResult {
export function collapseGeminiSSE(body: string): CollapseResult {
const lines = body.split("\n\n").filter((l) => l.trim().length > 0);
let content = "";
+ let reasoning = "";
let droppedChunks = 0;
+ let audioB64 = "";
+ let audioMimeType: string | undefined;
const toolCalls: ToolCall[] = [];
for (const line of lines) {
@@ -292,21 +297,50 @@ export function collapseGeminiSSE(body: string): CollapseResult {
name: String(fc.name ?? ""),
arguments: typeof fc.args === "string" ? (fc.args as string) : JSON.stringify(fc.args),
});
+ } else if (
+ part.inlineData &&
+ typeof (part.inlineData as Record).mimeType === "string" &&
+ ((part.inlineData as Record).mimeType as string).startsWith("audio/")
+ ) {
+ const inlineData = part.inlineData as Record;
+ if (!audioMimeType) {
+ audioMimeType = inlineData.mimeType as string;
+ }
+ if (typeof inlineData.data === "string") {
+ audioB64 += inlineData.data;
+ }
} else if (typeof part.text === "string") {
- content += part.text;
+ if (part.thought) {
+ reasoning += part.text;
+ } else {
+ content += part.text;
+ }
}
}
}
+ if (audioB64) {
+ return {
+ audioB64,
+ audioMimeType,
+ ...(droppedChunks > 0 ? { droppedChunks } : {}),
+ };
+ }
+
if (toolCalls.length > 0) {
return {
...(content ? { content } : {}),
toolCalls,
+ ...(reasoning ? { reasoning } : {}),
...(droppedChunks > 0 ? { droppedChunks } : {}),
};
}
- return { content, ...(droppedChunks > 0 ? { droppedChunks } : {}) };
+ return {
+ content,
+ ...(reasoning ? { reasoning } : {}),
+ ...(droppedChunks > 0 ? { droppedChunks } : {}),
+ };
}
// ---------------------------------------------------------------------------
@@ -453,6 +487,7 @@ export function collapseCohereSSE(body: string): CollapseResult {
if (toolCallMap.size > 0) {
const sorted = Array.from(toolCallMap.entries()).sort(([a], [b]) => a - b);
return {
+ ...(content ? { content } : {}),
toolCalls: sorted.map(([, tc]) => ({
name: tc.name,
arguments: tc.arguments,
diff --git a/src/types.ts b/src/types.ts
index 41c7546..d1086e0 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -74,7 +74,15 @@ export interface FixtureMatch {
sequenceIndex?: number;
turnIndex?: number;
hasToolResult?: boolean;
- endpoint?: "chat" | "image" | "speech" | "transcription" | "video" | "embedding";
+ endpoint?:
+ | "chat"
+ | "image"
+ | "speech"
+ | "transcription"
+ | "video"
+ | "embedding"
+ | "audio-gen"
+ | "fal-audio";
}
// Fixture response types
@@ -157,7 +165,7 @@ export interface ImageResponse {
}
export interface AudioResponse {
- audio: string;
+ audio: string | { b64Json: string; contentType?: string };
format?: string;
}
@@ -281,7 +289,15 @@ export interface FixtureFileEntry {
sequenceIndex?: number;
turnIndex?: number;
hasToolResult?: boolean;
- endpoint?: "chat" | "image" | "speech" | "transcription" | "video" | "embedding";
+ endpoint?:
+ | "chat"
+ | "image"
+ | "speech"
+ | "transcription"
+ | "video"
+ | "embedding"
+ | "audio-gen"
+ | "fal-audio";
// predicate not supported in JSON files
};
response: FixtureFileResponse;
@@ -381,7 +397,9 @@ export type RecordProviderKey =
| "bedrock"
| "azure"
| "ollama"
- | "cohere";
+ | "cohere"
+ | "elevenlabs"
+ | "fal";
export interface RecordConfig {
providers: Partial>;
diff --git a/src/ws-gemini-live.ts b/src/ws-gemini-live.ts
index 1880628..0e9daad 100644
--- a/src/ws-gemini-live.ts
+++ b/src/ws-gemini-live.ts
@@ -6,9 +6,23 @@
* messages in the Gemini Live streaming format.
*/
-import type { Fixture, ChatMessage, ChatCompletionRequest, ToolDefinition } from "./types.js";
+import type {
+ Fixture,
+ ChatMessage,
+ ChatCompletionRequest,
+ ToolDefinition,
+ AudioResponse,
+} from "./types.js";
import { matchFixture } from "./router.js";
-import { isTextResponse, isToolCallResponse, isErrorResponse } from "./helpers.js";
+import {
+ isTextResponse,
+ isToolCallResponse,
+ isContentWithToolCallsResponse,
+ isErrorResponse,
+ isAudioResponse,
+ formatToMime,
+ generateToolCallId,
+} from "./helpers.js";
import { createInterruptionSignal } from "./interruption.js";
import { delay } from "./sse-writer.js";
import { DEFAULT_TEST_ID, type Journal } from "./journal.js";
@@ -19,8 +33,10 @@ import type { WebSocketConnection } from "./ws-framing.js";
interface GeminiLivePart {
text?: string;
+ thought?: boolean;
functionCall?: { name: string; args: Record };
functionResponse?: { name: string; response: unknown; id?: string };
+ inlineData?: { mimeType: string; data: string };
}
interface GeminiLiveTurn {
@@ -89,7 +105,9 @@ function geminiTurnsToMessages(turns: GeminiLiveTurn[]): ChatMessage[] {
if (role === "user") {
const funcResponses = turn.parts.filter((p) => p.functionResponse);
- const textParts = turn.parts.filter((p) => p.text !== undefined);
+ // inlineData parts (e.g. client audio input) are silently skipped —
+ // only text and functionResponse parts are relevant for fixture matching.
+ const textParts = turn.parts.filter((p) => p.text !== undefined && !p.thought);
if (funcResponses.length > 0) {
for (let i = 0; i < funcResponses.length; i++) {
@@ -113,7 +131,7 @@ function geminiTurnsToMessages(turns: GeminiLiveTurn[]): ChatMessage[] {
}
} else if (role === "model") {
const funcCalls = turn.parts.filter((p) => p.functionCall);
- const textParts = turn.parts.filter((p) => p.text !== undefined);
+ const textParts = turn.parts.filter((p) => p.text !== undefined && !p.thought);
if (funcCalls.length > 0) {
messages.push({
@@ -327,6 +345,13 @@ async function processMessage(
if (!fixture) {
if (defaults.strict) {
defaults.logger.warn(`STRICT: No fixture matched for WebSocket message`);
+ journal.add({
+ method: "WS",
+ path,
+ headers: {},
+ body: completionReq,
+ response: { status: 404, fixture: null },
+ });
ws.close(1008, "Strict mode: no fixture matched");
return;
}
@@ -370,6 +395,184 @@ async function processMessage(
return;
}
+ // Audio response — single frame with inlineData and turnComplete: true
+ if (isAudioResponse(response)) {
+ journal.add({
+ method: "WS",
+ path,
+ headers: {},
+ body: completionReq,
+ response: { status: 200, fixture },
+ });
+
+ const audioResp = response as AudioResponse;
+ let mimeType: string;
+ let data: string;
+
+ if (typeof audioResp.audio === "string") {
+ mimeType = formatToMime(audioResp.format ?? "mp3");
+ data = audioResp.audio;
+ } else {
+ mimeType = audioResp.audio.contentType ?? "audio/mpeg";
+ data = audioResp.audio.b64Json;
+ }
+
+ ws.send(
+ JSON.stringify({
+ serverContent: {
+ modelTurn: {
+ parts: [{ inlineData: { mimeType, data } }],
+ },
+ turnComplete: true,
+ },
+ }),
+ );
+
+ session.conversationHistory.push({
+ role: "assistant",
+ content: "[audio]",
+ });
+ return;
+ }
+
+ // Content + tool calls response (must be checked before isTextResponse / isToolCallResponse)
+ if (isContentWithToolCallsResponse(response)) {
+ const journalEntry = journal.add({
+ method: "WS",
+ path,
+ headers: {},
+ body: completionReq,
+ response: { status: 200, fixture },
+ });
+
+ const content = response.content;
+ const chunkList: string[] = [];
+ for (let i = 0; i < content.length; i += chunkSize) {
+ chunkList.push(content.slice(i, i + chunkSize));
+ }
+
+ const interruption = createInterruptionSignal(fixture);
+ let interrupted = false;
+
+ // Stream text content chunks
+ if (content.length === 0) {
+ if (!ws.isClosed) {
+ ws.send(
+ JSON.stringify({
+ serverContent: {
+ modelTurn: { parts: [{ text: "" }] },
+ turnComplete: false,
+ },
+ }),
+ );
+ }
+ } else {
+ for (let i = 0; i < chunkList.length; i++) {
+ if (ws.isClosed) break;
+ if (latency > 0) await delay(latency, interruption?.signal);
+ if (interruption?.signal.aborted) {
+ interrupted = true;
+ break;
+ }
+ if (ws.isClosed) break;
+
+ ws.send(
+ JSON.stringify({
+ serverContent: {
+ modelTurn: { parts: [{ text: chunkList[i] }] },
+ turnComplete: false,
+ },
+ }),
+ );
+ interruption?.tick();
+ if (interruption?.signal.aborted) {
+ interrupted = true;
+ break;
+ }
+ }
+ }
+
+ if (interrupted) {
+ ws.destroy();
+ journalEntry.response.interrupted = true;
+ journalEntry.response.interruptReason = interruption?.reason();
+ interruption?.cleanup();
+ return;
+ }
+
+ // Pre-compute tool calls with stable IDs so wire message and history match
+ const resolvedToolCalls = response.toolCalls.map((tc) => ({
+ ...tc,
+ resolvedId: tc.id ?? generateToolCallId(),
+ }));
+
+ // Send tool calls
+ if (!ws.isClosed) {
+ if (latency > 0) await delay(latency, interruption?.signal);
+ if (interruption?.signal.aborted) {
+ ws.destroy();
+ journalEntry.response.interrupted = true;
+ journalEntry.response.interruptReason = interruption?.reason();
+ interruption?.cleanup();
+ return;
+ }
+
+ const functionCalls = resolvedToolCalls.map((tc) => {
+ let argsObj: Record;
+ try {
+ argsObj = JSON.parse(tc.arguments || "{}") as Record;
+ } catch {
+ defaults.logger.warn(
+ `Malformed JSON in fixture tool call arguments for "${tc.name}": ${tc.arguments}`,
+ );
+ argsObj = {};
+ }
+ return {
+ name: tc.name,
+ args: argsObj,
+ id: tc.resolvedId,
+ };
+ });
+
+ ws.send(JSON.stringify({ toolCall: { functionCalls } }));
+ interruption?.tick();
+ }
+
+ if (interruption?.signal.aborted) {
+ ws.destroy();
+ journalEntry.response.interrupted = true;
+ journalEntry.response.interruptReason = interruption?.reason();
+ interruption?.cleanup();
+ return;
+ }
+
+ interruption?.cleanup();
+
+ // Send turnComplete
+ if (!ws.isClosed) {
+ ws.send(
+ JSON.stringify({
+ serverContent: { turnComplete: true },
+ }),
+ );
+ }
+
+ // Add to conversation history using the same resolved IDs from the wire message
+ session.conversationHistory.push({
+ role: "assistant",
+ content: content || null,
+ tool_calls: resolvedToolCalls.map((tc) => ({
+ id: tc.resolvedId,
+ type: "function" as const,
+ function: {
+ name: tc.name,
+ arguments: tc.arguments,
+ },
+ })),
+ });
+ return;
+ }
+
// Text response — stream chunks with serverContent
if (isTextResponse(response)) {
const journalEntry = journal.add({