Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions src/claude_agent_sdk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,12 +360,27 @@ def _build_schema(tool_def: SdkMcpTool[Any]) -> dict[str, Any]:
return _typeddict_to_json_schema(tool_def.input_schema)
return {"type": "object", "properties": {}}

def _build_meta(tool_def: "SdkMcpTool[Any]") -> dict[str, Any] | None:
# The MCP SDK's Zod schema strips unknown annotation fields, so
# Anthropic-specific hints use _meta with namespaced keys instead.
# maxResultSizeChars controls the CLI's layer-2 tool-result spill
# threshold (toolResultStorage.ts maybePersistLargeToolResult).
if tool_def.annotations is None:
return None
max_size = getattr(tool_def.annotations, "maxResultSizeChars", None)
if max_size is None:
return None
return {"anthropic/maxResultSizeChars": max_size}

# NOTE(review): this is a rendered diff hunk with the +/- markers lost, so the
# pre-change and post-change constructions appear back to back.
cached_tool_list = [
# Pre-change side of the hunk (the 5 deleted lines): keyword construction.
Tool(
name=tool_def.name,
description=tool_def.description,
inputSchema=_build_schema(tool_def),
annotations=tool_def.annotations,
# Post-change side of the hunk (added lines): build each Tool through
# Tool.model_validate from a dict that also carries "_meta", taken from
# _build_meta(tool_def).
Tool.model_validate(
{
"name": tool_def.name,
"description": tool_def.description,
"inputSchema": _build_schema(tool_def),
"annotations": tool_def.annotations,
"_meta": _build_meta(tool_def),
}
)
for tool_def in tools
]
Expand Down
2 changes: 2 additions & 0 deletions src/claude_agent_sdk/_internal/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,8 @@ async def _handle_sdk_mcp_request(
tool_data["annotations"] = tool.annotations.model_dump(
exclude_none=True
)
if tool.meta:
tool_data["_meta"] = tool.meta
tools_data.append(tool_data)
return {
"jsonrpc": "2.0",
Expand Down
335 changes: 335 additions & 0 deletions tests/test_mcp_large_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,335 @@
"""Tests for large-MCP-result spill behavior and env-var passthrough.

Root cause (confirmed via claude-cli-internal, 2026-03-27):
Two independent spill layers in the bundled CLI:

Layer 1 — MCP-specific (mcpValidation.ts)
Threshold: MAX_MCP_OUTPUT_TOKENS env var, default 25 000 tokens.
Setting MAX_MCP_OUTPUT_TOKENS=500000 bypasses this layer.
Output on spill: plain "Error: result exceeds maximum allowed tokens…"

Layer 2 — generic tool-result (toolResultStorage.ts maybePersistLargeToolResult)
Threshold: DEFAULT_MAX_RESULT_SIZE_CHARS = 50 000 chars, hardcoded in
toolLimits.ts. No env var reads this constant. MCPTool declares
maxResultSizeChars: 100_000 but getPersistenceThreshold clamps it to
Math.min(100_000, 50_000) = 50 K.
Output on spill: <persisted-output> tag + 2 KB preview — exactly what
customers observe.

Regression timeline:
PR #13609 (2026-01-06) removed the feature gate → layer 2 always-on for SDK builds.
Before this gate removal, SDK 0.1.17 / CLI 2.0.18 was unaffected.
PR #19224 (2026-02-21) lowered the external-build clamp from 100 K → 50 K chars.

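Customer's MAX_MCP_OUTPUT_TOKENS=500000 bypasses layer 1 successfully; the 73 K result
then hits layer 2's 50 K char wall and produces <persisted-output>. There is currently
no env var to raise the layer-2 threshold; the route this change adds is the per-tool
maxResultSizeChars annotation, which the SDK forwards in the tools/list response as the
_meta key anthropic/maxResultSizeChars and which relies on the CLI reading that key.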

These tests confirm:
1. MAX_MCP_OUTPUT_TOKENS (layer-1 threshold) passes through to the CLI subprocess.
2. os.environ values are inherited; options.env overrides them.
3. Raising MAX_MCP_OUTPUT_TOKENS alone is NOT sufficient for >50 K results because
layer 2 is still in the path (documented via INLINE_CONTENT size boundary tests).
4. The SDK message parser surfaces <persisted-output> content unchanged so callers
can detect the degraded path and warn users.
"""

import os
from unittest.mock import AsyncMock, MagicMock, patch

import anyio

from claude_agent_sdk._internal.message_parser import parse_message
from claude_agent_sdk._internal.transport.subprocess_cli import SubprocessCLITransport
from claude_agent_sdk.types import ClaudeAgentOptions, ToolResultBlock, UserMessage

# CLI path handed to ClaudeAgentOptions.cli_path by make_transport().
DEFAULT_CLI_PATH = "/usr/bin/claude"

# Layer-2 threshold as confirmed in claude-cli-internal toolLimits.ts
# (DEFAULT_MAX_RESULT_SIZE_CHARS), measured in characters.
_LAYER2_THRESHOLD_CHARS = 50_000


def make_transport(env: dict | None = None, **kwargs) -> SubprocessCLITransport:
    """Create a ``SubprocessCLITransport`` pointed at ``DEFAULT_CLI_PATH``.

    Args:
        env: Entries for ``ClaudeAgentOptions.env``. A falsy value (``None``
            or an empty dict) is replaced by a fresh empty dict.
        **kwargs: Forwarded unchanged to ``ClaudeAgentOptions``.

    Returns:
        A transport constructed with the prompt ``"test"``.
    """
    effective_env = env if env else {}
    options = ClaudeAgentOptions(
        cli_path=DEFAULT_CLI_PATH, env=effective_env, **kwargs
    )
    return SubprocessCLITransport(prompt="test", options=options)


# ---------------------------------------------------------------------------
# Helpers to capture the env dict passed to anyio.open_process
# ---------------------------------------------------------------------------


def _capture_env(transport: SubprocessCLITransport) -> dict[str, str]:
    """Connect *transport* against a mocked process and return the spawn env.

    ``anyio.open_process`` (as referenced from the transport module) and
    ``SubprocessCLITransport._check_claude_version`` are both replaced with
    ``AsyncMock`` objects for the duration of ``connect()``. The ``env``
    keyword passed to the mocked ``open_process`` call is copied out.

    Returns:
        A copy of the ``env`` keyword argument from the ``open_process``
        call, or an empty dict if that keyword was absent.
    """
    spawn_env: dict[str, str] = {}

    async def _connect_and_record():
        # Mock keyword arguments become attributes on the created mock.
        fake_process = MagicMock(
            stdin=MagicMock(),
            stdout=MagicMock(),
            stderr=None,
            returncode=None,
        )
        open_process_patch = patch(
            "claude_agent_sdk._internal.transport.subprocess_cli.anyio.open_process",
            new_callable=AsyncMock,
            return_value=fake_process,
        )
        version_check_patch = patch(
            "claude_agent_sdk._internal.transport.subprocess_cli.SubprocessCLITransport._check_claude_version",
            new_callable=AsyncMock,
        )
        with open_process_patch as open_process_mock, version_check_patch:
            await transport.connect()
            _positional, call_kwargs = open_process_mock.call_args
            spawn_env.update(call_kwargs.get("env", {}))

    anyio.run(_connect_and_record)
    return spawn_env


# ---------------------------------------------------------------------------
# 1. MAX_MCP_OUTPUT_TOKENS (layer-1) passthrough
# ---------------------------------------------------------------------------


class TestLayer1EnvPassthrough:
    """MAX_MCP_OUTPUT_TOKENS (the layer-1 knob) must reach the subprocess env as given."""

    def test_max_mcp_output_tokens_reaches_subprocess(self):
        """A value supplied through options.env shows up in the subprocess env.

        This variable governs layer 1 only (mcpValidation.ts, ~25K token
        default). A 73K-char result gets past layer 1 with it set, but still
        runs into layer 2's 50K char limit — see TestLayer2Boundary.
        """
        subprocess_env = _capture_env(
            make_transport(env={"MAX_MCP_OUTPUT_TOKENS": "500000"})
        )
        assert "MAX_MCP_OUTPUT_TOKENS" in subprocess_env, (
            "MAX_MCP_OUTPUT_TOKENS was not passed to the CLI subprocess. "
            "Layer 1 will use its default (~25K tokens) and spill to plain error text."
        )
        assert subprocess_env["MAX_MCP_OUTPUT_TOKENS"] == "500000"

    def test_default_absent_when_not_set(self):
        """With the variable unset everywhere, the SDK must not inject one."""
        # Run against a copy of the ambient environment minus the variable.
        scrubbed_environ = dict(os.environ)
        scrubbed_environ.pop("MAX_MCP_OUTPUT_TOKENS", None)
        with patch.dict(os.environ, scrubbed_environ, clear=True):
            subprocess_env = _capture_env(make_transport(env={}))
        assert "MAX_MCP_OUTPUT_TOKENS" not in subprocess_env

    def test_arbitrary_threshold_values_pass_through(self):
        """Small, default-sized and very large values are all forwarded verbatim."""
        for token_limit in ("1", "25000", "1000000"):
            subprocess_env = _capture_env(
                make_transport(env={"MAX_MCP_OUTPUT_TOKENS": token_limit})
            )
            assert subprocess_env.get("MAX_MCP_OUTPUT_TOKENS") == token_limit


# ---------------------------------------------------------------------------
# 2. os.environ inheritance and options.env precedence
# ---------------------------------------------------------------------------


class TestEnvInheritanceAndPrecedence:
    """How os.environ, options.env and SDK-managed variables combine in the subprocess env."""

    def test_inherited_from_os_environ(self):
        """A MAX_MCP_OUTPUT_TOKENS value present in os.environ is inherited."""
        with patch.dict(os.environ, {"MAX_MCP_OUTPUT_TOKENS": "200000"}):
            subprocess_env = _capture_env(make_transport(env={}))
        assert subprocess_env.get("MAX_MCP_OUTPUT_TOKENS") == "200000"

    def test_options_env_overrides_os_environ(self):
        """When both define the variable, the options.env value is the one used."""
        with patch.dict(os.environ, {"MAX_MCP_OUTPUT_TOKENS": "1000"}):
            subprocess_env = _capture_env(
                make_transport(env={"MAX_MCP_OUTPUT_TOKENS": "500000"})
            )
        assert subprocess_env.get("MAX_MCP_OUTPUT_TOKENS") == "500000"

    def test_claudecode_stripped(self):
        """CLAUDECODE is dropped (so spawned subprocesses don't detect a parent CC)
        while unrelated variables survive."""
        with patch.dict(os.environ, {"CLAUDECODE": "1", "OTHER_VAR": "kept"}):
            subprocess_env = _capture_env(make_transport(env={}))
        assert "CLAUDECODE" not in subprocess_env
        assert subprocess_env.get("OTHER_VAR") == "kept"

    def test_sdk_managed_vars_always_set(self):
        """The entrypoint marker and SDK version are present even with an empty options.env."""
        subprocess_env = _capture_env(make_transport(env={}))
        assert subprocess_env.get("CLAUDE_CODE_ENTRYPOINT") == "sdk-py"
        assert "CLAUDE_AGENT_SDK_VERSION" in subprocess_env

    def test_options_env_cannot_override_sdk_version(self):
        """A caller-supplied CLAUDE_AGENT_SDK_VERSION is replaced by the real version."""
        from claude_agent_sdk._version import __version__

        subprocess_env = _capture_env(
            make_transport(env={"CLAUDE_AGENT_SDK_VERSION": "0.0.0"})
        )
        assert subprocess_env.get("CLAUDE_AGENT_SDK_VERSION") == __version__


# ---------------------------------------------------------------------------
# 3. Layer-2 threshold boundary (documents the unresolved gap)
# ---------------------------------------------------------------------------


class TestLayer2Boundary:
    """Layer 2 (toolResultStorage.ts maybePersistLargeToolResult) spills any result
    exceeding 50 000 chars regardless of MAX_MCP_OUTPUT_TOKENS. No env var raises
    this threshold; the alternative is the per-tool ``maxResultSizeChars``
    annotation described in test_no_layer2_env_var_exists.

    These tests document that behavior by checking the content size boundary.

    NOTE(review): the first two tests only compare constants defined in this
    module; they record the threshold but do not exercise the SDK or the CLI.
    """

    def test_content_under_50k_can_be_inline(self):
        """A result just below 50K chars is eligible to be passed inline by the CLI.
        This verifies our understanding of the threshold constant."""
        content = "x" * (_LAYER2_THRESHOLD_CHARS - 1)
        assert len(content) < _LAYER2_THRESHOLD_CHARS

    def test_customer_reproducer_exceeds_layer2_threshold(self):
        """The customer's ~73K-char result exceeds the 50K layer-2 threshold.

        MAX_MCP_OUTPUT_TOKENS=500000 bypasses layer 1 for this result, but it
        then hits layer 2 and produces <persisted-output>. This is the bug.
        """
        customer_content_size = 73_000  # chars in customer's reproducer
        assert customer_content_size > _LAYER2_THRESHOLD_CHARS, (
            f"Customer's {customer_content_size}-char result exceeds the "
            f"{_LAYER2_THRESHOLD_CHARS}-char layer-2 threshold and will be spilled "
            "to a temp file even when MAX_MCP_OUTPUT_TOKENS is raised."
        )

    def test_no_layer2_env_var_exists(self):
        """Confirm the SDK adds no env var aimed at the layer-2 threshold.

        The fix (Option 3) uses tool annotations instead of an env var:
            ToolAnnotations(maxResultSizeChars=500_000)
        The CLI reads this from the tools/list JSONRPC response and skips the
        Math.min clamp in getPersistenceThreshold for that tool.

        See test_max_result_size_chars_annotation_flows_to_cli in
        test_sdk_mcp_integration.py for SDK-side confirmation.
        """
        probed_vars = ("MAX_TOOL_RESULT_CHARS", "DISABLE_TOOL_RESULT_PERSISTENCE")
        # os.environ is inherited by the subprocess env, so remove the probed
        # names from the ambient environment first; otherwise a shell that
        # happens to export one of them would fail this test for the wrong reason.
        ambient = {
            key: value for key, value in os.environ.items() if key not in probed_vars
        }
        with patch.dict(os.environ, ambient, clear=True):
            transport = make_transport(env={"MAX_MCP_OUTPUT_TOKENS": "500000"})
            env = _capture_env(transport)
        for name in probed_vars:
            assert name not in env


# ---------------------------------------------------------------------------
# 4. Message parser: inline vs persisted-output tool results
# ---------------------------------------------------------------------------


def _user_message_with_tool_result(content: str, is_error: bool = False) -> dict:
    """Build a raw ``user`` message dict holding a single ``tool_result`` block.

    Args:
        content: Value placed in the tool-result block's ``content`` field.
        is_error: Value placed in the block's ``is_error`` field.

    Returns:
        A dict with ``type`` ``"user"``, a ``message`` whose ``content`` list
        holds the one tool-result block, and fixed ``parent_tool_use_id``,
        ``tool_use_result`` and ``uuid`` values.
    """
    tool_result_block = {
        "type": "tool_result",
        "tool_use_id": "toolu_01ABC",
        "content": content,
        "is_error": is_error,
    }
    inner_message = {"role": "user", "content": [tool_result_block]}
    return {
        "type": "user",
        "message": inner_message,
        "parent_tool_use_id": None,
        "tool_use_result": None,
        "uuid": "test-uuid-1234",
    }


# 1 000 chars of filler: below the layer-2 threshold, so a result of this
# size would be passed inline by the CLI.
INLINE_CONTENT = "x" * 1000

# What the CLI emits after layer-2 spill: <persisted-output> tag + 2 KB preview.
# Source: toolResultStorage.ts, PREVIEW_SIZE_BYTES = 2000.
# The "73.0KB" size and the /tmp path below are fixed sample values for these tests.
PERSISTED_CONTENT = (
"<persisted-output>\n"
"Output too large (73.0KB). Full output saved to: /tmp/.claude/tool-results/abc123.txt\n"
"\nPreview (first 2KB):\n" + "x" * 2000 + "\n...\n</persisted-output>"
)


class TestToolResultParsing:
def test_inline_content_preserved(self):
    """Inline tool-result content comes out of the parser unchanged."""
    parsed = parse_message(_user_message_with_tool_result(INLINE_CONTENT))
    assert isinstance(parsed, UserMessage)
    tool_results = [
        item for item in parsed.content if isinstance(item, ToolResultBlock)
    ]
    assert len(tool_results) == 1
    only_result = tool_results[0]
    assert only_result.content == INLINE_CONTENT
    assert not str(only_result.content).startswith("<persisted-output>")

def test_persisted_output_detectable_by_prefix(self):
    """Spilled content still starts with '<persisted-output>' after parsing,
    giving callers a marker they can use to warn users or raise an error."""
    parsed = parse_message(_user_message_with_tool_result(PERSISTED_CONTENT))
    assert isinstance(parsed, UserMessage)
    tool_results = [
        item for item in parsed.content if isinstance(item, ToolResultBlock)
    ]
    assert len(tool_results) == 1
    rendered = str(tool_results[0].content)
    assert rendered.startswith("<persisted-output>"), (
        f"Expected persisted-output wrapper, got: {rendered[:100]!r}"
    )

def test_persisted_output_is_not_full_content(self):
"""Claude receives only the 2 KB preview, not the original large content."""
msg = parse_message(_user_message_with_tool_result(PERSISTED_CONTENT))
assert isinstance(msg, UserMessage)
blocks = [b for b in msg.content if isinstance(b, ToolResultBlock)]
Comment on lines +287 to +289
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 In test_persisted_output_is_not_full_content, the large_content variable is dead code — it is defined but never used as input to any function. The assertion blocks[0].content != large_content is trivially true because it compares two completely unrelated hardcoded strings, so the test does not actually verify what its docstring claims ("Claude receives only the 2 KB preview, not the original large content").

Extended reasoning...

What the bug is

In test_persisted_output_is_not_full_content (line 288), the variable large_content is assigned "THE ANSWER IS: 42\n" + "padding\n" * 5000 but is never used as input to any operation. The test then parses PERSISTED_CONTENT (a module-level constant simulating CLI spill output) and asserts blocks[0].content != large_content.

Why the assertion is trivially true

PERSISTED_CONTENT is a hardcoded string starting with <persisted-output>\n... containing a 2KB preview of "x" * 2000. large_content is "THE ANSWER IS: 42\n" + "padding\n" * 5000. These two strings share no common prefix and will never be equal regardless of any code behavior. The != assertion will always pass.

Step-by-step proof

  1. large_content = "THE ANSWER IS: 42\n" + "padding\n" * 5000 — a ~40KB string starting with "THE ANSWER IS: 42"
  2. msg = parse_message(_user_message_with_tool_result(PERSISTED_CONTENT)) — parses the hardcoded PERSISTED_CONTENT, not large_content
  3. blocks[0].content will equal PERSISTED_CONTENT, which starts with <persisted-output>
  4. blocks[0].content != large_content compares PERSISTED_CONTENT to large_content — trivially True
  5. The test passes but proves nothing about the system under test

What the test should verify

The docstring says "Claude receives only the 2 KB preview, not the original large content." To meaningfully test this, large_content should be used as the input to _user_message_with_tool_result(), and then the assertion should verify the output differs from the input (or check that the output is the truncated preview). As written, large_content is dead code — a linter like ruff would flag it as F841.

Impact

This is a test-only issue with no production impact. The test provides a false sense of coverage for the persisted-output behavior. If the message parser were changed to, say, strip <persisted-output> tags or transform content in an unexpected way, this test would not catch the regression it claims to guard against.

Comment on lines +287 to +289
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 In test_persisted_output_is_not_full_content, large_content is assigned but never used as input to the test. The assertion blocks[0].content != large_content is trivially true since it compares two completely unrelated hardcoded strings (PERSISTED_CONTENT vs large_content). Remove the dead variable or feed large_content through the test fixture so the assertion is meaningful.

Extended reasoning...

The test test_persisted_output_is_not_full_content (line 288) defines large_content = "THE ANSWER IS: 42\n" + "padding\n" * 5000 but then parses PERSISTED_CONTENT (a module-level constant) via _user_message_with_tool_result(PERSISTED_CONTENT). The assertion on line 294 checks blocks[0].content != large_content, which compares the parsed PERSISTED_CONTENT string against the locally-defined large_content string.

These two strings are completely unrelated — PERSISTED_CONTENT is a <persisted-output> XML wrapper with a 2KB preview, while large_content is "THE ANSWER IS: 42\n" followed by 5000 lines of "padding\n". The != assertion will always pass regardless of any code changes, making this a no-op test.

The test docstring claims to verify "Claude receives only the 2 KB preview, not the original large content," but the test never connects large_content to the simulated tool result. For the test to be meaningful, large_content would need to be the original content that gets spilled, and the test would need to verify that the parsed result differs from that original. As written, a linter (ruff F841) would flag large_content as an unused local variable.

The impact is low since this is a test-only issue with no production consequences, but the test provides false confidence — it appears to verify spill behavior but actually verifies nothing.

To fix: either remove the dead large_content variable and rewrite the assertion to check something meaningful (e.g., that the content length is approximately 2KB, matching the preview size), or restructure the test so large_content is actually used as the pre-spill input.

content = str(blocks[0].content)
assert len(content) < _LAYER2_THRESHOLD_CHARS, (
f"Expected preview under {_LAYER2_THRESHOLD_CHARS} chars, got {len(content)}"
)

def test_error_tool_result_flagged(self):
    """A tool result built with is_error=True is parsed with is_error True."""
    raw_message = _user_message_with_tool_result("tool failed", is_error=True)
    parsed = parse_message(raw_message)
    assert isinstance(parsed, UserMessage)
    tool_results = [
        item for item in parsed.content if isinstance(item, ToolResultBlock)
    ]
    assert tool_results[0].is_error is True

def test_normal_tool_result_not_flagged(self):
    """A tool result built with is_error=False is parsed with is_error False."""
    raw_message = _user_message_with_tool_result(INLINE_CONTENT, is_error=False)
    parsed = parse_message(raw_message)
    assert isinstance(parsed, UserMessage)
    tool_results = [
        item for item in parsed.content if isinstance(item, ToolResultBlock)
    ]
    assert tool_results[0].is_error is False


# ---------------------------------------------------------------------------
# Utility: recommended caller pattern for detecting the degraded path
# ---------------------------------------------------------------------------


def is_persisted_output(block: ToolResultBlock) -> bool:
    """Report whether a tool result carries the CLI's layer-2 spill marker.

    Args:
        block: Tool-result block whose ``content`` attribute is inspected.

    Returns:
        ``True`` only when ``block.content`` is a string that starts with
        ``"<persisted-output>"``; ``False`` for any non-string content.
    """
    payload = block.content
    if not isinstance(payload, str):
        return False
    return payload.startswith("<persisted-output>")


class TestPersistedOutputDetectionHelper:
    """Checks is_persisted_output() against parsed spilled and inline results."""

    def test_helper_detects_persisted(self):
        """The helper returns a truthy value for a parsed spilled result."""
        parsed = parse_message(_user_message_with_tool_result(PERSISTED_CONTENT))
        assert isinstance(parsed, UserMessage)
        tool_results = [
            item for item in parsed.content if isinstance(item, ToolResultBlock)
        ]
        assert is_persisted_output(tool_results[0])

    def test_helper_passes_inline(self):
        """The helper returns a falsy value for a parsed inline result."""
        parsed = parse_message(_user_message_with_tool_result(INLINE_CONTENT))
        assert isinstance(parsed, UserMessage)
        tool_results = [
            item for item in parsed.content if isinstance(item, ToolResultBlock)
        ]
        assert not is_persisted_output(tool_results[0])
Loading
Loading