Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -1033,8 +1033,7 @@ with `MyConfig.model_validate(...)` or use the typed object directly.
class ClientConfig(BaseModel):
client_idx: int = 0
client_type: ClientType = "openai_chat_completions"
preserve_all_thinking: bool = False
preserve_thinking_between_tool_calls: bool = False
renderer_config: RendererConfig | None = None
api_key_var: str = "PRIME_API_KEY"
api_base_url: str = "https://api.pinference.ai/api/v1"
endpoint_configs: list[EndpointClientConfig] = []
Expand All @@ -1051,7 +1050,7 @@ class ClientConfig(BaseModel):

`client_type` selects which `Client` implementation to instantiate (see [Client Classes](#client-classes)). Use `endpoint_configs` for multi-endpoint round-robin. In grouped scoring mode, groups are distributed round-robin across endpoint configs.

`preserve_all_thinking` and `preserve_thinking_between_tool_calls` are forwarded to the underlying renderer when `client_type == "renderer"`. They control whether past-assistant `reasoning_content` is re-emitted on subsequent renders — `preserve_all_thinking` keeps every past-assistant turn's thinking, and `preserve_thinking_between_tool_calls` keeps thinking only inside the in-flight assistant→tool→…→assistant block after the most recent user turn (when that block contains at least one tool response). Both default to `False` (template default applies).
`renderer_config` is a typed `renderers.RendererConfig` that configures the renderer pool when `client_type == "renderer"`; `None` is treated as `AutoRendererConfig()`. Its shared `thinking_retention` field controls whether past-assistant `reasoning_content` is re-emitted on subsequent renders. `"template"` (the default) defers to the chat template. `"tool_cycle"` additionally keeps thinking inside the in-flight assistant→tool→…→assistant block that follows the most recent user turn, provided that block contains at least one tool response. `"all"` keeps the thinking from every past-assistant turn.

When `api_key_var` is `"PRIME_API_KEY"` (the default), credentials are loaded with the following precedence:
- **API key**: `PRIME_API_KEY` env var > `~/.prime/config.json` > `"EMPTY"`
Expand Down
10 changes: 5 additions & 5 deletions tests/test_renderer_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def test_renderer_client_threads_chat_template_kwargs_into_pool():

bases = [
Qwen3RendererConfig(enable_thinking=True),
AutoRendererConfig(preserve_all_thinking=True),
AutoRendererConfig(thinking_retention="all"),
None,
]
for base in bases:
Expand Down Expand Up @@ -134,16 +134,16 @@ async def _fake_generate(**kwargs):
)
)

expected_preserve_all = (
base.preserve_all_thinking
expected_retention = (
base.thinking_retention
if isinstance(base, AutoRendererConfig)
else False
else "template"
)
create_pool_mock.assert_called_once_with(
"Qwen/Qwen3-8B",
Qwen3RendererConfig(
enable_thinking=False,
preserve_all_thinking=expected_preserve_all,
thinking_retention=expected_retention,
),
size=1,
)
Expand Down
11 changes: 4 additions & 7 deletions verifiers/clients/renderer_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,8 +392,8 @@ def _resolve_renderer_config(
inside ``renderers.create_renderer``), we pull resolution forward via
``MODEL_RENDERER_MAP`` so kwargs land on the concrete config variant
and pydantic validates them against the actual renderer's schema —
``AutoRendererConfig`` intentionally carries only ``preserve_*`` and
would reject template kwargs like ``enable_thinking``. ``renderer_model``
``AutoRendererConfig`` intentionally carries only ``thinking_retention``
and would reject template kwargs like ``enable_thinking``. ``renderer_model``
must match what the pool will tokenize with (i.e.
``ClientConfig.renderer_model_name`` when set, else the request model),
so resolution agrees with the tokenizer the renderer will hold.
Expand All @@ -408,7 +408,7 @@ def _resolve_renderer_config(

# Resolve auto → concrete (mirrors ``renderers._resolve_auto``) so
# ``enable_thinking`` etc. validate against the right schema instead of
# ``AutoRendererConfig``'s minimal one. Carries ``preserve_*`` across.
# ``AutoRendererConfig``'s minimal one. Carries ``thinking_retention`` across.
if base is None or isinstance(base, AutoRendererConfig):
renderer_name = MODEL_RENDERER_MAP.get(renderer_model, "default")
# ``config_from_name`` returns ``None`` only for ``"auto"``, which
Expand All @@ -417,10 +417,7 @@ def _resolve_renderer_config(
assert concrete is not None
if isinstance(base, AutoRendererConfig):
concrete = concrete.model_copy(
update={
"preserve_all_thinking": base.preserve_all_thinking,
"preserve_thinking_between_tool_calls": base.preserve_thinking_between_tool_calls,
}
update={"thinking_retention": base.thinking_retention}
)
base = cast(RendererConfig, concrete)

Expand Down
Loading