Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ dependencies = [
"openai>=1.108.1",
"tiktoken",
"jinja2",
# HuggingFace's Rust BPE library. ``_get_offset_tokenizer`` uses
# ``tokenizers.Tokenizer.from_pretrained`` for offset-aware encoding
# (body/scaffold attribution on the render path) — keeps the heavy
# ``transformers`` framework off the offset path for most models.
"tokenizers>=0.20",
"transformers>=4.50.0",
# Used by GptOssRenderer to render and parse harmony tokens. Vendoring
# OpenAI's reference implementation keeps us byte-identical with vLLM
Expand Down
209 changes: 133 additions & 76 deletions renderers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1635,69 +1635,143 @@ def trim_to_turn_close(
return previous_ids


# Per-model offset-aware tokenizer cache. ``attribute_text_segments``
# uses the fast HuggingFace tokenizer's ``offset_mapping`` to attribute
# each token to its source text segment under one BPE pass. Fastokens
# (the Rust BPE we patch in by default for ~10x faster encode) does not
# track character offsets — the patched tokenizer's
# ``return_offsets_mapping=True`` raises ``NotImplementedError``. So we
# keep a parallel vanilla tokenizer per model purely for offset queries.
# Memory cost is one extra tokenizer per *unique* model name across all
# pools / renderers (the cache is process-global), independent of pool
# size.
_offset_tokenizers: dict[str, Any] = {}
# Per-model offset-aware ``tokenizers.Tokenizer`` cache. Renderers whose
# ``emit_text_segments`` block has mixed ``is_content`` labels (and
# minimax_m2's ``emit_token_overlap_body``) need character offsets to
# attribute each joined-encode token back to its source segment. We use
# the ``tokenizers`` package directly — its native ``Encoding.offsets``
# is exactly what we want, and ``transformers`` is needed only by the
# ``AutoTokenizer`` fallback in ``_get_offset_tokenizer``.
# ``tokenizers.Tokenizer.from_pretrained`` just downloads
# ``tokenizer.json`` via ``huggingface_hub``; no model config build,
# no remote-code execution. Fastokens (the Rust BPE that
# ``load_tokenizer`` patches in by default) substitutes a non-offset
# backend into ``PreTrainedTokenizerFast``, so when we detect a
# fastokens-patched (or other non-offset) backend we fall back to
# loading a separate vanilla ``tokenizers.Tokenizer`` keyed by
# ``name_or_path``. Cache memory cost is one extra tokenizer per
# *unique* model name across all pools / renderers (process-global),
# independent of pool size.
_offset_tokenizers: "dict[str, Any]" = {}
_offset_tokenizers_lock = threading.Lock()


def _get_offset_tokenizer(tokenizer):
"""Return a tokenizer that supports ``return_offsets_mapping=True``.
_OFFSET_PROBE_TEXT = "Hello, world.\n\n# Test"
"""Probe string used to verify a loaded ``tokenizers.Tokenizer`` matches
the user's tokenizer. Spans the ``.\\n\\n`` boundary because some models
(MiniMax-M2.5's ``GPT2Tokenizer`` wrapper) ship a ``tokenizer.json``
whose pre_tokenizer disagrees with what ``transformers.AutoTokenizer``
applies at construction — that's where they diverge."""


If ``tokenizer`` itself supports offsets, returns it unchanged.
Otherwise loads a vanilla (non-fastokens) tokenizer from
``tokenizer.name_or_path`` and caches it. Raises if the tokenizer
has no usable ``name_or_path`` — hand-coded renderers always pass
a tokenizer loaded via ``load_tokenizer`` which does set it.
def _get_offset_tokenizer(tokenizer):
"""Return a ``tokenizers.Tokenizer`` for offset-aware encoding.

Resolution order:

1. If ``tokenizer`` is already a ``tokenizers.Tokenizer``, return
it as-is (BYO offset-capable tokenizer — no extra load).
2. If ``tokenizer.backend_tokenizer`` is a vanilla
``tokenizers.Tokenizer`` (vanilla ``PreTrainedTokenizerFast``,
not fastokens-patched), use it directly — no extra load.
3. Load via ``tokenizers.Tokenizer.from_pretrained(name_or_path)``
and verify it encodes a probe string to the same ids as the
user's tokenizer. If they match, cache and use it. Pinned
``TRUSTED_REVISIONS`` are honoured.
4. If the bare load diverges from the user's tokenizer, fall back
to ``transformers.AutoTokenizer`` and pull out *its* backend —
some models (MiniMax-M2.5) ship a ``tokenizer.json`` whose
pre_tokenizer disagrees with the AutoTokenizer-applied backend
mutations, so the bare load is incorrect for them. This is the
only branch that needs ``transformers``; we surface a clear
``[transformers]`` extra hint if it's not installed.

Most models resolve at or before path 3, so they never need the
``transformers`` fallback; path 3 itself costs one ``tokenizers`` load
per model name, which is then cached.
"""
# Cheap probe: does this tokenizer already provide offsets?
try:
tokenizer("a", add_special_tokens=False, return_offsets_mapping=True)
from tokenizers import Tokenizer as RustTokenizer

# Path 1: already a tokenizers.Tokenizer.
if isinstance(tokenizer, RustTokenizer):
return tokenizer
except (NotImplementedError, ValueError, TypeError):
pass

# Path 2: vanilla PreTrainedTokenizerFast exposes its underlying
# tokenizers.Tokenizer via ``backend_tokenizer``. Fastokens
# replaces that with a shim that is not a ``tokenizers.Tokenizer``,
# so the isinstance check below is False and we fall through.
backend = getattr(tokenizer, "backend_tokenizer", None)
if isinstance(backend, RustTokenizer):
return backend

name_or_path = getattr(tokenizer, "name_or_path", "")
if not name_or_path:
raise RuntimeError(
"Cannot construct an offset-aware tokenizer: the supplied "
"tokenizer has no ``name_or_path`` to fall back on. Pass a "
"tokenizer loaded via ``renderers.base.load_tokenizer``."
"tokenizer loaded via ``renderers.base.load_tokenizer`` or "
"a ``tokenizers.Tokenizer`` directly."
)

with _offset_tokenizers_lock:
cached = _offset_tokenizers.get(name_or_path)
if cached is not None:
return cached

kwargs: dict[str, Any] = {}
revision = TRUSTED_REVISIONS.get(name_or_path)

# Path 3: bare ``tokenizers.Tokenizer.from_pretrained`` — works
# for almost all supported models, no ``transformers`` needed.
if revision is not None:
kwargs = {"trust_remote_code": True, "revision": revision}
candidate = RustTokenizer.from_pretrained(name_or_path, revision=revision)
else:
kwargs = {"trust_remote_code": False}
# Explicitly vanilla — we want HF's Rust tokenizer with offset
# tracking, not the fastokens shim. ``load_tokenizer`` would
# patch fastokens in by default; routing through
# ``_load_tokenizer_via_auto`` keeps the fastokens patch out
# of this code path while still applying the config-build
# fallback (RoPE-validation failures on nested
# ``rope_parameters``, etc.).
offset_tok = _load_tokenizer_via_auto(name_or_path, **kwargs)
if not getattr(offset_tok, "is_fast", False):
candidate = RustTokenizer.from_pretrained(name_or_path)

# Verify equivalence with the user's tokenizer on a probe that
# spans known boundary cases.
try:
user_ids = list(
tokenizer.encode(_OFFSET_PROBE_TEXT, add_special_tokens=False)
)
candidate_ids = list(
candidate.encode(_OFFSET_PROBE_TEXT, add_special_tokens=False).ids
)
except Exception:
user_ids = None
candidate_ids = None

if user_ids is not None and user_ids == candidate_ids:
_offset_tokenizers[name_or_path] = candidate
return candidate

# Path 4: bare load diverges from the user's tokenizer.
# ``AutoTokenizer`` mutates the backend at construction (e.g.
# substituting a ByteLevel pre_tokenizer); replicate by routing
# through it and pulling out the now-correct backend. Requires
# the optional ``transformers`` extra.
try:
from transformers import AutoTokenizer
except ImportError as exc:
raise ImportError(
f"Loading an offset-aware tokenizer for {name_or_path!r} via "
f"the bare ``tokenizers`` library produced a token stream "
f"that doesn't match the user's tokenizer (this happens for "
f"models whose ``AutoTokenizer`` mutates the backend at "
f"load, e.g. MiniMax). Install the optional ``transformers`` "
f"extra to enable the AutoTokenizer fallback: "
f"``pip install renderers[transformers]``."
) from exc

kwargs: "dict[str, Any]" = (
{"trust_remote_code": True, "revision": revision}
if revision is not None
else {"trust_remote_code": False}
)
hf_tok = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
offset_tok = getattr(hf_tok, "backend_tokenizer", None)
if not isinstance(offset_tok, RustTokenizer):
raise RuntimeError(
f"Vanilla tokenizer for {name_or_path!r} is not a fast "
"tokenizer; offset_mapping is unavailable. Hand-coded "
"renderers require a fast tokenizer for body/scaffold "
"attribution."
f"AutoTokenizer.from_pretrained({name_or_path!r}) did not "
f"expose a ``tokenizers.Tokenizer`` backend; offset-aware "
f"encoding is unavailable for this model."
)
_offset_tokenizers[name_or_path] = offset_tok
return offset_tok
Expand All @@ -1715,23 +1789,19 @@ def attribute_text_segments(
(content, True)]`` for a user message. Concatenation is done before
encoding to preserve BPE merges across the wrap/body boundary; the
resulting tokens are then attributed back to their source segment
via the fast tokenizer's ``offset_mapping``.
via ``tokenizers.Encoding.offsets``.

A token is attributed to the segment containing its first source
character (``offset_mapping[k][0]``). Tokens whose first character
falls exactly on a segment boundary are attributed to the segment
that *starts* at that offset (the "later" segment). Zero-length
tokens (rare; usually pre-tokenizer artefacts) are attributed to
the most recently entered segment.

Requires a HuggingFace fast tokenizer with offset tracking. The
``fastokens`` patch ``load_tokenizer`` applies by default does
**not** track offsets — when that's the case we transparently load
a vanilla offset-capable tokenizer for the same model and cache it
(see :func:`_get_offset_tokenizer`). Hand-coded renderers are only
registered for model families that ship a fast tokenizer, so a
silent slow-tokenizer fallback isn't supported — BPE drift at the
wrap/body boundary would defeat the whole point.
character (``offsets[k][0]``). Tokens whose first character falls
exactly on a segment boundary go to the segment that *starts* at
that offset (the "later" segment). Zero-length tokens (rare;
pre-tokenizer artefacts) are attributed to the most recently
entered segment.

Uses ``tokenizers`` directly via :func:`_get_offset_tokenizer`;
``transformers`` is only needed by that function's ``AutoTokenizer``
fallback. Hand-coded renderers register only for model families
whose ``tokenizer.json`` ships a fast Rust tokenizer, so the offset
lookup is expected to succeed; otherwise
:func:`_get_offset_tokenizer` raises.

Empty input or empty joined text returns an empty list.
"""
Expand All @@ -1742,47 +1812,34 @@ def attribute_text_segments(
return []

offset_tokenizer = _get_offset_tokenizer(tokenizer)
encoding = offset_tokenizer(
full_text,
add_special_tokens=False,
return_offsets_mapping=True,
)
token_ids = list(encoding["input_ids"])
offsets = list(encoding["offset_mapping"])
encoding = offset_tokenizer.encode(full_text, add_special_tokens=False)
token_ids = list(encoding.ids)
offsets = list(encoding.offsets)

# Build segment char-span lookup. Track the half-open span
# [seg_start, seg_end) of each segment and its is_content bit.
spans: list[tuple[int, int, bool]] = []
spans: "list[tuple[int, int, bool]]" = []
pos = 0
for text, is_content in segments:
spans.append((pos, pos + len(text), is_content))
pos += len(text)
total_len = pos

out: list[tuple[int, bool]] = []
out: "list[tuple[int, bool]]" = []
last_is_content = spans[-1][2] if spans else False
for tok_id, (start, _end) in zip(token_ids, offsets):
if start >= total_len:
# Token's character offset is past every segment (shouldn't
# normally happen for add_special_tokens=False, but defensive
# against tokenizer-specific edge cases).
# Token's char offset is past every segment (shouldn't
# normally happen for add_special_tokens=False, but defensive).
out.append((tok_id, last_is_content))
continue
# Find the segment that contains `start`. Segments are
# contiguous and ordered, so a linear scan is fine — the inner
# loop runs at most len(segments) times per token and segments
# Find the segment that contains `start`. Linear scan — the segment count
# is typically 2-3 in practice.
is_content = last_is_content
for seg_start, seg_end, seg_is_content in spans:
if seg_start <= start < seg_end:
is_content = seg_is_content
break
else:
# start == total_len handled above; the remaining case is
# an empty segment in the middle. Empty segments emit no
# characters, so no token can land in them; fall through to
# the last non-empty segment's bit.
pass
out.append((tok_id, is_content))
return out

Expand Down
21 changes: 19 additions & 2 deletions renderers/deepseek_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
from transformers.tokenization_utils import PreTrainedTokenizer

from renderers.base import (
attribute_text_segments,
Message,
ParsedResponse,
RenderedTokens,
ToolSpec,
attribute_text_segments,
extract_message_tool_names,
reject_assistant_in_extension,
trim_to_turn_close,
Expand Down Expand Up @@ -148,8 +148,25 @@ def emit_text(
def emit_text_segments(
segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool
) -> None:
collapsed: list[tuple[str, bool]] = []
for text, label in segments:
if not text:
continue
if collapsed and collapsed[-1][1] == label:
collapsed[-1] = (collapsed[-1][0] + text, label)
else:
collapsed.append((text, label))
if not collapsed:
return
if len(collapsed) == 1:
# Homogeneous — single joined encode preserves all BPE merges.
text, label = collapsed[0]
emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label)
return
# Mixed labels remain — joined encode + offset attribution handles
# BPE merges across label-transition boundaries (e.g., ``.\n\n``).
for tok_id, is_content in attribute_text_segments(
self._tokenizer, segments
self._tokenizer, collapsed
):
tokens.append(tok_id)
indices.append(msg_idx)
Expand Down
Loading