diff --git a/pyproject.toml b/pyproject.toml index 478104f..064d34f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,11 @@ dependencies = [ "openai>=1.108.1", "tiktoken", "jinja2", + # HuggingFace's Rust BPE library. ``_get_offset_tokenizer`` uses + # ``tokenizers.Tokenizer.from_pretrained`` for offset-aware encoding + # (body/scaffold attribution on the render path) — keeps the heavy + # ``transformers`` framework off the offset path for most models. + "tokenizers>=0.20", "transformers>=4.50.0", # Used by GptOssRenderer to render and parse harmony tokens. Vendoring # OpenAI's reference implementation keeps us byte-identical with vLLM diff --git a/renderers/base.py b/renderers/base.py index 45768de..7928d93 100644 --- a/renderers/base.py +++ b/renderers/base.py @@ -1635,42 +1635,80 @@ def trim_to_turn_close( return previous_ids -# Per-model offset-aware tokenizer cache. ``attribute_text_segments`` -# uses the fast HuggingFace tokenizer's ``offset_mapping`` to attribute -# each token to its source text segment under one BPE pass. Fastokens -# (the Rust BPE we patch in by default for ~10x faster encode) does not -# track character offsets — the patched tokenizer's -# ``return_offsets_mapping=True`` raises ``NotImplementedError``. So we -# keep a parallel vanilla tokenizer per model purely for offset queries. -# Memory cost is one extra tokenizer per *unique* model name across all -# pools / renderers (the cache is process-global), independent of pool -# size. -_offset_tokenizers: dict[str, Any] = {} +# Per-model offset-aware ``tokenizers.Tokenizer`` cache. Renderers whose +# ``emit_text_segments`` block has mixed ``is_content`` labels (and +# minimax_m2's ``emit_token_overlap_body``) need character offsets to +# attribute each joined-encode token back to its source segment. We use +# the ``tokenizers`` package directly — its native ``Encoding.offsets`` +# is exactly what we want and ``transformers`` is not required. 
+# ``tokenizers.Tokenizer.from_pretrained`` just downloads +# ``tokenizer.json`` via ``huggingface_hub``; no model config build, +# no remote-code execution. Fastokens (the Rust BPE that +# ``load_tokenizer`` patches in by default) substitutes a non-offset +# backend into ``PreTrainedTokenizerFast``, so when we detect a +# fastokens-patched (or other non-offset) backend we fall back to +# loading a separate vanilla ``tokenizers.Tokenizer`` keyed by +# ``name_or_path``. Cache memory cost is one extra tokenizer per +# *unique* model name across all pools / renderers (process-global), +# independent of pool size. +_offset_tokenizers: "dict[str, Any]" = {} _offset_tokenizers_lock = threading.Lock() -def _get_offset_tokenizer(tokenizer): - """Return a tokenizer that supports ``return_offsets_mapping=True``. +_OFFSET_PROBE_TEXT = "Hello, world.\n\n# Test" +"""Probe string used to verify a loaded ``tokenizers.Tokenizer`` matches +the user's tokenizer. Spans the ``.\\n\\n`` boundary because some models +(MiniMax-M2.5's ``GPT2Tokenizer`` wrapper) ship a ``tokenizer.json`` +whose pre_tokenizer disagrees with what ``transformers.AutoTokenizer`` +applies at construction — that's where they diverge.""" + - If ``tokenizer`` itself supports offsets, returns it unchanged. - Otherwise loads a vanilla (non-fastokens) tokenizer from - ``tokenizer.name_or_path`` and caches it. Raises if the tokenizer - has no usable ``name_or_path`` — hand-coded renderers always pass - a tokenizer loaded via ``load_tokenizer`` which does set it. +def _get_offset_tokenizer(tokenizer): + """Return a ``tokenizers.Tokenizer`` for offset-aware encoding. + + Resolution order: + + 1. If ``tokenizer`` is already a ``tokenizers.Tokenizer``, return + it as-is (BYO offset-capable tokenizer — no extra load). + 2. If ``tokenizer.backend_tokenizer`` is a vanilla + ``tokenizers.Tokenizer`` (vanilla ``PreTrainedTokenizerFast``, + not fastokens-patched), use it directly — no extra load. + 3. 
Load via ``tokenizers.Tokenizer.from_pretrained(name_or_path)`` + and verify it encodes a probe string to the same ids as the + user's tokenizer. If they match, cache and use it. Pinned + ``TRUSTED_REVISIONS`` are honoured. + 4. If the bare load diverges from the user's tokenizer, fall back + to ``transformers.AutoTokenizer`` and pull out *its* backend — + some models (MiniMax-M2.5) ship a ``tokenizer.json`` whose + pre_tokenizer disagrees with the AutoTokenizer-applied backend + mutations, so the bare load is incorrect for them. This is the + only branch that needs ``transformers``; we surface a clear + ``[transformers]`` extra hint if it's not installed. + + Most models resolve at path 3 or earlier, so the ``transformers`` + fallback (path 4) is rarely needed. """ - # Cheap probe: does this tokenizer already provide offsets? - try: - tokenizer("a", add_special_tokens=False, return_offsets_mapping=True) + from tokenizers import Tokenizer as RustTokenizer + + # Path 1: already a tokenizers.Tokenizer. + if isinstance(tokenizer, RustTokenizer): return tokenizer - except (NotImplementedError, ValueError, TypeError): - pass + + # Path 2: vanilla PreTrainedTokenizerFast exposes its underlying + # tokenizers.Tokenizer via ``backend_tokenizer``. Fastokens + # replaces that with a shim whose isinstance check fails — caught + # here, falls through. + backend = getattr(tokenizer, "backend_tokenizer", None) + if isinstance(backend, RustTokenizer): + return backend name_or_path = getattr(tokenizer, "name_or_path", "") if not name_or_path: raise RuntimeError( "Cannot construct an offset-aware tokenizer: the supplied " "tokenizer has no ``name_or_path`` to fall back on. Pass a " - "tokenizer loaded via ``renderers.base.load_tokenizer``." + "tokenizer loaded via ``renderers.base.load_tokenizer`` or " + "a ``tokenizers.Tokenizer`` directly."
) with _offset_tokenizers_lock: @@ -1678,26 +1716,62 @@ def _get_offset_tokenizer(tokenizer): if cached is not None: return cached - kwargs: dict[str, Any] = {} revision = TRUSTED_REVISIONS.get(name_or_path) + + # Path 3: bare ``tokenizers.Tokenizer.from_pretrained`` — works + # for almost all supported models, no ``transformers`` needed. if revision is not None: - kwargs = {"trust_remote_code": True, "revision": revision} + candidate = RustTokenizer.from_pretrained(name_or_path, revision=revision) else: - kwargs = {"trust_remote_code": False} - # Explicitly vanilla — we want HF's Rust tokenizer with offset - # tracking, not the fastokens shim. ``load_tokenizer`` would - # patch fastokens in by default; routing through - # ``_load_tokenizer_via_auto`` keeps the fastokens patch out - # of this code path while still applying the config-build - # fallback (RoPE-validation failures on nested - # ``rope_parameters``, etc.). - offset_tok = _load_tokenizer_via_auto(name_or_path, **kwargs) - if not getattr(offset_tok, "is_fast", False): + candidate = RustTokenizer.from_pretrained(name_or_path) + + # Verify equivalence with the user's tokenizer on a probe that + # spans known boundary cases. + try: + user_ids = list( + tokenizer.encode(_OFFSET_PROBE_TEXT, add_special_tokens=False) + ) + candidate_ids = list( + candidate.encode(_OFFSET_PROBE_TEXT, add_special_tokens=False).ids + ) + except Exception: + user_ids = None + candidate_ids = None + + if user_ids is not None and user_ids == candidate_ids: + _offset_tokenizers[name_or_path] = candidate + return candidate + + # Path 4: bare load diverges from the user's tokenizer. + # ``AutoTokenizer`` mutates the backend at construction (e.g. + # substituting a ByteLevel pre_tokenizer); replicate by routing + # through it and pulling out the now-correct backend. Requires + # the optional ``transformers`` extra. 
+ try: + from transformers import AutoTokenizer + except ImportError as exc: + raise ImportError( + f"Loading an offset-aware tokenizer for {name_or_path!r} via " + f"the bare ``tokenizers`` library produced a token stream " + f"that doesn't match the user's tokenizer (this happens for " + f"models whose ``AutoTokenizer`` mutates the backend at " + f"load, e.g. MiniMax). Install the optional ``transformers`` " + f"extra to enable the AutoTokenizer fallback: " + f"``pip install renderers[transformers]``." + ) from exc + + kwargs: "dict[str, Any]" = ( + {"trust_remote_code": True, "revision": revision} + if revision is not None + else {"trust_remote_code": False} + ) + hf_tok = AutoTokenizer.from_pretrained(name_or_path, **kwargs) + offset_tok = getattr(hf_tok, "backend_tokenizer", None) + if not isinstance(offset_tok, RustTokenizer): raise RuntimeError( - f"Vanilla tokenizer for {name_or_path!r} is not a fast " - "tokenizer; offset_mapping is unavailable. Hand-coded " - "renderers require a fast tokenizer for body/scaffold " - "attribution." + f"AutoTokenizer.from_pretrained({name_or_path!r}) did not " + f"expose a ``tokenizers.Tokenizer`` backend; offset-aware " + f"encoding is unavailable for this model." ) _offset_tokenizers[name_or_path] = offset_tok return offset_tok @@ -1715,23 +1789,19 @@ def attribute_text_segments( (content, True)]`` for a user message. Concatenation is done before encoding to preserve BPE merges across the wrap/body boundary; the resulting tokens are then attributed back to their source segment - via the fast tokenizer's ``offset_mapping``. + via ``tokenizers.Encoding.offsets``. A token is attributed to the segment containing its first source - character (``offset_mapping[k][0]``). Tokens whose first character - falls exactly on a segment boundary are attributed to the segment - that *starts* at that offset (the "later" segment). 
Zero-length - tokens (rare; usually pre-tokenizer artefacts) are attributed to - the most recently entered segment. - - Requires a HuggingFace fast tokenizer with offset tracking. The - ``fastokens`` patch ``load_tokenizer`` applies by default does - **not** track offsets — when that's the case we transparently load - a vanilla offset-capable tokenizer for the same model and cache it - (see :func:`_get_offset_tokenizer`). Hand-coded renderers are only - registered for model families that ship a fast tokenizer, so a - silent slow-tokenizer fallback isn't supported — BPE drift at the - wrap/body boundary would defeat the whole point. + character (``offsets[k][0]``). Tokens whose first character falls + exactly on a segment boundary go to the segment that *starts* at + that offset (the "later" segment). Zero-length tokens (rare; + pre-tokenizer artefacts) are attributed to the most recently + entered segment. + + Uses ``tokenizers`` directly via :func:`_get_offset_tokenizer`; + ``transformers`` is only needed by that helper's last-resort + AutoTokenizer fallback. Hand-coded renderers register only for + families whose ``tokenizer.json`` ships a fast Rust tokenizer. Empty input or empty joined text returns an empty list. """ @@ -1742,47 +1812,34 @@ def attribute_text_segments( return [] offset_tokenizer = _get_offset_tokenizer(tokenizer) - encoding = offset_tokenizer( - full_text, - add_special_tokens=False, - return_offsets_mapping=True, - ) - token_ids = list(encoding["input_ids"]) - offsets = list(encoding["offset_mapping"]) + encoding = offset_tokenizer.encode(full_text, add_special_tokens=False) + token_ids = list(encoding.ids) + offsets = list(encoding.offsets) # Build segment char-span lookup. Track the half-open span # [seg_start, seg_end) of each segment and its is_content bit.
- spans: list[tuple[int, int, bool]] = [] + spans: "list[tuple[int, int, bool]]" = [] pos = 0 for text, is_content in segments: spans.append((pos, pos + len(text), is_content)) pos += len(text) total_len = pos - out: list[tuple[int, bool]] = [] + out: "list[tuple[int, bool]]" = [] last_is_content = spans[-1][2] if spans else False for tok_id, (start, _end) in zip(token_ids, offsets): if start >= total_len: - # Token's character offset is past every segment (shouldn't - # normally happen for add_special_tokens=False, but defensive - # against tokenizer-specific edge cases). + # Token's char offset is past every segment (shouldn't + # normally happen for add_special_tokens=False, but defensive). out.append((tok_id, last_is_content)) continue - # Find the segment that contains `start`. Segments are - # contiguous and ordered, so a linear scan is fine — the inner - # loop runs at most len(segments) times per token and segments + # Find the segment that contains `start`. Linear scan — segments # is typically 2-3 in practice. is_content = last_is_content for seg_start, seg_end, seg_is_content in spans: if seg_start <= start < seg_end: is_content = seg_is_content break - else: - # start == total_len handled above; the remaining case is - # an empty segment in the middle. Empty segments emit no - # characters, so no token can land in them; fall through to - # the last non-empty segment's bit. 
- pass out.append((tok_id, is_content)) return out diff --git a/renderers/deepseek_v3.py b/renderers/deepseek_v3.py index 7bec3de..77e5837 100644 --- a/renderers/deepseek_v3.py +++ b/renderers/deepseek_v3.py @@ -17,11 +17,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, trim_to_turn_close, @@ -148,8 +148,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). 
for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) diff --git a/renderers/glm45.py b/renderers/glm45.py index 7af9259..73ff601 100644 --- a/renderers/glm45.py +++ b/renderers/glm45.py @@ -16,11 +16,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, should_preserve_past_thinking, @@ -146,15 +146,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: - """Tokenize concatenated segments as one BPE pass; per-token - ``is_content`` follows each token's source segment. - - Lets call sites express "this wrap + this body, joined the - same way as the chat template, but attributed separately" - without splitting the encode call (which could shift BPE - merges at the boundary).""" + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). 
for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -377,8 +387,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries. for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): ext.append(tok_id) ext_indices.append(msg_idx) diff --git a/renderers/glm5.py b/renderers/glm5.py index 924d754..bd344e7 100644 --- a/renderers/glm5.py +++ b/renderers/glm5.py @@ -17,11 +17,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, should_preserve_past_thinking, @@ -166,15 +166,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: - """Tokenize concatenated segments as one BPE pass; per-token - ``is_content`` follows each token's source segment. 
- - Lets call sites express "this wrap + this body, joined the - same way as the chat template, but attributed separately" - without splitting the encode call (which could shift BPE - merges at the boundary).""" + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -397,8 +407,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries. 
for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): ext.append(tok_id) ext_indices.append(msg_idx) diff --git a/renderers/laguna_xs2.py b/renderers/laguna_xs2.py index bd6b64f..583b7aa 100644 --- a/renderers/laguna_xs2.py +++ b/renderers/laguna_xs2.py @@ -30,12 +30,12 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Content, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, ) @@ -169,8 +169,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -382,8 +399,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. 
+ text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries. for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): ext.append(tok_id) ext_indices.append(msg_idx) diff --git a/renderers/minimax_m2.py b/renderers/minimax_m2.py index f990274..d690c70 100644 --- a/renderers/minimax_m2.py +++ b/renderers/minimax_m2.py @@ -17,11 +17,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, should_preserve_past_thinking, @@ -133,8 +133,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -152,23 +169,22 @@ def emit_token_overlap_body( """Tokenize ``full_text`` and mark tokens that overlap the body char span as ``is_content=True``. 
- Differs from :func:`attribute_text_segments` only in the - boundary-token rule: a token straddling scaffold→body gets - ``True`` if any of its bytes are body bytes (overlap rule), - rather than being attributed to whichever segment its first - char belongs to. The body's first byte is preserved even when - BPE merges it with the wrap's trailing byte (``>The`` → - single token). + Uses an "intersects body span" rule: a token straddling + scaffold→body gets ``True`` if any of its bytes are body + bytes, rather than being attributed to whichever segment its + first char belongs to. The body's first byte is preserved + even when BPE merges it with the wrap's trailing byte + (``>The`` → single token). The other renderers don't need + this because their scaffolds break at characters BPE + doesn't merge across (``\\n``, special tokens); the + ``...`` template here glues scaffold and body + with no separator. """ from renderers.base import _get_offset_tokenizer offset_tok = _get_offset_tokenizer(self._tokenizer) - encoding = offset_tok( - full_text, add_special_tokens=False, return_offsets_mapping=True - ) - for tok_id, (start, end) in zip( - encoding["input_ids"], encoding["offset_mapping"] - ): + encoding = offset_tok.encode(full_text, add_special_tokens=False) + for tok_id, (start, end) in zip(encoding.ids, encoding.offsets): overlaps = start < body_end and end > body_start tokens.append(tok_id) indices.append(msg_idx) @@ -381,8 +397,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. 
+ text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries. for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): ext.append(tok_id) ext_indices.append(msg_idx) @@ -627,15 +660,13 @@ def _render_tool( # ```` is plain text with no separator between the # closing ``>`` and ``content``'s first byte, so BPE can merge - # them into a single token (e.g., ``>The``). The shared - # ``attribute_text_segments`` helper picks the segment of a - # boundary-spanning token by its *first* char (here scaffold), - # which would drop the body's leading letter out of the body - # run. We instead use an "intersects body" rule: any token whose - # ``[start, end)`` char range overlaps the body span gets + # them into a single token (e.g., ``>The``). A "first char + # wins" rule would drop the body's leading letter out of the + # body run. We instead use an "intersects body" rule: any token + # whose ``[start, end)`` char range overlaps the body span gets # ``is_content=True``. A few scaffold bytes (the leading ``>`` - # or trailing ``<``) bleed into the body run, but body bytes are - # recoverable as a substring of the decoded body span. + # or trailing ``<``) bleed into the body run, but body bytes + # are recoverable as a substring of the decoded body span. 
body_text = prefix + "" + content + "" + suffix body_start = len(prefix) + len("") body_end = body_start + len(content) diff --git a/renderers/nemotron3.py b/renderers/nemotron3.py index e6398b5..6664a42 100644 --- a/renderers/nemotron3.py +++ b/renderers/nemotron3.py @@ -20,11 +20,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, should_preserve_past_thinking, @@ -268,8 +268,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -523,8 +540,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. 
+ text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries. for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): ext.append(tok_id) ext_indices.append(msg_idx) diff --git a/renderers/qwen3.py b/renderers/qwen3.py index f744b8c..358765a 100644 --- a/renderers/qwen3.py +++ b/renderers/qwen3.py @@ -14,11 +14,11 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, ParsedResponse, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, should_preserve_past_thinking, @@ -126,15 +126,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: - """Tokenize concatenated segments as one BPE pass; per-token - ``is_content`` follows each token's source segment. - - Lets call sites express "this wrap + this body, joined the - same way as the chat template, but attributed separately" - without splitting the encode call (which could shift BPE - merges at the boundary).""" + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). 
for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -349,8 +359,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries. for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): ext.append(tok_id) ext_indices.append(msg_idx) diff --git a/renderers/qwen35.py b/renderers/qwen35.py index cdb8ee1..5d95478 100644 --- a/renderers/qwen35.py +++ b/renderers/qwen35.py @@ -20,13 +20,13 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, MultiModalData, ParsedResponse, PlaceholderRange, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, should_preserve_past_thinking, @@ -341,15 +341,25 @@ def emit_text( def emit_text_segments( segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool ) -> None: - """Tokenize concatenated segments as one BPE pass; per-token - ``is_content`` follows each token's source segment. 
- - Lets call sites express "this wrap + this body, joined the - same way as the chat template, but attributed separately" - without splitting the encode call (which could shift BPE - merges at the boundary).""" + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) @@ -706,8 +716,25 @@ def emit_text_segments( *, is_sampled: bool = False, ) -> None: + collapsed: list[tuple[str, bool]] = [] + for text, label in segments: + if not text: + continue + if collapsed and collapsed[-1][1] == label: + collapsed[-1] = (collapsed[-1][0] + text, label) + else: + collapsed.append((text, label)) + if not collapsed: + return + if len(collapsed) == 1: + # Homogeneous — single joined encode preserves all BPE merges. + text, label = collapsed[0] + emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label) + return + # Mixed labels remain — joined encode + offset attribution handles + # BPE merges across label-transition boundaries (e.g., ``.\n\n``). 
for tok_id, is_content in attribute_text_segments( - self._tokenizer, segments + self._tokenizer, collapsed ): tokens.append(tok_id) indices.append(msg_idx) diff --git a/renderers/qwen3_vl.py b/renderers/qwen3_vl.py index 9a4ffde..184d5fe 100644 --- a/renderers/qwen3_vl.py +++ b/renderers/qwen3_vl.py @@ -36,13 +36,13 @@ from transformers.tokenization_utils import PreTrainedTokenizer from renderers.base import ( + attribute_text_segments, Message, MultiModalData, ParsedResponse, PlaceholderRange, RenderedTokens, ToolSpec, - attribute_text_segments, extract_message_tool_names, reject_assistant_in_extension, trim_to_turn_close, @@ -223,11 +223,10 @@ def text(self, text: str, *, is_sampled: bool, is_content: bool) -> None: if not text: return # Adjacent text under different msg_idx or is_sampled is rare in - # this template — but flush at those boundaries so attribution - # and the sampled signal stay accurate. is_content boundaries do - # NOT force a flush: they're carried through the joined BPE pass - # via :func:`attribute_text_segments`, preserving merges across - # the wrap/body boundary. + # this template — but flush at those boundaries so the sampled + # signal stays accurate. is_content boundaries do NOT force a + # flush: mixed-is_content flushes are encoded jointly and + # attributed per segment (see ``_flush``). if self._segments and ( self._buf_idx != self.msg_idx or self._buf_sampled != is_sampled ): @@ -274,13 +273,11 @@ def _flush(self) -> None: self.sampled.extend([self._buf_sampled] * len(ids)) self.is_content.extend([first_ic] * len(ids)) return - # Mixed body/scaffold flush — encode once and attribute back to - # each segment via the fast tokenizer's offset_mapping. Requires - # a tokenizer (not just the encode fn) to look up offsets. - assert self._tokenizer is not None, ( - "_Emitter mixed-is_content flush requires a tokenizer; " - "pass one to the constructor."
- ) + # Mixed body/scaffold flush — joined encode + offset attribution + # preserves BPE merges across the label-transition boundary + # (e.g., ``"user\n"`` scaffold ↔ caller body, where a trailing + # char of the body could merge with the leading scaffold byte + # of the next segment). for tok_id, is_content in attribute_text_segments(self._tokenizer, segments): self.token_ids.append(tok_id) self.message_indices.append(self._buf_idx)