PrimeIntellect-ai · hallerite · Jun 17, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,6 +19,11 @@ dependencies = [
     "openai>=1.108.1",
     "tiktoken",
     "jinja2",
+    # HuggingFace's Rust BPE library. ``_get_offset_tokenizer`` uses
+    # ``tokenizers.Tokenizer.from_pretrained`` for offset-aware encoding
+    # (body/scaffold attribution on the render path) — keeps the heavy
+    # ``transformers`` framework off the offset path for most models.
+    "tokenizers>=0.20",
     "transformers>=4.50.0",
     # Used by GptOssRenderer to render and parse harmony tokens. Vendoring
     # OpenAI's reference implementation keeps us byte-identical with vLLM

diff --git a/renderers/base.py b/renderers/base.py
@@ -1635,69 +1635,143 @@ def trim_to_turn_close(
     return previous_ids
 
 
-# Per-model offset-aware tokenizer cache. ``attribute_text_segments``
-# uses the fast HuggingFace tokenizer's ``offset_mapping`` to attribute
-# each token to its source text segment under one BPE pass. Fastokens
-# (the Rust BPE we patch in by default for ~10x faster encode) does not
-# track character offsets — the patched tokenizer's
-# ``return_offsets_mapping=True`` raises ``NotImplementedError``. So we
-# keep a parallel vanilla tokenizer per model purely for offset queries.
-# Memory cost is one extra tokenizer per *unique* model name across all
-# pools / renderers (the cache is process-global), independent of pool
-# size.
-_offset_tokenizers: dict[str, Any] = {}
+# Per-model offset-aware ``tokenizers.Tokenizer`` cache. Renderers whose
+# ``emit_text_segments`` block has mixed ``is_content`` labels (and
+# minimax_m2's ``emit_token_overlap_body``) need character offsets to
+# attribute each joined-encode token back to its source segment. We use
+# the ``tokenizers`` package directly — its native ``Encoding.offsets``
+# is exactly what we want and ``transformers`` is not required.
+# ``tokenizers.Tokenizer.from_pretrained`` just downloads
+# ``tokenizer.json`` via ``huggingface_hub``; no model config build,
+# no remote-code execution. Fastokens (the Rust BPE that
+# ``load_tokenizer`` patches in by default) substitutes a non-offset
+# backend into ``PreTrainedTokenizerFast``, so when we detect a
+# fastokens-patched (or other non-offset) backend we fall back to
+# loading a separate vanilla ``tokenizers.Tokenizer`` keyed by
+# ``name_or_path``. Cache memory cost is one extra tokenizer per
+# *unique* model name across all pools / renderers (process-global),
+# independent of pool size.
+_offset_tokenizers: "dict[str, Any]" = {}
 _offset_tokenizers_lock = threading.Lock()
 
 
-def _get_offset_tokenizer(tokenizer):
-    """Return a tokenizer that supports ``return_offsets_mapping=True``.
+_OFFSET_PROBE_TEXT = "Hello, world.\n\n# Test"
+"""Probe string used to verify a loaded ``tokenizers.Tokenizer`` matches
+the user's tokenizer. Spans the ``.\\n\\n`` boundary because some models
+(MiniMax-M2.5's ``GPT2Tokenizer`` wrapper) ship a ``tokenizer.json``
+whose pre_tokenizer disagrees with what ``transformers.AutoTokenizer``
+applies at construction — that's where they diverge."""
+
 
-    If ``tokenizer`` itself supports offsets, returns it unchanged.
-    Otherwise loads a vanilla (non-fastokens) tokenizer from
-    ``tokenizer.name_or_path`` and caches it. Raises if the tokenizer
-    has no usable ``name_or_path`` — hand-coded renderers always pass
-    a tokenizer loaded via ``load_tokenizer`` which does set it.
+def _get_offset_tokenizer(tokenizer):
+    """Return a ``tokenizers.Tokenizer`` for offset-aware encoding.
+
+    Resolution order:
+
+    1. If ``tokenizer`` is already a ``tokenizers.Tokenizer``, return
+       it as-is (BYO offset-capable tokenizer — no extra load).
+    2. If ``tokenizer.backend_tokenizer`` is a vanilla
+       ``tokenizers.Tokenizer`` (vanilla ``PreTrainedTokenizerFast``,
+       not fastokens-patched), use it directly — no extra load.
+    3. Load via ``tokenizers.Tokenizer.from_pretrained(name_or_path)``
+       and verify it encodes a probe string to the same ids as the
+       user's tokenizer. If they match, cache and use it. Pinned
+       ``TRUSTED_REVISIONS`` are honoured.
+    4. If the bare load diverges from the user's tokenizer, fall back
+       to ``transformers.AutoTokenizer`` and pull out *its* backend —
+       some models (MiniMax-M2.5) ship a ``tokenizer.json`` whose
+       pre_tokenizer disagrees with the AutoTokenizer-applied backend
+       mutations, so the bare load is incorrect for them. This is the
+       only branch that needs ``transformers``; we surface a clear
+       ``[transformers]`` extra hint if it's not installed.
+
+    Most models resolve at path 3 or earlier, so this function imports
+    ``transformers`` only when the path-4 fallback is actually needed.
     """
-    # Cheap probe: does this tokenizer already provide offsets?
-    try:
-        tokenizer("a", add_special_tokens=False, return_offsets_mapping=True)
+    from tokenizers import Tokenizer as RustTokenizer
+
+    # Path 1: already a tokenizers.Tokenizer.
+    if isinstance(tokenizer, RustTokenizer):
         return tokenizer
-    except (NotImplementedError, ValueError, TypeError):
-        pass
+
+    # Path 2: vanilla PreTrainedTokenizerFast exposes its underlying
+    # tokenizers.Tokenizer via ``backend_tokenizer``. Fastokens
+    # replaces that with a shim that is not a ``tokenizers.Tokenizer``
+    # instance, so the isinstance check below fails and we fall through.
+    backend = getattr(tokenizer, "backend_tokenizer", None)
+    if isinstance(backend, RustTokenizer):
+        return backend
 
     name_or_path = getattr(tokenizer, "name_or_path", "")
     if not name_or_path:
         raise RuntimeError(
             "Cannot construct an offset-aware tokenizer: the supplied "
             "tokenizer has no ``name_or_path`` to fall back on. Pass a "
-            "tokenizer loaded via ``renderers.base.load_tokenizer``."
+            "tokenizer loaded via ``renderers.base.load_tokenizer`` or "
+            "a ``tokenizers.Tokenizer`` directly."
         )
 
     with _offset_tokenizers_lock:
         cached = _offset_tokenizers.get(name_or_path)
         if cached is not None:
             return cached
 
-        kwargs: dict[str, Any] = {}
         revision = TRUSTED_REVISIONS.get(name_or_path)
+
+        # Path 3: bare ``tokenizers.Tokenizer.from_pretrained`` — works
+        # for almost all supported models, no ``transformers`` needed.
         if revision is not None:
-            kwargs = {"trust_remote_code": True, "revision": revision}
+            candidate = RustTokenizer.from_pretrained(name_or_path, revision=revision)
         else:
-            kwargs = {"trust_remote_code": False}
-        # Explicitly vanilla — we want HF's Rust tokenizer with offset
-        # tracking, not the fastokens shim. ``load_tokenizer`` would
-        # patch fastokens in by default; routing through
-        # ``_load_tokenizer_via_auto`` keeps the fastokens patch out
-        # of this code path while still applying the config-build
-        # fallback (RoPE-validation failures on nested
-        # ``rope_parameters``, etc.).
-        offset_tok = _load_tokenizer_via_auto(name_or_path, **kwargs)
-        if not getattr(offset_tok, "is_fast", False):
+            candidate = RustTokenizer.from_pretrained(name_or_path)
+
+        # Verify equivalence with the user's tokenizer on a probe that
+        # spans known boundary cases.
+        try:
+            user_ids = list(
+                tokenizer.encode(_OFFSET_PROBE_TEXT, add_special_tokens=False)
+            )
+            candidate_ids = list(
+                candidate.encode(_OFFSET_PROBE_TEXT, add_special_tokens=False).ids
+            )
+        except Exception:
+            user_ids = None
+            candidate_ids = None
+
+        if user_ids is not None and user_ids == candidate_ids:
+            _offset_tokenizers[name_or_path] = candidate
+            return candidate
+
+        # Path 4: bare load diverges from the user's tokenizer.
+        # ``AutoTokenizer`` mutates the backend at construction (e.g.
+        # substituting a ByteLevel pre_tokenizer); replicate by routing
+        # through it and pulling out the now-correct backend. Requires
+        # the optional ``transformers`` extra.
+        try:
+            from transformers import AutoTokenizer
+        except ImportError as exc:
+            raise ImportError(
+                f"Loading an offset-aware tokenizer for {name_or_path!r} via "
+                f"the bare ``tokenizers`` library produced a token stream "
+                f"that doesn't match the user's tokenizer (this happens for "
+                f"models whose ``AutoTokenizer`` mutates the backend at "
+                f"load, e.g. MiniMax). Install the optional ``transformers`` "
+                f"extra to enable the AutoTokenizer fallback: "
+                f"``pip install renderers[transformers]``."
+            ) from exc
+
+        kwargs: "dict[str, Any]" = (
+            {"trust_remote_code": True, "revision": revision}
+            if revision is not None
+            else {"trust_remote_code": False}
+        )
+        hf_tok = AutoTokenizer.from_pretrained(name_or_path, **kwargs)
+        offset_tok = getattr(hf_tok, "backend_tokenizer", None)
+        if not isinstance(offset_tok, RustTokenizer):
             raise RuntimeError(
-                f"Vanilla tokenizer for {name_or_path!r} is not a fast "
-                "tokenizer; offset_mapping is unavailable. Hand-coded "
-                "renderers require a fast tokenizer for body/scaffold "
-                "attribution."
+                f"AutoTokenizer.from_pretrained({name_or_path!r}) did not "
+                f"expose a ``tokenizers.Tokenizer`` backend; offset-aware "
+                f"encoding is unavailable for this model."
             )
         _offset_tokenizers[name_or_path] = offset_tok
         return offset_tok
@@ -1715,23 +1789,19 @@ def attribute_text_segments(
     (content, True)]`` for a user message. Concatenation is done before
     encoding to preserve BPE merges across the wrap/body boundary; the
     resulting tokens are then attributed back to their source segment
-    via the fast tokenizer's ``offset_mapping``.
+    via ``tokenizers.Encoding.offsets``.
 
     A token is attributed to the segment containing its first source
-    character (``offset_mapping[k][0]``). Tokens whose first character
-    falls exactly on a segment boundary are attributed to the segment
-    that *starts* at that offset (the "later" segment). Zero-length
-    tokens (rare; usually pre-tokenizer artefacts) are attributed to
-    the most recently entered segment.
-
-    Requires a HuggingFace fast tokenizer with offset tracking. The
-    ``fastokens`` patch ``load_tokenizer`` applies by default does
-    **not** track offsets — when that's the case we transparently load
-    a vanilla offset-capable tokenizer for the same model and cache it
-    (see :func:`_get_offset_tokenizer`). Hand-coded renderers are only
-    registered for model families that ship a fast tokenizer, so a
-    silent slow-tokenizer fallback isn't supported — BPE drift at the
-    wrap/body boundary would defeat the whole point.
+    character (``offsets[k][0]``). Tokens whose first character falls
+    exactly on a segment boundary go to the segment that *starts* at
+    that offset (the "later" segment). Zero-length tokens (rare;
+    pre-tokenizer artefacts) are attributed to the most recently
+    entered segment.
+
+    Uses ``tokenizers`` via :func:`_get_offset_tokenizer`; ``transformers``
+    is needed only for that helper's AutoTokenizer fallback. Hand-coded
+    renderers register only for model families whose ``tokenizer.json``
+    ships a fast Rust tokenizer, so the offset lookup is expected to succeed.
 
     Empty input or empty joined text returns an empty list.
     """
@@ -1742,47 +1812,34 @@ def attribute_text_segments(
         return []
 
     offset_tokenizer = _get_offset_tokenizer(tokenizer)
-    encoding = offset_tokenizer(
-        full_text,
-        add_special_tokens=False,
-        return_offsets_mapping=True,
-    )
-    token_ids = list(encoding["input_ids"])
-    offsets = list(encoding["offset_mapping"])
+    encoding = offset_tokenizer.encode(full_text, add_special_tokens=False)
+    token_ids = list(encoding.ids)
+    offsets = list(encoding.offsets)
 
     # Build segment char-span lookup. Track the half-open span
     # [seg_start, seg_end) of each segment and its is_content bit.
-    spans: list[tuple[int, int, bool]] = []
+    spans: "list[tuple[int, int, bool]]" = []
     pos = 0
     for text, is_content in segments:
         spans.append((pos, pos + len(text), is_content))
         pos += len(text)
     total_len = pos
 
-    out: list[tuple[int, bool]] = []
+    out: "list[tuple[int, bool]]" = []
     last_is_content = spans[-1][2] if spans else False
     for tok_id, (start, _end) in zip(token_ids, offsets):
         if start >= total_len:
-            # Token's character offset is past every segment (shouldn't
-            # normally happen for add_special_tokens=False, but defensive
-            # against tokenizer-specific edge cases).
+            # Token's char offset is past every segment (shouldn't
+            # normally happen for add_special_tokens=False, but defensive).
             out.append((tok_id, last_is_content))
             continue
-        # Find the segment that contains `start`. Segments are
-        # contiguous and ordered, so a linear scan is fine — the inner
-        # loop runs at most len(segments) times per token and segments
+        # Find the segment that contains `start`. Linear scan — len(segments)
         # is typically 2-3 in practice.
         is_content = last_is_content
         for seg_start, seg_end, seg_is_content in spans:
             if seg_start <= start < seg_end:
                 is_content = seg_is_content
                 break
-        else:
-            # start == total_len handled above; the remaining case is
-            # an empty segment in the middle. Empty segments emit no
-            # characters, so no token can land in them; fall through to
-            # the last non-empty segment's bit.
-            pass
         out.append((tok_id, is_content))
     return out
 

diff --git a/renderers/deepseek_v3.py b/renderers/deepseek_v3.py
@@ -17,11 +17,11 @@
 from transformers.tokenization_utils import PreTrainedTokenizer
 
 from renderers.base import (
+    attribute_text_segments,
     Message,
     ParsedResponse,
     RenderedTokens,
     ToolSpec,
-    attribute_text_segments,
     extract_message_tool_names,
     reject_assistant_in_extension,
     trim_to_turn_close,
@@ -148,8 +148,25 @@ def emit_text(
         def emit_text_segments(
             segments: list[tuple[str, bool]], msg_idx: int, *, is_sampled: bool
         ) -> None:
+            collapsed: list[tuple[str, bool]] = []
+            for text, label in segments:
+                if not text:
+                    continue
+                if collapsed and collapsed[-1][1] == label:
+                    collapsed[-1] = (collapsed[-1][0] + text, label)
+                else:
+                    collapsed.append((text, label))
+            if not collapsed:
+                return
+            if len(collapsed) == 1:
+                # Homogeneous — single joined encode preserves all BPE merges.
+                text, label = collapsed[0]
+                emit_text(text, msg_idx, is_sampled=is_sampled, is_content=label)
+                return
+            # Mixed labels remain — joined encode + offset attribution handles
+            # BPE merges across label-transition boundaries (e.g., ``.\n\n``).
             for tok_id, is_content in attribute_text_segments(
-                self._tokenizer, segments
+                self._tokenizer, collapsed
             ):
                 tokens.append(tok_id)
                 indices.append(msg_idx)