# Patch summary. Text ahead of the first ``diff --git`` header is skipped by
# ``git apply`` / ``patch``, so these notes do not affect the patch itself.
#
# * renderers/base.py: adds TOKENIZER_SOURCE_OVERRIDES, which maps the two
#   meta-llama/Llama-3.2-{1B,3B}-Instruct IDs to their unsloth mirrors, plus
#   the helpers _tokenizer_source_for, _tokenizer_load_kwargs and
#   _preserve_requested_tokenizer_name. load_tokenizer and
#   _get_offset_tokenizer now load from the override source and then set
#   tokenizer.name_or_path back to the ID the caller asked for; a
#   RuntimeError is raised when the name cannot be restored.
# * tests/conftest.py, tests/test_llama_3.py: the Llama cases now pass the
#   canonical meta-llama ID (conftest switches that entry to "auto").
# * tests/test_load_tokenizer.py: new tests for the mirror load, the exact
#   contents of the override table, and the offset-tokenizer reload path.
# * tests/test_preserve_thinking.py: adds the two meta-llama IDs to two
#   existing collections of model names.
#
# NOTE(review): load_tokenizer and _get_offset_tokenizer now look up the
# *mirror* ID in TRUSTED_REVISIONS, and load_tokenizer does the same for
# FASTOKENS_INCOMPATIBLE. Entries keyed by the canonical meta-llama ID would
# presumably no longer match for overridden models -- confirm this is
# intended.
# NOTE(review): line breaks and indentation below were restored from the
# +/- markers and the hunk line counts; verify context lines against the
# target files before applying.
diff --git a/renderers/base.py b/renderers/base.py
index 8f722d7..ed5bc7e 100644
--- a/renderers/base.py
+++ b/renderers/base.py
@@ -1152,6 +1152,17 @@ def _model_has_vision_config(model_name: str) -> bool:
 }
 
 
+# Tokenizer repos to use when a canonical model repo is gated but an
+# audited unrestricted mirror ships byte-identical tokenizer files and
+# chat_template. The returned tokenizer keeps the caller's original
+# ``name_or_path`` so exact-match renderer resolution still uses
+# ``MODEL_RENDERER_MAP``.
+TOKENIZER_SOURCE_OVERRIDES: dict[str, str] = {
+    "meta-llama/Llama-3.2-1B-Instruct": "unsloth/Llama-3.2-1B-Instruct",
+    "meta-llama/Llama-3.2-3B-Instruct": "unsloth/Llama-3.2-3B-Instruct",
+}
+
+
 # Models for which ``fastokens`` is known to diverge from vanilla
 # ``transformers.AutoTokenizer`` and therefore must NOT be patched.
 # Empirical audit ran each entry of ``MODEL_RENDERER_MAP`` through both
@@ -1175,6 +1186,42 @@ def _model_has_vision_config(model_name: str) -> bool:
 _FASTOKENS_ANNOUNCED = False
 
 
+def _tokenizer_source_for(model_name_or_path: str) -> str:
+    return TOKENIZER_SOURCE_OVERRIDES.get(model_name_or_path, model_name_or_path)
+
+
+def _tokenizer_load_kwargs(model_name_or_path: str) -> dict[str, Any]:
+    revision = TRUSTED_REVISIONS.get(model_name_or_path)
+    if revision is not None:
+        return {"trust_remote_code": True, "revision": revision}
+    return {"trust_remote_code": False}
+
+
+def _preserve_requested_tokenizer_name(
+    tokenizer,
+    *,
+    requested_name_or_path: str,
+    loaded_name_or_path: str,
+):
+    if requested_name_or_path == loaded_name_or_path:
+        return tokenizer
+
+    try:
+        tokenizer.name_or_path = requested_name_or_path
+    except Exception:
+        init_kwargs = getattr(tokenizer, "init_kwargs", None)
+        if isinstance(init_kwargs, dict):
+            init_kwargs["name_or_path"] = requested_name_or_path
+
+    if getattr(tokenizer, "name_or_path", "") != requested_name_or_path:
+        raise RuntimeError(
+            f"Loaded tokenizer for {requested_name_or_path!r} from "
+            f"{loaded_name_or_path!r}, but could not preserve the requested "
+            "name_or_path for renderer auto-resolution."
+        )
+    return tokenizer
+
+
 def _patched_load(model_name_or_path: str, **kwargs):
     """Run ``AutoTokenizer.from_pretrained`` with fastokens patched in
     process-locally — patch around the load, unpatch right after.
@@ -1312,29 +1359,41 @@ def load_tokenizer(
     validation for configs with nested ``rope_parameters``), we fall back
     to loading the repo's self-contained ``tokenizer.json`` directly — see
     ``_load_tokenizer_via_auto``.
-    """
-    kwargs: dict[str, Any] = {}
-    revision = TRUSTED_REVISIONS.get(model_name_or_path)
-    if revision is not None:
-        kwargs = {"trust_remote_code": True, "revision": revision}
-    else:
-        kwargs = {"trust_remote_code": False}
 
-    if not use_fastokens or model_name_or_path in FASTOKENS_INCOMPATIBLE:
-        return _load_tokenizer_via_auto(model_name_or_path, **kwargs)
+    Canonical Meta Llama-3.2 Instruct repos are gated on HuggingFace. For
+    those exact IDs we load tokenizer files from the audited unrestricted
+    ``unsloth`` mirrors instead, then restore ``tokenizer.name_or_path`` to
+    the requested Meta ID so auto-resolution still selects ``Llama3Renderer``.
+    """
+    load_name_or_path = _tokenizer_source_for(model_name_or_path)
+    kwargs = _tokenizer_load_kwargs(load_name_or_path)
+
+    if not use_fastokens or load_name_or_path in FASTOKENS_INCOMPATIBLE:
+        tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs)
+        return _preserve_requested_tokenizer_name(
+            tok,
+            requested_name_or_path=model_name_or_path,
+            loaded_name_or_path=load_name_or_path,
+        )
 
     try:
-        return _patched_load(model_name_or_path, **kwargs)
+        tok = _patched_load(load_name_or_path, **kwargs)
     except Exception as exc:
         logger.info(
             "fastokens could not load %r (%s: %s); falling back to vanilla "
             "AutoTokenizer. Add this model to FASTOKENS_INCOMPATIBLE in "
             "renderers.base to suppress the retry.",
-            model_name_or_path,
+            load_name_or_path,
             type(exc).__name__,
             str(exc)[:160],
         )
-        return _load_tokenizer_via_auto(model_name_or_path, **kwargs)
+        tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs)
+
+    return _preserve_requested_tokenizer_name(
+        tok,
+        requested_name_or_path=model_name_or_path,
+        loaded_name_or_path=load_name_or_path,
+    )
 
 
 def _populate_registry():
@@ -1702,12 +1761,8 @@ def _get_offset_tokenizer(tokenizer):
     if cached is not None:
         return cached
 
-    kwargs: dict[str, Any] = {}
-    revision = TRUSTED_REVISIONS.get(name_or_path)
-    if revision is not None:
-        kwargs = {"trust_remote_code": True, "revision": revision}
-    else:
-        kwargs = {"trust_remote_code": False}
+    load_name_or_path = _tokenizer_source_for(name_or_path)
+    kwargs = _tokenizer_load_kwargs(load_name_or_path)
 
     def _has_offsets(tok) -> bool:
         if not getattr(tok, "is_fast", False):
@@ -1727,7 +1782,12 @@ def _has_offsets(tok) -> bool:
     # off — serialized against pool patch/unpatch via ``_FASTOKENS_PATCH_LOCK``
     # so no concurrent window can swap the shim back in mid-load — then
     # restore the prior patch state. Never cache a non-offset tokenizer.
-    offset_tok = _load_tokenizer_via_auto(name_or_path, **kwargs)
+    offset_tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs)
+    offset_tok = _preserve_requested_tokenizer_name(
+        offset_tok,
+        requested_name_or_path=name_or_path,
+        loaded_name_or_path=load_name_or_path,
+    )
     if not _has_offsets(offset_tok):
         import fastokens
 
@@ -1737,7 +1797,12 @@ def _has_offsets(tok) -> bool:
                 with contextlib.redirect_stdout(io.StringIO()):
                     fastokens.unpatch_transformers()
             try:
-                offset_tok = _load_tokenizer_via_auto(name_or_path, **kwargs)
+                offset_tok = _load_tokenizer_via_auto(load_name_or_path, **kwargs)
+                offset_tok = _preserve_requested_tokenizer_name(
+                    offset_tok,
+                    requested_name_or_path=name_or_path,
+                    loaded_name_or_path=load_name_or_path,
+                )
             finally:
                 if was_patched:
                     with contextlib.redirect_stdout(io.StringIO()):
diff --git a/tests/conftest.py b/tests/conftest.py
index c3bfeed..aef435b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -47,11 +47,10 @@
     # there's just no byte-output to parity-check against. Split-specific
     # parity (V3 bare prompt vs R1 +history-strip) is covered in
     # tests/test_deepseek_r1.py.
-    # Llama-3 loads via the unrestricted unsloth mirror (byte-identical
-    # chat template) so CI needs no Meta-gated HF token. Pinned to the
-    # explicit "llama-3" config because the mirror name isn't in
-    # MODEL_RENDERER_MAP (so "auto" would fall back to DefaultRenderer).
-    ("unsloth/Llama-3.2-1B-Instruct", "llama-3"),
+    # Llama-3 uses the canonical Meta ID for renderer auto-resolution, while
+    # load_tokenizer fetches the tokenizer/chat_template from the unrestricted
+    # unsloth mirror so CI needs no Meta-gated HF token.
+    ("meta-llama/Llama-3.2-1B-Instruct", "auto"),
     ("openai/gpt-oss-20b", "gpt-oss"),
     ("Qwen/Qwen2.5-0.5B-Instruct", "default"),
 ]
@@ -139,7 +138,7 @@ def _skip_gpt_oss_for_hf_parity_tests(request):
 def _skip_llama_for_hf_parity_tests(request):
     callspec = getattr(request.node, "callspec", None)
     model_name = callspec.params.get("model_name") if callspec else None
-    if model_name != "unsloth/Llama-3.2-1B-Instruct":
+    if model_name != "meta-llama/Llama-3.2-1B-Instruct":
         return
     test_file = os.path.basename(str(request.node.fspath))
     if test_file in _LLAMA_HF_PARITY_TEST_FILES:
diff --git a/tests/test_llama_3.py b/tests/test_llama_3.py
index 018c2f3..c6e4c75 100644
--- a/tests/test_llama_3.py
+++ b/tests/test_llama_3.py
@@ -1,10 +1,10 @@
 """Llama-3 renderer coverage.
 
 Covers ``Llama3Renderer`` and the ``meta-llama/Llama-3.2-{1B,3B}-Instruct``
-entries in ``MODEL_RENDERER_MAP``. Tokenizers are loaded via the
-unrestricted ``unsloth/Llama-3.2-{1B,3B}-Instruct`` mirrors (verified
-byte-identical chat templates) so CI doesn't need an HF token with Meta
-license access.
+entries in ``MODEL_RENDERER_MAP``. ``load_tokenizer`` uses the
+unrestricted ``unsloth/Llama-3.2-{1B,3B}-Instruct`` mirrors underneath
+(verified byte-identical chat templates) so CI doesn't need an HF token
+with Meta license access.
 """
 
 from __future__ import annotations
@@ -34,7 +34,7 @@
 @pytest.fixture(scope="module", params=_MODEL_PAIRS, ids=[m for m, _ in _MODEL_PAIRS])
 def llama_pair(request):
     canonical, mirror = request.param
-    tok = load_tokenizer(mirror)
+    tok = load_tokenizer(canonical)
     renderer = Llama3Renderer(tok, Llama3RendererConfig(date_string=_PINNED_DATE))
     return canonical, mirror, tok, renderer
 
@@ -58,6 +58,14 @@ def test_create_renderer_via_explicit_config(llama_pair):
     assert isinstance(r, Llama3Renderer)
 
 
+def test_create_renderer_auto_resolves_after_mirror_load(llama_pair):
+    """``load_tokenizer(canonical_meta_id)`` loads from the unrestricted
+    mirror but preserves the canonical name needed for auto-resolution."""
+    canonical, _, tok, _ = llama_pair
+    assert tok.name_or_path == canonical
+    assert isinstance(create_renderer(tok), Llama3Renderer)
+
+
 # ---------------------------------------------------------------------------
 # Constructor contract
 # ---------------------------------------------------------------------------
diff --git a/tests/test_load_tokenizer.py b/tests/test_load_tokenizer.py
index 5445687..b2e49c8 100644
--- a/tests/test_load_tokenizer.py
+++ b/tests/test_load_tokenizer.py
@@ -9,9 +9,11 @@
 from __future__ import annotations
 
 import re
+from types import SimpleNamespace
 from unittest.mock import patch
 
-from renderers.base import TRUSTED_REVISIONS, load_tokenizer
+from renderers import base
+from renderers.base import TOKENIZER_SOURCE_OVERRIDES, TRUSTED_REVISIONS, load_tokenizer
 
 
 # ---------------------------------------------------------------------------
@@ -70,6 +72,23 @@ def test_kimi_loads_with_pinned_revision(mock_from_pretrained):
     }
 
 
+@patch("transformers.AutoTokenizer.from_pretrained")
+def test_meta_llama_loads_tokenizer_from_unsloth_mirror(mock_from_pretrained):
+    """Canonical Meta Llama repos are gated; load their tokenizer/chat
+    template from the audited unrestricted mirror while preserving the
+    canonical name for renderer auto-resolution."""
+    canonical = "meta-llama/Llama-3.2-1B-Instruct"
+    mirror = "unsloth/Llama-3.2-1B-Instruct"
+    mock_from_pretrained.return_value = SimpleNamespace(name_or_path=mirror)
+
+    tok = load_tokenizer(canonical, use_fastokens=False)
+
+    args, kwargs = mock_from_pretrained.call_args
+    assert args == (mirror,)
+    assert kwargs == {"trust_remote_code": False}
+    assert tok.name_or_path == canonical
+
+
 @patch("transformers.AutoTokenizer.from_pretrained")
 def test_unknown_path_falls_through_to_no_remote_code(mock_from_pretrained):
     """Unknown / fine-tuned model paths — including ``moonshotai/Kimi-K2*``
@@ -92,6 +111,53 @@ def test_unknown_path_falls_through_to_no_remote_code(mock_from_pretrained):
     )
 
 
+def test_tokenizer_source_overrides_are_exact_llama_mirrors():
+    """Mirror overrides are intentionally narrow: only verified
+    byte-identical Llama tokenizer/template mirrors should live here."""
+    assert TOKENIZER_SOURCE_OVERRIDES == {
+        "meta-llama/Llama-3.2-1B-Instruct": "unsloth/Llama-3.2-1B-Instruct",
+        "meta-llama/Llama-3.2-3B-Instruct": "unsloth/Llama-3.2-3B-Instruct",
+    }
+
+
+def test_offset_tokenizer_uses_unsloth_mirror_for_meta_llama(monkeypatch):
+    """Offset-tokenizer reloads must use the same unrestricted source
+    override, otherwise Llama rendering can hit the gated Meta repo after
+    the initial tokenizer load succeeds."""
+
+    class _NoOffsets:
+        name_or_path = "meta-llama/Llama-3.2-1B-Instruct"
+
+        def __call__(self, *args, **kwargs):
+            raise NotImplementedError("fastokens shim has no offsets")
+
+    class _OffsetTokenizer:
+        is_fast = True
+
+        def __init__(self, name_or_path: str):
+            self.name_or_path = name_or_path
+
+        def __call__(self, *args, **kwargs):
+            return {"offset_mapping": [(0, 1)]}
+
+    calls = []
+
+    def _fake_load(name_or_path, **kwargs):
+        calls.append((name_or_path, kwargs))
+        return _OffsetTokenizer(name_or_path)
+
+    base._offset_tokenizers.clear()
+    monkeypatch.setattr(base, "_load_tokenizer_via_auto", _fake_load)
+
+    try:
+        tok = base._get_offset_tokenizer(_NoOffsets())
+    finally:
+        base._offset_tokenizers.clear()
+
+    assert calls == [("unsloth/Llama-3.2-1B-Instruct", {"trust_remote_code": False})]
+    assert tok.name_or_path == "meta-llama/Llama-3.2-1B-Instruct"
+
+
 # ---------------------------------------------------------------------------
 # Smoke: real tokenizer loads behave as expected
 # ---------------------------------------------------------------------------
diff --git a/tests/test_preserve_thinking.py b/tests/test_preserve_thinking.py
index 75b739b..daa4836 100644
--- a/tests/test_preserve_thinking.py
+++ b/tests/test_preserve_thinking.py
@@ -53,6 +53,8 @@ def _make(tokenizer, renderer_name, **flags):
     "poolside/Laguna-XS.2",
     # Llama-3 has no reasoning channel at all — preserve flags can't add
     # or drop anything, so they're pure no-ops.
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "meta-llama/Llama-3.2-3B-Instruct",
     "unsloth/Llama-3.2-1B-Instruct",
 }
 
@@ -324,6 +326,8 @@ def test_preserve_btc_on_live_cycle_matches_all(
     "Qwen/Qwen3-VL-30B-A3B-Instruct",
     # Llama-3 ships no rendering path, so reasoning_content never
     # surfaces in the output regardless of the preserve flags.
+    "meta-llama/Llama-3.2-1B-Instruct",
+    "meta-llama/Llama-3.2-3B-Instruct",
     "unsloth/Llama-3.2-1B-Instruct",
 }
 