feat(tests): add --models-dir flag for read-only HF cache (#8362)

Signed-off-by: rrubin <rrubin@nvidia.com>

feat(tests): add --models-dir flag for read-only HF cache (#8362)
Signed-off-by: rrubin <rrubin@nvidia.com>
b6885977 · Ran Rubin · GitHub · 4b3cd459 · b6885977 · b6885977
Unverified Commit b6885977 authored Apr 21, 2026 by Ran Rubin Committed by GitHub Apr 21, 2026
5 changed files
--- a/.ai/pytest-guidelines.md
+++ b/.ai/pytest-guidelines.md
@@ -36,6 +36,34 @@ export HF_HUB_OFFLINE=1 HF_TOKEN="$(cat ~/.cache/huggingface/token)"
 python3 -m pytest -xvv --basetemp=/tmp/pytest_temp --durations=0 tests/
 ```

+### Running against a pre-populated local model cache
+
+If you have models already downloaded into a read-only directory (e.g. a shared
+NFS mount or a bind-mounted volume), pass `--models-dir` to skip all network
+downloads and avoid any writes to the cache:
+
+```bash
+python3 -m pytest --models-dir=/path/to/hf_cache -xvv tests/serve/test_vllm.py
+```
+
+`--models-dir` accepts either a **bare `HF_HUB_CACHE` directory** (holding
+`models--org--name/` subdirs) or an **`HF_HOME` directory**; the layout is
+auto-detected (`hub/` subdirectory present → `HF_HOME`, otherwise `HF_HUB_CACHE`).
+A warning is logged for the `HF_HOME` layout so you can verify the choice is correct.
+
+What `--models-dir` does:
+- Sets `HF_HUB_CACHE` (or `HF_HOME`) to the supplied path.
+- Enables `HF_HUB_OFFLINE=1` and `TRANSFORMERS_OFFLINE=1` — no network calls.
+- Short-circuits `predownload_models` and `predownload_tokenizers` — no writes to
+  the cache directory.
+- Sets `DYNAMO_MODELS_DIR` — code that would perform network downloads (e.g. LoRA
+  adapters in `download_lora()`) will `pytest.skip()` instead of failing.
+
+**LoRA tests are incompatible with `--models-dir`** because they download adapters
+from HuggingFace Hub at test time. Tests that call `download_lora()` will be
+skipped automatically with a clear message when the flag is active. To run LoRA
+tests locally, omit `--models-dir` and ensure `HF_TOKEN` is set.
+
 - `python3 -m pytest` ensures the venv's pytest runs with the correct `sys.path`.
  The system `pytest` at `/usr/local/bin/pytest` is **outside** the venv and cannot
  see venv-installed packages (like `dynamo`).

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -12,6 +12,12 @@ from typing import Generator, Optional
 import pytest
 from filelock import FileLock

+from tests.hf_cache import (
+    _apply_models_dir_env,
+    _disable_offline_with_mistral_patch,
+    _enable_offline_with_mistral_patch,
+    _restore_models_dir_env,
+)
 from tests.utils.constants import TEST_MODELS, DefaultPort
 from tests.utils.managed_process import ManagedProcess
 from tests.utils.port_utils import (
@@ -76,6 +82,21 @@ def pytest_addoption(parser: pytest.Parser) -> None:
        default=False,
        help="Show which tests would run vs skip based on --max-vram-gib, then exit.",
    )
+    # -------------------------------------------------------------------------
+    # Model cache options
+    # -------------------------------------------------------------------------
+    # NOTE: if you add a new option here, also add it to the forwarding list
+    # in pytest_runtestloop (search for "opt_name, cli_flag" in this file).
+    parser.addoption(
+        "--models-dir",
+        type=str,
+        default=None,
+        help=(
+            "Path to a pre-populated HuggingFace cache (read-only safe). "
+            "Enables HF_HUB_OFFLINE mode and skips predownload fixtures. "
+            "See .ai/pytest-guidelines.md for full details."
+        ),
+    )


 def pytest_runtest_setup(item):
@@ -127,7 +148,14 @@ logging.basicConfig(


 def pytest_configure(config: pytest.Config) -> None:
-    """Detect GPUs for --max-vram-gib planning and parallel execution."""
+    """Configure session: validate --models-dir and detect GPUs for --max-vram-gib."""
+    models_dir = config.getoption("--models-dir", default=None)
+    if models_dir and not Path(models_dir).is_dir():
+        pytest.exit(
+            f"--models-dir: directory does not exist: {models_dir}",
+            returncode=2,
+        )
+
    vram_limit = config.getoption("max_vram_gib", default=None)
    if vram_limit is None:
        return
@@ -227,6 +255,9 @@ def pytest_runtestloop(session: pytest.Session) -> bool | None:
        val = config.getoption(opt_name, default=None)
        if val is not None:
            extra_args.extend([cli_flag, str(val)])
+    models_dir = config.getoption("--models-dir", default=None)
+    if models_dir is not None:
+        extra_args.extend(["--models-dir", str(models_dir)])
    if config.getoption("skip_service_restart", default=None):
        extra_args.append("--skip-service-restart")

@@ -333,97 +364,44 @@ def download_models(model_list=None, ignore_weights=False):
        )


-def _enable_offline_with_mistral_patch():
-    """Set HF_HUB_OFFLINE=1 and work around a transformers 4.57.3 regression.
-
-    transformers 4.57.3 (PR #42389) introduced _patch_mistral_regex which calls
-    huggingface_hub.model_info() unconditionally for every tokenizer load — even
-    non-Mistral models with fully cached weights. This API call fails when
-    HF_HUB_OFFLINE=1.
+_download_lock_path = os.path.join(tempfile.gettempdir(), "pytest_model_download.lock")

-    Since tests launch TRT-LLM workers as subprocesses that inherit env vars but
-    not in-process monkey-patches, we inject the fix via a sitecustomize.py on
-    PYTHONPATH so every subprocess auto-applies it at startup.

-    Upstream bug: https://github.com/huggingface/transformers/issues/44843
+@pytest.fixture(scope="session", autouse=True)
+def _models_dir_env(pytestconfig):
+    """Set up HF env vars for --models-dir mode. No-op when flag is absent.

-    TODO: Remove this workaround once transformers ships a fix and TRT-LLM (or
-    any other dependency) upgrades to that fixed version.
+    Session-scoped: runs once per worker process. Under pytest-xdist each worker
+    applies and restores env vars independently — there is no cross-worker
+    coordination needed since env vars are process-local.
    """
-    os.environ["HF_HUB_OFFLINE"] = "1"
-
-    # Apply the patch in this process
+    models_dir = pytestconfig.getoption("--models-dir")
+    if not models_dir:
+        yield
+        return
+    orig = _apply_models_dir_env(models_dir)
    try:
-        from huggingface_hub.errors import OfflineModeIsEnabled
-        from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-
-        original = PreTrainedTokenizerBase._patch_mistral_regex
-
-        @classmethod  # type: ignore[misc]
-        def _safe_patch(cls, tokenizer, *args, **kwargs):
-            try:
-                return original.__func__(cls, tokenizer, *args, **kwargs)
-            except OfflineModeIsEnabled:
-                return tokenizer
-
-        PreTrainedTokenizerBase._patch_mistral_regex = _safe_patch
-    except (ImportError, AttributeError):
-        return  # transformers version without _patch_mistral_regex — nothing to do
-
-    # Write a sitecustomize.py so subprocesses also get the patch.
-    # Use a per-worker dir under xdist to avoid write races.
-    worker_id = os.environ.get("PYTEST_XDIST_WORKER", "main")
-    patch_dir = os.path.join(tempfile.gettempdir(), f"dynamo_test_hf_patch_{worker_id}")
-    os.makedirs(patch_dir, exist_ok=True)
-    with open(os.path.join(patch_dir, "sitecustomize.py"), "w") as f:
-        f.write(
-            "import os\n"
-            "if os.environ.get('HF_HUB_OFFLINE') == '1':\n"
-            "    try:\n"
-            "        from transformers.tokenization_utils_base import"
-            " PreTrainedTokenizerBase as _T\n"
-            "        from huggingface_hub.errors import"
-            " OfflineModeIsEnabled as _E\n"
-            "        _orig = _T._patch_mistral_regex\n"
-            "        @classmethod\n"
-            "        def _safe(cls, tokenizer, *a, **kw):\n"
-            "            try:\n"
-            "                return _orig.__func__(cls, tokenizer, *a, **kw)\n"
-            "            except _E:\n"
-            "                return tokenizer\n"
-            "        _T._patch_mistral_regex = _safe\n"
-            "    except (ImportError, AttributeError):\n"
-            "        pass\n"
-        )
-    pythonpath = os.environ.get("PYTHONPATH", "")
-    os.environ["PYTHONPATH"] = f"{patch_dir}:{pythonpath}" if pythonpath else patch_dir
-    logging.info(
-        "Enabled HF_HUB_OFFLINE with _patch_mistral_regex workaround "
-        "(see https://github.com/huggingface/transformers/issues/44843)"
-    )
-
-
-def _disable_offline_with_mistral_patch():
-    """Undo _enable_offline_with_mistral_patch."""
-    os.environ.pop("HF_HUB_OFFLINE", None)
-    worker_id = os.environ.get("PYTEST_XDIST_WORKER", "main")
-    patch_dir = os.path.join(tempfile.gettempdir(), f"dynamo_test_hf_patch_{worker_id}")
-    pythonpath = os.environ.get("PYTHONPATH", "")
-    os.environ["PYTHONPATH"] = pythonpath.replace(f"{patch_dir}:", "").replace(
-        patch_dir, ""
-    )
-
-
-_download_lock_path = os.path.join(tempfile.gettempdir(), "pytest_model_download.lock")
+        yield
+    finally:
+        _restore_models_dir_env(orig)


 @pytest.fixture(scope="session")
-def predownload_models(pytestconfig):
+def predownload_models(pytestconfig, _models_dir_env):
    """Fixture wrapper around download_models for models used in collected tests.

    Uses a file lock so that under xdist, only one worker downloads at a time
    and the rest reuse the HuggingFace cache.
+
+    When --models-dir is passed, _models_dir_env has already set up HF env vars;
+    this fixture simply yields without downloading.
+
+    _models_dir_env is declared as a dependency to ensure HF env vars are
+    configured before any download attempt, even though its yielded value is unused.
    """
+    if pytestconfig.getoption("--models-dir"):
+        yield
+        return
    models = getattr(pytestconfig, "models_to_download", None)
    with FileLock(_download_lock_path):
        if models:
@@ -440,11 +418,20 @@ def predownload_models(pytestconfig):


 @pytest.fixture(scope="session")
-def predownload_tokenizers(pytestconfig):
+def predownload_tokenizers(pytestconfig, _models_dir_env):
    """Fixture wrapper around download_models for tokenizers used in collected tests.

    Uses a file lock so that under xdist, only one worker downloads at a time.
+
+    When --models-dir is passed, _models_dir_env has already set up HF env vars;
+    this fixture simply yields without downloading.
+
+    _models_dir_env is declared as a dependency to ensure HF env vars are
+    configured before any download attempt, even though its yielded value is unused.
    """
+    if pytestconfig.getoption("--models-dir"):
+        yield
+        return
    models = getattr(pytestconfig, "models_to_download", None)
    with FileLock(_download_lock_path):
        if models:

--- a/tests/hf_cache.py
+++ b/tests/hf_cache.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import os
+import shutil
+import tempfile
+import textwrap
+from pathlib import Path
+
+_mistral_patch_applied: bool = False
+
+
+def _enable_offline_with_mistral_patch():
+    """Set HF_HUB_OFFLINE=1 and work around a transformers 4.57.3 regression.
+
+    transformers 4.57.3 (PR #42389) introduced _patch_mistral_regex which calls
+    huggingface_hub.model_info() unconditionally for every tokenizer load — even
+    non-Mistral models with fully cached weights. This API call fails when
+    HF_HUB_OFFLINE=1.
+
+    Since tests launch TRT-LLM workers as subprocesses that inherit env vars but
+    not in-process monkey-patches, we inject the fix via a sitecustomize.py on
+    PYTHONPATH so every subprocess auto-applies it at startup.
+
+    _mistral_patch_applied guards the class-level patch and PYTHONPATH injection
+    so they run at most once per enable/disable cycle. _disable_offline_with_mistral_patch
+    resets the flag so a subsequent enable call re-injects PYTHONPATH; the class-level
+    re-application on that second call is harmless — it adds one extra try/except layer
+    that behaves identically to the first.
+
+    Side effects, in order:
+      1. HF_HUB_OFFLINE is set to "1" (always, even on the early-return paths).
+      2. PreTrainedTokenizerBase._patch_mistral_regex is replaced in this process
+         by a wrapper that returns the tokenizer unchanged when the offline error
+         is raised. Nothing in this module restores the original attribute.
+      3. <tempdir>/dynamo_test_hf_patch_<worker>/sitecustomize.py is (re)written.
+      4. That directory is prepended to PYTHONPATH.
+
+    If transformers cannot be imported, or it has no _patch_mistral_regex
+    attribute, the function returns after step 1 and leaves
+    _mistral_patch_applied False.
+
+    Upstream bug: https://github.com/huggingface/transformers/issues/44843
+
+    TODO: Remove this workaround once transformers ships a fix and TRT-LLM (or
+    any other dependency) upgrades to that fixed version.
+    """
+    global _mistral_patch_applied
+    os.environ["HF_HUB_OFFLINE"] = "1"
+    if _mistral_patch_applied:
+        return  # class patch and sitecustomize already applied for this cycle
+
+    # Resolve OfflineModeIsEnabled before touching transformers. If huggingface_hub
+    # predates the .errors module, transformers 4.57.3+ imports OfflineModeIsEnabled
+    # lazily inside _patch_mistral_regex, so that call itself raises ImportError under
+    # offline mode — using ImportError as the fallback catches that exact error.
+    try:
+        from huggingface_hub.errors import OfflineModeIsEnabled
+    except ImportError:
+        OfflineModeIsEnabled = ImportError  # type: ignore[assignment,misc]
+
+    # Apply the patch in this process
+    try:
+        from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+        # Raises AttributeError (handled below) when the attribute does not exist.
+        original = PreTrainedTokenizerBase._patch_mistral_regex
+
+        @classmethod  # type: ignore[misc]
+        def _safe_patch(cls, tokenizer, *args, **kwargs):
+            try:
+                # Call the underlying function with the actual receiving class.
+                # Presumably the upstream attribute is a classmethod, which is
+                # why __func__ is available — confirm against transformers.
+                return original.__func__(cls, tokenizer, *args, **kwargs)
+            except OfflineModeIsEnabled:
+                # Offline lookup failed: hand back the tokenizer untouched.
+                return tokenizer
+
+        PreTrainedTokenizerBase._patch_mistral_regex = _safe_patch
+    except (ImportError, AttributeError):
+        return  # transformers version without _patch_mistral_regex — nothing to do
+
+    # Write a sitecustomize.py so subprocesses also get the patch.
+    # Use a per-worker dir under xdist to avoid write races.
+    worker_id = os.environ.get("PYTEST_XDIST_WORKER", "main")
+    patch_dir = os.path.join(tempfile.gettempdir(), f"dynamo_test_hf_patch_{worker_id}")
+    os.makedirs(patch_dir, exist_ok=True)
+    # The generated module mirrors the in-process patch above and only acts
+    # when the subprocess sees HF_HUB_OFFLINE == '1'.
+    with open(os.path.join(patch_dir, "sitecustomize.py"), "w") as f:
+        f.write(
+            textwrap.dedent(
+                """\
+            import os
+            if os.environ.get('HF_HUB_OFFLINE') == '1':
+                try:
+                    from transformers.tokenization_utils_base import PreTrainedTokenizerBase as _T
+                    try:
+                        from huggingface_hub.errors import OfflineModeIsEnabled as _E
+                    except ImportError:
+                        _E = ImportError
+                    _orig = _T._patch_mistral_regex
+                    @classmethod
+                    def _safe(cls, tokenizer, *a, **kw):
+                        try:
+                            return _orig.__func__(cls, tokenizer, *a, **kw)
+                        except _E:
+                            return tokenizer
+                    _T._patch_mistral_regex = _safe
+                except (ImportError, AttributeError):
+                    pass
+        """
+            )
+        )
+    # Drop empty components (e.g. from a leading/trailing ':') and put patch_dir
+    # first so its sitecustomize.py is found ahead of existing entries.
+    # NOTE(review): the separator is a literal ':' rather than os.pathsep, so
+    # this assumes a POSIX-style PYTHONPATH — confirm that is the only target.
+    existing_entries = [e for e in os.environ.get("PYTHONPATH", "").split(":") if e]
+    os.environ["PYTHONPATH"] = ":".join([patch_dir] + existing_entries)
+    logging.info(
+        "Enabled HF_HUB_OFFLINE with _patch_mistral_regex workaround "
+        "(see https://github.com/huggingface/transformers/issues/44843)"
+    )
+    # Set only after every step above completed.
+    _mistral_patch_applied = True
+
+
+def _disable_offline_with_mistral_patch():
+    """Undo _enable_offline_with_mistral_patch.
+
+    Removes HF_HUB_OFFLINE from the environment, strips this worker's patch
+    directory from PYTHONPATH (unsetting PYTHONPATH if nothing else remains),
+    deletes the patch directory from disk, and resets _mistral_patch_applied.
+
+    The in-process replacement of PreTrainedTokenizerBase._patch_mistral_regex
+    is not reverted here. Every step tolerates the state already being clean,
+    so calling this without a prior enable does not raise.
+    """
+    global _mistral_patch_applied
+    os.environ.pop("HF_HUB_OFFLINE", None)
+    # Recompute the same per-worker directory name that enable used.
+    worker_id = os.environ.get("PYTEST_XDIST_WORKER", "main")
+    patch_dir = os.path.join(tempfile.gettempdir(), f"dynamo_test_hf_patch_{worker_id}")
+    pythonpath = os.environ.get("PYTHONPATH", "")
+    # Exact-match filtering: only entries equal to patch_dir (and empty
+    # components) are dropped; the order of the remaining entries is kept.
+    result = ":".join(e for e in pythonpath.split(":") if e and e != patch_dir)
+    if result:
+        os.environ["PYTHONPATH"] = result
+    else:
+        # Nothing left: remove the variable rather than leaving it empty.
+        os.environ.pop("PYTHONPATH", None)
+    # ignore_errors=True: a missing directory is not an error.
+    shutil.rmtree(patch_dir, ignore_errors=True)
+    _mistral_patch_applied = False
+
+
+# Transformers cache-location variables that _apply_models_dir_env unsets
+# (presumably so they cannot redirect lookups away from --models-dir — confirm).
+_TRANSFORMERS_CACHE_OVERRIDE_KEYS = (
+    "TRANSFORMERS_CACHE",
+    "PYTORCH_TRANSFORMERS_CACHE",
+    "PYTORCH_PRETRAINED_BERT_CACHE",
+)
+
+# Keys managed by _apply_models_dir_env / _restore_models_dir_env: their values
+# are snapshotted before being changed and put back afterwards.
+# PYTHONPATH is intentionally excluded: _disable_offline_with_mistral_patch()
+# removes its entry by exact-match list filtering (idempotent, needs no snapshot).
+_MODELS_DIR_ENV_KEYS = (
+    "HF_HUB_CACHE",
+    "HF_HOME",
+    *_TRANSFORMERS_CACHE_OVERRIDE_KEYS,
+    "HF_HUB_OFFLINE",
+    "TRANSFORMERS_OFFLINE",
+    "DYNAMO_MODELS_DIR",
+)
+
+
+def _apply_models_dir_env(models_dir: str) -> dict:
+    """Set HF env vars for read-only cache mode. Returns original env values.
+
+    Layout detection: if ``models_dir`` contains a ``hub`` subdirectory it is
+    treated as an HF_HOME directory (HF_HOME is set, HF_HUB_CACHE is unset);
+    otherwise it is treated as a bare hub cache (HF_HUB_CACHE is set, HF_HOME
+    is unset).
+
+    Args:
+        models_dir: Path to the pre-populated cache directory. It is not
+            validated here.
+
+    Returns:
+        A dict mapping every key in _MODELS_DIR_ENV_KEYS to its value before
+        this call, or None when the variable was not set. Pass it to
+        _restore_models_dir_env to undo the changes.
+    """
+    # Snapshot first so the restore step can put back exactly what was there.
+    orig = {k: os.environ.get(k) for k in _MODELS_DIR_ENV_KEYS}
+    if (Path(models_dir) / "hub").is_dir():
+        logging.warning(
+            "--models-dir: detected HF_HOME layout (hub/ subdirectory found). "
+            "If this is wrong (e.g. you have a model named hub/), rename hub/ "
+            "or pass a bare HF_HUB_CACHE directory instead."
+        )
+        os.environ.pop("HF_HUB_CACHE", None)  # clear so HF_HOME takes effect
+        os.environ["HF_HOME"] = models_dir
+    else:
+        logging.info("--models-dir: detected bare HF_HUB_CACHE layout")
+        os.environ.pop("HF_HOME", None)  # clear for consistency
+        os.environ["HF_HUB_CACHE"] = models_dir
+    # Unset the transformers cache-location overrides in both layouts.
+    for key in _TRANSFORMERS_CACHE_OVERRIDE_KEYS:
+        os.environ.pop(key, None)
+    os.environ["HF_HUB_OFFLINE"] = "1"
+    os.environ["TRANSFORMERS_OFFLINE"] = "1"
+    # Marker read by tests/serve/lora_utils.py download_lora() to skip downloads.
+    os.environ["DYNAMO_MODELS_DIR"] = models_dir
+    _enable_offline_with_mistral_patch()  # activates sitecustomize for Mistral tokenizer workaround
+    return orig
+
+
+def _restore_models_dir_env(orig: dict) -> None:
+    """Undo _apply_models_dir_env. Call after fixture yield.
+
+    Args:
+        orig: The snapshot returned by _apply_models_dir_env, mapping each
+            managed key to its previous value (None meaning "was not set").
+    """
+    # _disable pops HF_HUB_OFFLINE; the loop below then restores the original value
+    # (no-op if orig was None, set-back if orig had a pre-existing value). Safe.
+    _disable_offline_with_mistral_patch()  # pops HF_HUB_OFFLINE + cleans sitecustomize
+    for k, v in orig.items():
+        if v is None:
+            # Variable did not exist before apply: make sure it is gone again.
+            os.environ.pop(k, None)
+        else:
+            os.environ[k] = v
--- a/tests/serve/lora_utils.py
+++ b/tests/serve/lora_utils.py
@@ -19,6 +19,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Optional

 import boto3
+import pytest
 import requests
 from botocore.client import Config
 from botocore.exceptions import ClientError
@@ -232,7 +233,16 @@ class MinioService:
                raise RuntimeError(f"Failed to check bucket: {e}") from e

    def download_lora(self) -> str:
-        """Download LoRA from Hugging Face Hub, returns temp directory path."""
+        """Download LoRA from Hugging Face Hub, returns temp directory path.
+
+        Skips via pytest.skip() when DYNAMO_MODELS_DIR is set (--models-dir active).
+        """
+        if os.environ.get("DYNAMO_MODELS_DIR"):
+            pytest.skip(
+                "--models-dir is active (read-only cache mode): LoRA network download suppressed. "
+                "Pre-stage LoRA adapters into the cache or omit --models-dir to enable downloads."
+            )
+
        self._temp_download_dir = tempfile.mkdtemp(prefix="lora_download_")
        self._logger.info(
            f"Downloading LoRA {self.config.lora_repo} to {self._temp_download_dir}"

--- a/tests/test_models_dir_flag.py
+++ b/tests/test_models_dir_flag.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+import tests.hf_cache as hf_cache
+from tests.hf_cache import (
+    _MODELS_DIR_ENV_KEYS,
+    _TRANSFORMERS_CACHE_OVERRIDE_KEYS,
+    _apply_models_dir_env,
+    _disable_offline_with_mistral_patch,
+    _enable_offline_with_mistral_patch,
+    _restore_models_dir_env,
+)
+from tests.serve.lora_utils import MinioLoraConfig, MinioService
+
+
+@pytest.mark.pre_merge
+@pytest.mark.unit
+@pytest.mark.gpu_0
+def test_apply_bare_cache_layout(tmp_path, monkeypatch):
+    """Without a hub/ subdirectory, apply sets HF_HUB_CACHE and the offline flags."""
+    # Start from a clean environment so the assertions reflect only what
+    # _apply_models_dir_env itself set or removed.
+    for k in _MODELS_DIR_ENV_KEYS:
+        monkeypatch.delenv(k, raising=False)
+    monkeypatch.delenv("PYTHONPATH", raising=False)
+    orig = _apply_models_dir_env(str(tmp_path))
+    try:
+        assert os.environ["HF_HUB_CACHE"] == str(tmp_path)
+        assert "HF_HOME" not in os.environ
+        assert os.environ["HF_HUB_OFFLINE"] == "1"
+        assert os.environ["TRANSFORMERS_OFFLINE"] == "1"
+        assert os.environ["DYNAMO_MODELS_DIR"] == str(tmp_path)
+        for k in _TRANSFORMERS_CACHE_OVERRIDE_KEYS:
+            assert k not in os.environ
+    finally:
+        # Restore even when an assertion fails so env changes do not leak.
+        _restore_models_dir_env(orig)
+
+
+@pytest.mark.pre_merge
+@pytest.mark.unit
+@pytest.mark.gpu_0
+def test_apply_hf_home_layout(tmp_path, monkeypatch):
+    """With a hub/ subdirectory present, apply sets HF_HOME and unsets HF_HUB_CACHE."""
+    # Start from a clean environment so the assertions reflect only what
+    # _apply_models_dir_env itself set or removed.
+    for k in _MODELS_DIR_ENV_KEYS:
+        monkeypatch.delenv(k, raising=False)
+    monkeypatch.delenv("PYTHONPATH", raising=False)
+    # The presence of hub/ is what selects the HF_HOME layout.
+    (tmp_path / "hub").mkdir()
+    orig = _apply_models_dir_env(str(tmp_path))
+    try:
+        assert os.environ["HF_HOME"] == str(tmp_path)
+        assert "HF_HUB_CACHE" not in os.environ
+        assert os.environ["HF_HUB_OFFLINE"] == "1"
+        assert os.environ["TRANSFORMERS_OFFLINE"] == "1"
+        assert os.environ["DYNAMO_MODELS_DIR"] == str(tmp_path)
+        for k in _TRANSFORMERS_CACHE_OVERRIDE_KEYS:
+            assert k not in os.environ
+    finally:
+        # Restore even when an assertion fails so env changes do not leak.
+        _restore_models_dir_env(orig)
+
+
+@pytest.mark.pre_merge
+@pytest.mark.unit
+@pytest.mark.gpu_0
+def test_restore_clears_vars_that_were_absent(tmp_path, monkeypatch):
+    """Variables that were unset before apply are unset again after restore."""
+    for k in _MODELS_DIR_ENV_KEYS:
+        monkeypatch.delenv(k, raising=False)
+    monkeypatch.delenv("PYTHONPATH", raising=False)
+    orig = _apply_models_dir_env(str(tmp_path))
+    _restore_models_dir_env(orig)
+    for k in _MODELS_DIR_ENV_KEYS:
+        assert k not in os.environ
+    # PYTHONPATH is not in the snapshot; it is cleaned by the disable step.
+    assert "PYTHONPATH" not in os.environ
+
+
+@pytest.mark.pre_merge
+@pytest.mark.unit
+@pytest.mark.gpu_0
+@pytest.mark.parametrize("use_hf_home", [False, True])
+def test_restore_preserves_preexisting_values(tmp_path, monkeypatch, use_hf_home):
+    """Apply followed by restore puts back every pre-existing value, in both layouts."""
+    if use_hf_home:
+        (tmp_path / "hub").mkdir()
+    # Give every managed key a distinct sentinel so a wrong restore is visible.
+    sentinel = {k: f"preexisting_{k}" for k in _MODELS_DIR_ENV_KEYS}
+    for k, v in sentinel.items():
+        monkeypatch.setenv(k, v)
+    orig = _apply_models_dir_env(str(tmp_path))
+    _restore_models_dir_env(orig)
+    for k, v in sentinel.items():
+        assert os.environ[k] == v
+
+
+@pytest.mark.pre_merge
+@pytest.mark.unit
+@pytest.mark.gpu_0
+@pytest.mark.timeout(60)
+def test_models_dir_nonexistent_exits_with_code_2(tmp_path):
+    """A child pytest run with a missing --models-dir exits 2 and says so."""
+    missing = tmp_path / "no_such_dir"
+    # Run from the project root so conftest.py is discovered and --models-dir
+    # is registered before pytest_configure fires.
+    # Note: the child pytest process collects from this file itself — keep
+    # module-level imports here side-effect-free to avoid spurious child failures.
+    project_root = Path(__file__).parents[1]
+    # The subprocess timeout (30s) is shorter than this test's timeout marker
+    # (60s), so a hung child is reported by subprocess.run first.
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "pytest",
+            f"--models-dir={missing}",
+            "--collect-only",
+            "tests/test_models_dir_flag.py",
+        ],
+        capture_output=True,
+        text=True,
+        cwd=str(project_root),
+        timeout=30,
+    )
+    assert result.returncode == 2
+    # The message may land on either stream, so check both.
+    assert "does not exist" in result.stderr + result.stdout
+
+
+@pytest.mark.pre_merge
+@pytest.mark.unit
+@pytest.mark.gpu_0
+def test_download_lora_skips_in_models_dir_mode(tmp_path, monkeypatch):
+    """download_lora() raises pytest's skip exception when DYNAMO_MODELS_DIR is set."""
+    monkeypatch.setenv("DYNAMO_MODELS_DIR", str(tmp_path))
+    service = MinioService(MinioLoraConfig())
+    # The skip surfaces as an exception, so it can be asserted on directly.
+    with pytest.raises(pytest.skip.Exception, match="read-only cache mode"):
+        service.download_lora()
+
+
+@pytest.mark.pre_merge
+@pytest.mark.unit
+@pytest.mark.gpu_0
+def test_disable_removes_patch_dir(monkeypatch):
+    """_disable_offline_with_mistral_patch cleans up the sitecustomize patch directory."""
+    import tempfile
+
+    monkeypatch.delenv("PYTHONPATH", raising=False)
+    monkeypatch.delenv("HF_HUB_OFFLINE", raising=False)
+    monkeypatch.setattr(hf_cache, "_mistral_patch_applied", False)
+
+    # Same directory naming scheme that hf_cache uses for its patch dir.
+    worker_id = os.environ.get("PYTEST_XDIST_WORKER", "main")
+    patch_dir = os.path.join(tempfile.gettempdir(), f"dynamo_test_hf_patch_{worker_id}")
+
+    # Build the on-disk and PYTHONPATH state by hand instead of calling
+    # enable, so the test does not depend on transformers being installed.
+    os.makedirs(patch_dir, exist_ok=True)
+    (Path(patch_dir) / "sitecustomize.py").write_text("# stub")
+    monkeypatch.setenv("PYTHONPATH", patch_dir)
+
+    _disable_offline_with_mistral_patch()
+
+    assert not Path(patch_dir).exists()
+    # patch_dir was the only entry, so the variable must be removed entirely.
+    assert "PYTHONPATH" not in os.environ
+
+
+@pytest.mark.pre_merge
+@pytest.mark.unit
+@pytest.mark.gpu_0
+def test_enable_normalizes_pythonpath_empty_components(monkeypatch):
+    """_enable_offline_with_mistral_patch filters empty components from PYTHONPATH."""
+    # Leading and trailing ':' produce empty components when split.
+    monkeypatch.setenv("PYTHONPATH", ":some:existing:path:")
+    monkeypatch.delenv("HF_HUB_OFFLINE", raising=False)
+    # Reset the guard so enable does not return early.
+    monkeypatch.setattr(hf_cache, "_mistral_patch_applied", False)
+    try:
+        from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+        # Install a pass-through _patch_mistral_regex so enable reaches the
+        # PYTHONPATH step whether or not the installed transformers defines
+        # the attribute (raising=False allows setting a missing attribute).
+        monkeypatch.setattr(
+            PreTrainedTokenizerBase,
+            "_patch_mistral_regex",
+            classmethod(lambda cls, t, *a, **kw: t),
+            raising=False,
+        )
+    except ImportError:
+        pytest.skip("transformers not installed")
+
+    _enable_offline_with_mistral_patch()
+    pythonpath = os.environ.get("PYTHONPATH", "")
+    assert "" not in pythonpath.split(
+        ":"
+    ), f"Empty component in PYTHONPATH: {pythonpath!r}"
+
+    # NOTE(review): this cleanup is skipped if the assertion above fails;
+    # monkeypatch still restores the env vars and the flag, but the patch
+    # directory on disk would be left behind.
+    _disable_offline_with_mistral_patch()
+
+
+@pytest.mark.pre_merge
+@pytest.mark.unit
+@pytest.mark.gpu_0
+def test_pythonpath_restored_after_apply_restore(tmp_path, monkeypatch):
+    """A pre-existing PYTHONPATH is byte-for-byte unchanged after apply + restore."""
+    original = "some:existing:path"
+    monkeypatch.setenv("PYTHONPATH", original)
+    for k in _MODELS_DIR_ENV_KEYS:
+        monkeypatch.delenv(k, raising=False)
+    # Reset the guard so apply exercises the PYTHONPATH injection path.
+    monkeypatch.setattr(hf_cache, "_mistral_patch_applied", False)
+    orig = _apply_models_dir_env(str(tmp_path))
+    _restore_models_dir_env(orig)
+    assert os.environ["PYTHONPATH"] == original
+
+
+@pytest.mark.pre_merge
+@pytest.mark.unit
+@pytest.mark.gpu_0
+def test_enable_disable_enable_cycle(monkeypatch):
+    """_enable/_disable is safe to call in sequence; PYTHONPATH and HF_HUB_OFFLINE are correct after each call."""
+    monkeypatch.delenv("PYTHONPATH", raising=False)
+    monkeypatch.delenv("HF_HUB_OFFLINE", raising=False)
+    monkeypatch.setattr(hf_cache, "_mistral_patch_applied", False)
+
+    # Inject a no-op _patch_mistral_regex so the test always exercises the full
+    # patching code path, regardless of the installed transformers version.
+    # The stub is only installed when the attribute is missing; otherwise the
+    # real implementation is left in place and gets wrapped by enable.
+    try:
+        from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+        if not hasattr(PreTrainedTokenizerBase, "_patch_mistral_regex"):
+
+            @classmethod  # type: ignore[misc]
+            def _noop_patch(cls, tokenizer, *args, **kwargs):
+                return tokenizer
+
+            monkeypatch.setattr(
+                PreTrainedTokenizerBase,
+                "_patch_mistral_regex",
+                _noop_patch,
+                raising=False,
+            )
+    except ImportError:
+        pytest.skip("transformers not installed")
+
+    # First enable: everything is applied and the guard flag is set.
+    _enable_offline_with_mistral_patch()
+    assert os.environ.get("HF_HUB_OFFLINE") == "1"
+    assert hf_cache._mistral_patch_applied is True
+    pythonpath_after_enable = os.environ.get("PYTHONPATH")
+
+    # Disable: env is back to the clean starting state and the flag is reset.
+    _disable_offline_with_mistral_patch()
+    assert "HF_HUB_OFFLINE" not in os.environ
+    assert hf_cache._mistral_patch_applied is False
+    assert os.environ.get("PYTHONPATH") is None
+
+    # Second enable: must reproduce exactly the PYTHONPATH of the first one.
+    _enable_offline_with_mistral_patch()
+    assert os.environ.get("HF_HUB_OFFLINE") == "1"
+    assert hf_cache._mistral_patch_applied is True
+    assert os.environ.get("PYTHONPATH") == pythonpath_after_enable
+
+    # Final disable doubles as cleanup of the patch directory and env.
+    _disable_offline_with_mistral_patch()
+    assert hf_cache._mistral_patch_applied is False