[Bugfix] Offline mode fix (#8376)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>

[Bugfix] Offline mode fix (#8376)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
f2e263b8 · Joe Runde · GitHub · 1f0c75af · f2e263b8 · f2e263b8
Unverified Commit f2e263b8 authored Sep 12, 2024 by Joe Runde Committed by GitHub Sep 12, 2024
4 changed files
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -91,6 +91,7 @@ steps:
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/openai
  - pytest -v -s entrypoints/test_chat_utils.py
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 - label: Distributed Tests (4 GPUs) # 10min

--- a/tests/entrypoints/offline_mode/__init__.py
+++ b/tests/entrypoints/offline_mode/__init__.py
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
+"""Tests for HF_HUB_OFFLINE mode"""
+import importlib
+import sys
+import weakref
+import pytest
+from vllm import LLM
+from ...conftest import cleanup
+MODEL_NAME = "facebook/opt-125m"
+@pytest.fixture(scope="module")
+def llm():
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(model=MODEL_NAME,
+              max_num_batched_tokens=4096,
+              tensor_parallel_size=1,
+              gpu_memory_utilization=0.10,
+              enforce_eager=True)
+    with llm.deprecate_legacy_api():
+        yield weakref.proxy(llm)
+        del llm
+    cleanup()
+@pytest.mark.skip_global_cleanup
+def test_offline_mode(llm: LLM, monkeypatch):
+    # we use the llm fixture to ensure the model files are in-cache
+    del llm
+    # Set HF to offline mode and ensure we can still construct an LLM
+    try:
+        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
+        # Need to re-import huggingface_hub and friends to setup offline mode
+        _re_import_modules()
+        # Cached model files should be used in offline mode
+        LLM(model=MODEL_NAME,
+            max_num_batched_tokens=4096,
+            tensor_parallel_size=1,
+            gpu_memory_utilization=0.10,
+            enforce_eager=True)
+    finally:
+        # Reset the environment after the test
+        # NB: Assuming tests are run in online mode
+        monkeypatch.delenv("HF_HUB_OFFLINE")
+        _re_import_modules()
+        pass
+def _re_import_modules():
+    hf_hub_module_names = [
+        k for k in sys.modules if k.startswith("huggingface_hub")
+    ]
+    transformers_module_names = [
+        k for k in sys.modules if k.startswith("transformers")
+        and not k.startswith("transformers_modules")
+    ]
+    reload_exception = None
+    for module_name in hf_hub_module_names + transformers_module_names:
+        try:
+            importlib.reload(sys.modules[module_name])
+        except Exception as e:
+            reload_exception = e
+            # Try to continue clean up so that other tests are less likely to
+            # be affected
+    # Error this test if reloading a module failed
+    if reload_exception is not None:
+        raise reload_exception
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -4,7 +4,9 @@ import json
 from pathlib import Path
 from typing import Any, Dict, Optional, Type, Union
-from huggingface_hub import file_exists, hf_hub_download
+import huggingface_hub
+from huggingface_hub import (file_exists, hf_hub_download,
+                             try_to_load_from_cache)
 from transformers import GenerationConfig, PretrainedConfig
 from transformers.models.auto.image_processing_auto import (
    get_image_processor_config)
@@ -70,7 +72,22 @@ def file_or_path_exists(model: Union[str, Path], config_name, revision,
    if Path(model).exists():
        return (Path(model) / config_name).is_file()
+    # Offline mode support: Check if config file is cached already
+    cached_filepath = try_to_load_from_cache(repo_id=model,
+                                             filename=config_name,
+                                             revision=revision)
+    if isinstance(cached_filepath, str):
+        # The config file exists in cache- we can continue trying to load
+        return True
+    # NB: file_exists will only check for the existence of the config file on
+    # hf_hub. This will fail in offline mode.
+    try:
        return file_exists(model, config_name, revision=revision, token=token)
+    except huggingface_hub.errors.OfflineModeIsEnabled:
+        # Don't raise in offline mode, all we know is that we don't have this
+        # file cached.
+        return False
 def get_config(
@@ -102,6 +119,15 @@ def get_config(
                                 token=kwargs.get("token")):
            config_format = ConfigFormat.MISTRAL
        else:
+            # If we're in offline mode and found no valid config format, then
+            # raise an offline mode error to indicate to the user that they
+            # don't have files cached and may need to go online.
+            # This is conveniently triggered by calling file_exists().
+            file_exists(model,
+                        HF_CONFIG_NAME,
+                        revision=revision,
+                        token=kwargs.get("token"))
            raise ValueError(f"No supported config format found in {model}")
    if config_format == ConfigFormat.HF: