[Core] Support offline use of local cache for models (#4374)

Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Travis Johnson <tjohnson31415@gmail.com>

[Core] Support offline use of local cache for models (#4374)
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Travis Johnson <tjohnson31415@gmail.com>
d6e520e1 · Prashant Gupta · GitHub · 81661da7 · d6e520e1 · d6e520e1
Unverified Commit d6e520e1 authored Apr 27, 2024 by Prashant Gupta Committed by GitHub Apr 27, 2024
4 changed files
--- a/tests/model_executor/weight_utils.py
+++ b/tests/model_executor/weight_utils.py
 import os
+import tempfile

 import huggingface_hub.constants
 import pytest
+from huggingface_hub.utils import LocalEntryNotFoundError

-from vllm.model_executor.model_loader.weight_utils import enable_hf_transfer
+from vllm.model_executor.model_loader.weight_utils import (
+    download_weights_from_hf, enable_hf_transfer)


 def test_hf_transfer_auto_activation():
@@ -22,5 +25,44 @@ def test_hf_transfer_auto_activation():
            HF_TRANFER_ACTIVE)


+def test_download_weights_from_hf():
+    """Check that cached weights can be loaded while the Hub is offline.
+
+    Covers three steps against a fresh temporary cache directory: offline
+    with an empty cache raises ``LocalEntryNotFoundError``, an online call
+    populates the cache, and a second offline call then succeeds.
+    """
+    # This flag is module-level state; remember it so that it can be put
+    # back afterwards and later tests in the same process are unaffected.
+    original_offline = huggingface_hub.constants.HF_HUB_OFFLINE
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # assert LocalEntryNotFoundError error is thrown
+            # if offline is set and model is not cached
+            huggingface_hub.constants.HF_HUB_OFFLINE = True
+            with pytest.raises(LocalEntryNotFoundError):
+                download_weights_from_hf(
+                    "facebook/opt-125m",
+                    allow_patterns=["*.safetensors", "*.bin"],
+                    cache_dir=tmpdir)
+
+            # download the model
+            huggingface_hub.constants.HF_HUB_OFFLINE = False
+            download_weights_from_hf(
+                "facebook/opt-125m",
+                allow_patterns=["*.safetensors", "*.bin"],
+                cache_dir=tmpdir)
+
+            # now it should work offline
+            huggingface_hub.constants.HF_HUB_OFFLINE = True
+            assert download_weights_from_hf(
+                "facebook/opt-125m",
+                allow_patterns=["*.safetensors", "*.bin"],
+                cache_dir=tmpdir) is not None
+    finally:
+        huggingface_hub.constants.HF_HUB_OFFLINE = original_offline
+
+
 if __name__ == "__main__":
     test_hf_transfer_auto_activation()
+    test_download_weights_from_hf()
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -5,6 +5,7 @@ import os
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Generator, List, Optional, Tuple, Type

+import huggingface_hub
 import torch
 from torch import nn

@@ -131,7 +132,9 @@ class DefaultModelLoader(BaseModelLoader):
                model_path = snapshot_download(
                    model_id=model,
                    cache_dir=self.load_config.download_dir,
-                    revision=revision)
+                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                    revision=revision,
+                )
            else:
                model_path = model
            return model_path

--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -127,11 +127,14 @@ def get_quant_config(model_config: ModelConfig,
    if not is_local:
        # Download the config files.
        with get_lock(model_name_or_path, load_config.download_dir):
-            hf_folder = snapshot_download(model_name_or_path,
+            hf_folder = snapshot_download(
+                model_name_or_path,
                revision=model_config.revision,
                allow_patterns="*.json",
                cache_dir=load_config.download_dir,
-                                          tqdm_class=DisabledTqdm)
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                tqdm_class=DisabledTqdm,
+            )
    else:
        hf_folder = model_name_or_path

@@ -161,10 +164,12 @@ def get_quant_config(model_config: ModelConfig,
    return quant_cls.from_config(config)


-def download_weights_from_hf(model_name_or_path: str,
+def download_weights_from_hf(
+    model_name_or_path: str,
    cache_dir: Optional[str],
    allow_patterns: List[str],
-                             revision: Optional[str] = None) -> str:
+    revision: Optional[str] = None,
+) -> str:
    """Download model weights from Hugging Face Hub.

    Args:
@@ -179,6 +184,7 @@ def download_weights_from_hf(model_name_or_path: str,
    Returns:
        str: The path to the downloaded model weights.
    """
+    if not huggingface_hub.constants.HF_HUB_OFFLINE:
        # Before we download we look at what is available:
        fs = HfFileSystem()
        file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
@@ -194,11 +200,14 @@ def download_weights_from_hf(model_name_or_path: str,
    # Use file lock to prevent multiple processes from
    # downloading the same model weights at the same time.
    with get_lock(model_name_or_path, cache_dir):
-        hf_folder = snapshot_download(model_name_or_path,
+        hf_folder = snapshot_download(
+            model_name_or_path,
            allow_patterns=allow_patterns,
            cache_dir=cache_dir,
            tqdm_class=DisabledTqdm,
-                                      revision=revision)
+            revision=revision,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+        )
    return hf_folder



--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
 import os
 from typing import Optional, Union

+import huggingface_hub
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

@@ -76,6 +77,7 @@ def get_tokenizer(
                model_id=tokenizer_name,
                cache_dir=download_dir,
                revision=revision,
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                # Ignore weights - we only need the tokenizer.
                ignore_file_pattern=["*.pt", "*.safetensors", "*.bin"])
            tokenizer_name = tokenizer_path