[Bugfix] Add file lock for ModelScope download (#14060)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>

[Bugfix] Add file lock for ModelScope download (#14060)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
6a84164a · Jee Jee Li · GitHub · f64ffa8c · 6a84164a · 6a84164a
Unverified Commit 6a84164a authored Mar 01, 2025 by Jee Jee Li Committed by GitHub Mar 01, 2025
4 changed files
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -14,6 +14,8 @@ from tqdm.asyncio import tqdm
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

+from vllm.model_executor.model_loader.weight_utils import get_lock
+
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@@ -430,12 +432,15 @@ def get_model(pretrained_model_name_or_path: str) -> str:
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        from modelscope import snapshot_download

-        model_path = snapshot_download(
-            model_id=pretrained_model_name_or_path,
-            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+        # Use file lock to prevent multiple processes from
+        # downloading the same model files at the same time.
+        with get_lock(pretrained_model_name_or_path):
+            model_path = snapshot_download(
+                model_id=pretrained_model_name_or_path,
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])

-        return model_path
+            return model_path
    return pretrained_model_name_or_path



--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -49,7 +49,7 @@ from vllm.model_executor.model_loader.utils import (ParamMapping,
 from vllm.model_executor.model_loader.weight_utils import (
    download_safetensors_index_file_from_hf, download_weights_from_hf,
    filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
-    get_gguf_extra_tensor_names, gguf_quant_weights_iterator,
+    get_gguf_extra_tensor_names, get_lock, gguf_quant_weights_iterator,
    initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator,
    runai_safetensors_weights_iterator, safetensors_weights_iterator)
 from vllm.model_executor.utils import set_weight_attrs
@@ -235,13 +235,17 @@ class DefaultModelLoader(BaseModelLoader):
            from modelscope.hub.snapshot_download import snapshot_download

            if not os.path.exists(model):
-                model_path = snapshot_download(
-                    model_id=model,
-                    cache_dir=self.load_config.download_dir,
-                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                    revision=revision,
-                    ignore_file_pattern=self.load_config.ignore_patterns,
-                )
+                # Use file lock to prevent multiple processes from
+                # downloading the same model weights at the same time.
+                with get_lock(model, self.load_config.download_dir):
+                    model_path = snapshot_download(
+                        model_id=model,
+                        cache_dir=self.load_config.download_dir,
+                        local_files_only=huggingface_hub.constants.
+                        HF_HUB_OFFLINE,
+                        revision=revision,
+                        ignore_file_pattern=self.load_config.ignore_patterns,
+                    )
            else:
                model_path = model
            return model_path

--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -8,6 +8,7 @@ import os
 import tempfile
 import time
 from collections import defaultdict
+from pathlib import Path
 from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union

 import filelock
@@ -67,8 +68,10 @@ class DisabledTqdm(tqdm):
        super().__init__(*args, **kwargs, disable=True)


-def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
+def get_lock(model_name_or_path: Union[str, Path],
+             cache_dir: Optional[str] = None):
    lock_dir = cache_dir or temp_dir
+    model_name_or_path = str(model_name_or_path)
    os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
    model_name = model_name_or_path.replace("/", "-")
    hash_name = hashlib.sha256(model_name.encode()).hexdigest()

--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -150,16 +150,22 @@ def get_tokenizer(
        # pylint: disable=C.
        from modelscope.hub.snapshot_download import snapshot_download

+        # avoid circular import
+        from vllm.model_executor.model_loader.weight_utils import get_lock
+
        # Only set the tokenizer here, model will be downloaded on the workers.
        if not os.path.exists(tokenizer_name):
-            tokenizer_path = snapshot_download(
-                model_id=tokenizer_name,
-                cache_dir=download_dir,
-                revision=revision,
-                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-                # Ignore weights - we only need the tokenizer.
-                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
-            tokenizer_name = tokenizer_path
+            # Use file lock to prevent multiple processes from
+            # downloading the same file at the same time.
+            with get_lock(tokenizer_name, download_dir):
+                tokenizer_path = snapshot_download(
+                    model_id=tokenizer_name,
+                    cache_dir=download_dir,
+                    revision=revision,
+                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                    # Ignore weights - we only need the tokenizer.
+                    ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+                tokenizer_name = tokenizer_path

    if tokenizer_mode == "slow":
        if kwargs.get("use_fast", False):