add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path instead of Hugging Face Hub

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...
add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path instead of Hugging Face Hub
3c9817d2 · zhuwenwen · 49204f68 · 3c9817d2 · 3c9817d2 · 3c9817d2
Commit 3c9817d2 authored Nov 27, 2024 by zhuwenwen
20 changed files
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
 import subprocess
 import sys
+import json
+import os
 import tempfile
 from vllm.entrypoints.openai.protocol import BatchRequestOutput
+from ...utils import models_path_prefix
 # ruff: noqa: E501
+LLAMA_MODEL_ID = "NousResearch/Meta-Llama-3-8B-Instruct"
+EMBEDDING_MODEL_ID = "intfloat/e5-mistral-7b-instruct"
+
+
+def _use_local_models(batch: str) -> str:
+    """Point the Hub model IDs in a JSONL batch at models_path_prefix.
+
+    The batch inputs are plain string literals, so an os.path.join(...) call
+    written inside them is never evaluated and yields invalid JSON. Substitute
+    the quoted model IDs after the fact instead.
+    """
+    for model_id in (LLAMA_MODEL_ID, EMBEDDING_MODEL_ID):
+        local_path = os.path.join(models_path_prefix, model_id)
+        # json.dumps emits the surrounding quotes and any escaping JSON needs.
+        batch = batch.replace(json.dumps(model_id), json.dumps(local_path))
+    return batch
+
+
-INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+INPUT_BATCH = _use_local_models("""{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
 {"custom_id": "request-4", "method": "POST", "url": "/bad_url", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {"stream": "True", "model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
+{"custom_id": "request-5", "method": "POST", "url": "/v1/chat/completions", "body": {"stream": "True", "model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""")
-INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+INVALID_INPUT_BATCH = _use_local_models("""{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""")
-INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
+INPUT_EMBEDDING_BATCH = _use_local_models("""{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}}
 {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}}
-{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
+{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}""")
@@ -31,7 +33,7 @@ def test_empty_file():
        proc = subprocess.Popen([
            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
            input_file.name, "-o", output_file.name, "--model",
-            "intfloat/e5-mistral-7b-instruct"
+            os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
        ], )
        proc.communicate()
        proc.wait()
@@ -50,7 +52,7 @@ def test_completions():
        proc = subprocess.Popen([
            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
            input_file.name, "-o", output_file.name, "--model",
-            "NousResearch/Meta-Llama-3-8B-Instruct"
+            os.path.join(models_path_prefix, "NousResearch/Meta-Llama-3-8B-Instruct")
        ], )
        proc.communicate()
        proc.wait()
@@ -75,7 +77,7 @@ def test_completions_invalid_input():
        proc = subprocess.Popen([
            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
            input_file.name, "-o", output_file.name, "--model",
-            "NousResearch/Meta-Llama-3-8B-Instruct"
+            os.path.join(models_path_prefix, "NousResearch/Meta-Llama-3-8B-Instruct")
        ], )
        proc.communicate()
        proc.wait()
@@ -91,7 +93,7 @@ def test_embeddings():
        proc = subprocess.Popen([
            sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
            input_file.name, "-o", output_file.name, "--model",
-            "intfloat/e5-mistral-7b-instruct"
+            os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
        ], )
        proc.communicate()
        proc.wait()

--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -9,8 +9,10 @@ from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_engine import BaseModelPath
 from vllm.transformers_utils.tokenizer import get_tokenizer
+import os
+from ...utils import models_path_prefix
-MODEL_NAME = "openai-community/gpt2"
+MODEL_NAME = os.path.join(models_path_prefix, "openai-community/gpt2")
 CHAT_TEMPLATE = "Dummy chat template for testing {}"
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]

--- a/tests/entrypoints/openai/test_serving_engine.py
+++ b/tests/entrypoints/openai/test_serving_engine.py
@@ -2,6 +2,7 @@ from http import HTTPStatus
 from unittest.mock import MagicMock
 import pytest
+import os
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
@@ -9,8 +10,9 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
                                              LoadLoraAdapterRequest,
                                              UnloadLoraAdapterRequest)
 from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
+from ...utils import models_path_prefix
-MODEL_NAME = "meta-llama/Llama-2-7b"
+MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b")
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
 LORA_LOADING_SUCCESS_MESSAGE = (
    "Success: LoRA adapter '{lora_name}' added successfully.")

--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
@@ -4,9 +4,9 @@ import os
 import openai
 import pytest
-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_tokenization.py
+++ b/tests/entrypoints/openai/test_tokenization.py
 import openai  # use the official client for correctness check
 import pytest
+import os
 import pytest_asyncio
 import requests
@@ -8,9 +9,10 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 from ...utils import RemoteOpenAIServer
 from .test_completion import zephyr_lora_added_tokens_files  # noqa: F401
 from .test_completion import zephyr_lora_files  # noqa: F401
+from ...utils import models_path_prefix
 # any model with a chat template should work here
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
 @pytest.fixture(scope="module")

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -2,13 +2,14 @@ from typing import Dict, List
 import openai
 import pytest
+import os
 import pytest_asyncio
 from vllm.multimodal.utils import encode_image_base64, fetch_image
-from ...utils import RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer, models_path_prefix
-MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
+MODEL_NAME = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")
 MAXIMUM_IMAGES = 2
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)

--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -2,6 +2,7 @@ import warnings
 from typing import Optional
 import pytest
+import os
 from PIL import Image
 from vllm.assets.image import ImageAsset
@@ -11,8 +12,9 @@ from vllm.entrypoints.chat_utils import (parse_chat_messages,
 from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import encode_image_base64
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
+from ..utils import models_path_prefix
-PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
+PHI3V_MODEL_ID = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")
 @pytest.fixture(scope="module")

--- a/tests/kernels/test_gguf.py
+++ b/tests/kernels/test_gguf.py
@@ -2,14 +2,16 @@ from pathlib import Path
 from typing import List
 import pytest
+import os
 import torch
 from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
 from huggingface_hub import snapshot_download
 import vllm._custom_ops as ops
 from vllm.utils import seed_everything
+from ..utils import models_path_prefix
-GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
+GGUF_SAMPLE = os.path.join(models_path_prefix, "Isotr0py/test-gguf-sample")
 def get_gguf_sample_tensors(

--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -6,6 +6,7 @@ from typing import Dict, List, TypedDict
 from unittest.mock import MagicMock, patch
 import pytest
+import os
 import ray
 import torch
 import torch.nn as nn
@@ -24,6 +25,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader import get_model
+from ..utils import models_path_prefix
 class ContextIDInfo(TypedDict):
@@ -158,7 +160,7 @@ def dummy_model_gate_up() -> nn.Module:
 @pytest.fixture(scope="session")
 def sql_lora_huggingface_id():
    # huggingface repo id is used to test lora runtime downloading.
-    return "yard1/llama-2-7b-sql-lora-test"
+    return os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
 @pytest.fixture(scope="session")
@@ -170,53 +172,53 @@ def sql_lora_files(sql_lora_huggingface_id):
 def mixtral_lora_files():
    # Note: this module has incorrect adapter_config.json to test
    # https://github.com/vllm-project/vllm/pull/5909/files.
-    return snapshot_download(repo_id="SangBinCho/mixtral-lora")
+    return os.path.join(models_path_prefix, "SangBinCho/mixtral-lora")
 @pytest.fixture(scope="session")
 def gemma_lora_files():
-    return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
+    return os.path.join(models_path_prefix, "wskwon/gemma-7b-test-lora")
 @pytest.fixture(scope="session")
 def chatglm3_lora_files():
-    return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
+    return os.path.join(models_path_prefix, "jeeejeee/chatglm3-text2sql-spider")
 @pytest.fixture(scope="session")
 def baichuan_lora_files():
-    return snapshot_download(repo_id="jeeejeee/baichuan7b-text2sql-spider")
+    return os.path.join(models_path_prefix, "jeeejeee/baichuan7b-text2sql-spider")
 @pytest.fixture(scope="session")
 def baichuan_zero_lora_files():
    # all the lora_B weights are initialized to zero.
-    return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init")
+    return os.path.join(models_path_prefix, "jeeejeee/baichuan7b-zero-init")
 @pytest.fixture(scope="session")
 def tinyllama_lora_files():
-    return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
+    return os.path.join(models_path_prefix, "jashing/tinyllama-colorist-lora")
 @pytest.fixture(scope="session")
 def phi2_lora_files():
-    return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora")
+    return os.path.join(models_path_prefix, "isotr0py/phi-2-test-sql-lora")
 @pytest.fixture(scope="session")
 def long_context_lora_files_16k_1():
-    return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_1")
+    return os.path.join(models_path_prefix, "SangBinCho/long_context_16k_testing_1")
 @pytest.fixture(scope="session")
 def long_context_lora_files_16k_2():
-    return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_2")
+    return os.path.join(models_path_prefix, "SangBinCho/long_context_16k_testing_2")
 @pytest.fixture(scope="session")
 def long_context_lora_files_32k():
-    return snapshot_download(repo_id="SangBinCho/long_context_32k_testing")
+    return os.path.join(models_path_prefix, "SangBinCho/long_context_32k_testing")
 @pytest.fixture(scope="session")
@@ -254,7 +256,7 @@ def llama_2_7b_engine_extra_embeddings():
                             **kwargs)
    with patch("vllm.worker.model_runner.get_model", get_model_patched):
-        engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
+        engine = vllm.LLM(os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), enable_lora=False)
    yield engine.llm_engine
    del engine
    cleanup()

--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
 from typing import List
 import pytest
+import os
 import vllm
 from vllm.lora.request import LoRARequest
 from .conftest import cleanup
+from ..utils import models_path_prefix
-MODEL_PATH = "baichuan-inc/Baichuan-7B"
+MODEL_PATH = os.path.join(models_path_prefix, "baichuan-inc/Baichuan-7B")
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501

--- a/tests/lora/test_chatglm3.py
+++ b/tests/lora/test_chatglm3.py
 from typing import List
+import os
 import vllm
 from vllm.lora.request import LoRARequest
+from ..utils import models_path_prefix
-MODEL_PATH = "THUDM/chatglm3-6b"
+MODEL_PATH = os.path.join(models_path_prefix, "THUDM/chatglm3-6b")
 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501

--- a/tests/lora/test_gemma.py
+++ b/tests/lora/test_gemma.py
 from typing import List
 import pytest
+import os
 import vllm
 from vllm.lora.request import LoRARequest
 from vllm.utils import is_hip
+from ..utils import models_path_prefix
-MODEL_PATH = "google/gemma-7b"
+MODEL_PATH = os.path.join(models_path_prefix, "google/gemma-7b")
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:

--- a/tests/lora/test_llama.py
+++ b/tests/lora/test_llama.py
 from typing import List
 import pytest
+import os
 import ray
 import vllm
 from vllm.lora.request import LoRARequest
 from .conftest import cleanup
+from ..utils import models_path_prefix
-MODEL_PATH = "meta-llama/Llama-2-7b-hf"
+MODEL_PATH = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:

--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
@@ -3,6 +3,7 @@ from typing import List, Optional, Tuple
 import numpy as np
 import pytest
+import os
 import vllm
 from vllm import SamplingParams
@@ -12,6 +13,7 @@ from vllm.model_executor.layers.rotary_embedding import (
    LinearScalingRotaryEmbedding)
 from .data.long_context_test_data import prompts_and_responses
+from ..utils import models_path_prefix
 context_len_to_scaling_factor = {
    "16k": 4,
@@ -108,7 +110,7 @@ def lora_llm(long_context_infos):
        for info in long_context_infos.values()
    ]
-    llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf",
+    llm = vllm.LLM(os.path.join(models_path_prefix, "meta-llama/Llama-2-13b-chat-hf"),
                   enable_lora=True,
                   max_num_seqs=16,
                   max_loras=8,
@@ -124,7 +126,7 @@ def test_rotary_emb_replaced(dist_init):
    """Verify rotary emb in all the layers are replaced"""
    from vllm.engine.arg_utils import EngineArgs
    from vllm.worker.model_runner import ModelRunner
-    engine_args = EngineArgs("meta-llama/Llama-2-7b-hf",
+    engine_args = EngineArgs(os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
                             long_lora_scaling_factors=(4.0, ),
                             enable_lora=True)
    engine_config = engine_args.create_engine_config()

--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -2,11 +2,13 @@ from typing import List
 import pytest
 import torch
+import os
 import vllm
 from vllm.lora.request import LoRARequest
+from ..utils import models_path_prefix
-MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+MODEL_PATH = os.path.join(models_path_prefix, "mistralai/Mixtral-8x7B-Instruct-v0.1")
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:

--- a/tests/lora/test_phi.py
+++ b/tests/lora/test_phi.py
 from typing import List
+import os
 import vllm
 from vllm.lora.request import LoRARequest
+from ..utils import models_path_prefix
-MODEL_PATH = "microsoft/phi-2"
+MODEL_PATH = os.path.join(models_path_prefix, "microsoft/phi-2")
 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:"  # noqa: E501

--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -4,12 +4,14 @@ from dataclasses import dataclass
 from typing import List
 import pytest
+import os
 import vllm
 from vllm.lora.request import LoRARequest
 from vllm.utils import is_hip
 from .conftest import cleanup
+from ..utils import models_path_prefix
 @dataclass
@@ -23,16 +25,16 @@ MODELS: List[ModelWithQuantization]
 if is_hip():
    MODELS = [
        ModelWithQuantization(
-            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            model_path=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
            quantization="GPTQ"),
    ]
 else:
    MODELS = [
        ModelWithQuantization(
-            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
+            model_path=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"),
            quantization="AWQ"),
        ModelWithQuantization(
-            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            model_path=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
            quantization="GPTQ"),
    ]

--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -8,14 +8,15 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
 from vllm.worker.worker import Worker
+from ..utils import models_path_prefix
 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
    worker = Worker(
        model_config=ModelConfig(
-            "meta-llama/Llama-2-7b-hf",
+            os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
-            "meta-llama/Llama-2-7b-hf",
+            os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
            tokenizer_mode="auto",
            trust_remote_code=False,
            seed=0,

--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
+import os
 import time
 from typing import List
@@ -13,9 +14,10 @@ from vllm.sampling_params import SamplingParams
 import vllm.envs as envs
 from ..conftest import cleanup
+from ..utils import models_path_prefix
 MODELS = [
-    "facebook/opt-125m",
+    os.path.join(models_path_prefix, "facebook/opt-125m"),
 ]

--- a/tests/model_executor/weight_utils.py
+++ b/tests/model_executor/weight_utils.py
@@ -7,6 +7,7 @@ from huggingface_hub.utils import LocalEntryNotFoundError
 from vllm.model_executor.model_loader.weight_utils import (
    download_weights_from_hf, enable_hf_transfer)
+from ..utils import models_path_prefix
 def test_hf_transfer_auto_activation():
@@ -31,20 +32,20 @@ def test_download_weights_from_hf():
        # if offline is set and model is not cached
        huggingface_hub.constants.HF_HUB_OFFLINE = True
        with pytest.raises(LocalEntryNotFoundError):
-            download_weights_from_hf("facebook/opt-125m",
+            download_weights_from_hf(os.path.join(models_path_prefix, "facebook/opt-125m"),
                                     allow_patterns=["*.safetensors", "*.bin"],
                                     cache_dir=tmpdir)
        # download the model
        huggingface_hub.constants.HF_HUB_OFFLINE = False
-        download_weights_from_hf("facebook/opt-125m",
+        download_weights_from_hf(os.path.join(models_path_prefix, "facebook/opt-125m"),
                                 allow_patterns=["*.safetensors", "*.bin"],
                                 cache_dir=tmpdir)
        # now it should work offline
        huggingface_hub.constants.HF_HUB_OFFLINE = True
        assert download_weights_from_hf(
-            "facebook/opt-125m",
+            os.path.join(models_path_prefix, "facebook/opt-125m"),
            allow_patterns=["*.safetensors", "*.bin"],
            cache_dir=tmpdir) is not None