add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...
Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub
3c9817d2 · zhuwenwen · 49204f68 · 3c9817d2 · 3c9817d2 · 3c9817d2
Commit 3c9817d2 authored Nov 27, 2024 by zhuwenwen
20 changed files
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -2,11 +2,13 @@ from contextlib import nullcontext

 import numpy as np
 import pytest
+import os
 from transformers import CLIPImageProcessor, LlavaNextImageProcessor

 from vllm.config import ModelConfig
 from vllm.multimodal import MultiModalRegistry
 from vllm.multimodal.utils import rescale_image_size
+from ..utils import models_path_prefix


 @pytest.fixture
@@ -17,7 +19,7 @@ def mm_registry():
 @pytest.mark.parametrize("dtype", ["half", "float"])
 @pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
 def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
-    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+    MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")

    hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
    assert isinstance(hf_processor, CLIPImageProcessor)
@@ -60,7 +62,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
 @pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
 def test_llava_next_image_processor(image_assets, mm_registry, dtype,
                                    size_factor):
-    MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf"
+    MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-v1.6-vicuna-7b-hf")

    hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
    assert isinstance(hf_processor, LlavaNextImageProcessor)
@@ -105,7 +107,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
     (2, 1, False), (2, 2, True)],
 )
 def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
-    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+    MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")

    model_config = ModelConfig(
        model=MODEL_NAME,
@@ -135,7 +137,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
 # NOTE: We don't test zero images since the HF processor doesn't support it
 @pytest.mark.parametrize("num_images", [1, 2])
 def test_image_mapper_multi(image_assets, mm_registry, num_images):
-    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+    MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")

    model_config = ModelConfig(
        model=MODEL_NAME,

--- a/tests/multimodal/test_processor_kwargs.py
+++ b/tests/multimodal/test_processor_kwargs.py
@@ -4,6 +4,7 @@ from unittest.mock import patch

 import pytest
 import torch
+import os

 from vllm.inputs import InputContext, LLMInputs
 from vllm.inputs.registry import InputRegistry
@@ -11,11 +12,12 @@ from vllm.multimodal import MultiModalRegistry
 from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData

 from ..models.utils import build_model_context
+from ..utils import models_path_prefix

 # Used for fast tests where the model doesn't matter
-DUMMY_MODEL_ID = "facebook/opt-125m"
+DUMMY_MODEL_ID = os.path.join(models_path_prefix, "facebook/opt-125m")
 # Used for tests that need a multimodal model
-MULTIMODAL_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
+MULTIMODAL_MODEL_ID = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")

 # For mm_processor_kwargs - we test overrides by defining mocks for each place
 # it is used, and ensuring that we can pass processor kwargs an override value

--- a/tests/prefix_caching/test_disable_sliding_window.py
+++ b/tests/prefix_caching/test_disable_sliding_window.py
@@ -2,21 +2,23 @@

 Run `pytest tests/prefix_caching/test_prefix_caching.py`.
 """
+import os
 import pytest

 from tests.conftest import cleanup
 from vllm import LLM
+from ..utils import models_path_prefix

 MODEL_LEN_LEN = [
    # Example models with sliding window.
-    ("bigcode/starcoder2-3b", 4096, 16384),
+    (os.path.join(models_path_prefix, "bigcode/starcoder2-3b"), 4096, 16384),
    # ("mistralai/Mistral-7B-v0.1", 4096, 32768), << OOM in CI

    # Confirm model with sliding window works.
    # config has "use_sliding_window": false
-    ("Qwen/Qwen1.5-0.5B-Chat", 32768, 32768),
+    (os.path.join(models_path_prefix, "Qwen/Qwen1.5-0.5B-Chat"), 32768, 32768),
    # config has no sliding window attribute.
-    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 2048, 2048),
+    (os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"), 2048, 2048),
 ]



--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -5,6 +5,7 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`.
 from typing import List

 import pytest
+import os

 from tests.kernels.utils import override_backend_env_variable
 from vllm.block import PhysicalTokenBlock
@@ -12,9 +13,10 @@ from vllm.core.block_manager_v1 import CachedBlockAllocator
 from vllm.utils import Device

 from ..models.utils import check_outputs_equal
+from ..utils import models_path_prefix

 MODELS = [
-    "facebook/opt-125m",
+    os.path.join(models_path_prefix, "facebook/opt-125m"),
 ]



--- a/tests/prompt_adapter/test_bloom.py
+++ b/tests/prompt_adapter/test_bloom.py
 import pytest
+import os

 import vllm
 from vllm.prompt_adapter.request import PromptAdapterRequest
+from ..utils import models_path_prefix

-MODEL_PATH = "bigscience/bloomz-560m"
-PA_PATH = 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM'
+MODEL_PATH = os.path.join(models_path_prefix, "bigscience/bloomz-560m")
+PA_PATH = os.path.join(models_path_prefix, 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM')


 def do_sample(llm, pa_name: str, pa_id: int):

--- a/tests/prompt_adapter/test_multi_adapter_inference.py
+++ b/tests/prompt_adapter/test_multi_adapter_inference.py
 from vllm import EngineArgs, LLMEngine, SamplingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
+from ..utils import models_path_prefix
+import os

-MODEL_PATH = "bigscience/bloomz-560m"
-pa_path = 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM'
-pa_path2 = 'swapnilbp/angry_tweet_ptune'
+MODEL_PATH = os.path.join(models_path_prefix, "bigscience/bloomz-560m") 
+pa_path = os.path.join(models_path_prefix, 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM') 
+pa_path2 = os.path.join(models_path_prefix, 'swapnilbp/angry_tweet_ptune') 


 def do_sample(engine):

--- a/tests/prompt_adapter/test_pa_lora.py
+++ b/tests/prompt_adapter/test_pa_lora.py
@@ -3,10 +3,14 @@ from huggingface_hub import snapshot_download
 from vllm import EngineArgs, LLMEngine, SamplingParams
 from vllm.lora.request import LoRARequest
 from vllm.prompt_adapter.request import PromptAdapterRequest
+from ..utils import models_path_prefix
+import os

-MODEL_PATH = "meta-llama/Llama-2-7b-hf"
-pa_path = snapshot_download(repo_id="swapnilbp/llama_tweet_ptune")
-lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
+MODEL_PATH = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf") 
+# pa_path = snapshot_download(repo_id="swapnilbp/llama_tweet_ptune")
+# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
+pa_path = os.path.join(models_path_prefix, "swapnilbp/llama_tweet_ptune") 
+lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")


 def do_sample(engine):

--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -4,27 +4,29 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
 '''

 import gc
+import os

 import pytest
 import torch

 from tests.quantization.utils import is_quant_method_supported

-from ..utils import fork_new_process_for_each_test
+from ..utils import fork_new_process_for_each_test, models_path_prefix
+

 models_4bit_to_test = [
-    ('huggyllama/llama-7b', 'quantize model inflight'),
+    (os.path.join(models_path_prefix, 'huggyllama/llama-7b'), 'quantize model inflight'),
 ]

 models_pre_qaunt_4bit_to_test = [
-    ('lllyasviel/omost-llama-3-8b-4bits',
+    (os.path.join(models_path_prefix, 'lllyasviel/omost-llama-3-8b-4bits'),
     'read pre-quantized 4-bit NF4 model'),
-    ('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
+    (os.path.join(models_path_prefix, 'PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed'),
     'read pre-quantized 4-bit FP4 model'),
 ]

 models_pre_quant_8bit_to_test = [
-    ('meta-llama/Llama-Guard-3-8B-INT8', 'read pre-quantized 8-bit model'),
+    (os.path.join(models_path_prefix, 'meta-llama/Llama-Guard-3-8B-INT8'), 'read pre-quantized 8-bit model'),
 ]



--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -5,6 +5,7 @@ Run `pytest tests/quantization/test_compressed_tensors.py`.

 import pytest
 import torch
+import os

 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
    CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
@@ -12,12 +13,13 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    QuantizationType)
+from ..utils import models_path_prefix


 @pytest.mark.parametrize("model_args", [
-    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor",
+    (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), "tensor",
     QuantizationType.INT, 2560),
-    ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel",
+    (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"), "channel",
     QuantizationType.INT, 2560),
 ])
 def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
@@ -61,15 +63,15 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):


 def test_compressed_tensors_no_enforce_eager(vllm_runner):
-    model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
+    model_path = os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change")
    with vllm_runner(model_path) as llm:
        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        assert output


 @pytest.mark.parametrize("model_args", [
-    ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
-    ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"),
+    (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"), "tensor"),
+    (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"), "channel"),
 ])
 def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
    model_path, strategy = model_args
@@ -91,9 +93,9 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):

 @pytest.mark.parametrize(
    "wNa16_args",
-    [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8),
-     ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8),
-     ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)])
+    [(os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w4a16-channel-v2"), "channel", None, 8),
+     (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w4a16-group128-v2"), "group", 128, 8),
+     (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a16-per-channel"), "channel", None, 4)])
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
    model, strategy, group, pack_factor = wNa16_args
    with vllm_runner(model) as llm:
@@ -116,7 +118,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):


 def test_compressed_tensors_w4a16_marlin24(vllm_runner):
-    model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
+    model_path = os.path.join(models_path_prefix,"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t")
    with vllm_runner(model_path) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]
@@ -132,7 +134,7 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):


 def test_compressed_tensors_fp8(vllm_runner):
-    model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
+    model_path = os.path.join(models_path_prefix,"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test")
    with vllm_runner(model_path) as llm:
        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        layer = model.model.layers[0]
@@ -157,7 +159,7 @@ def test_compressed_tensors_fp8(vllm_runner):


 def test_compressed_tensors_kv_cache(vllm_runner):
-    model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
+    model_path = os.path.join(models_path_prefix,"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")
    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
        output = llm.generate_greedy("Hello world!", max_tokens=20)
        assert output
\ No newline at end of file
--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
@@ -7,8 +7,10 @@ from dataclasses import dataclass
 from typing import Tuple

 import pytest
+import os

 from vllm.config import ModelConfig
+from ..utils import models_path_prefix


 @dataclass
@@ -22,32 +24,32 @@ MODEL_ARG_EXPTYPES = [
    # AUTOGPTQ
    # compat: autogptq <=0.7.1 is_marlin_format: bool
    # Model Serialized in Marlin Format should always use Marlin kernel.
-    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"),
-    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"),
-    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"),
-    ("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"),
+    (os.path.join(models_path_prefix, "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"), None, "marlin"),
+    (os.path.join(models_path_prefix, "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"), "marlin", "marlin"),
+    (os.path.join(models_path_prefix, "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"), "gptq", "marlin"),
+    (os.path.join(models_path_prefix, "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"), "awq", "ERROR"),
    # Model Serialized in Exllama Format.
-    ("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"),
-    ("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"),
-    ("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"),
-    ("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"),
+    (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), None, "gptq_marlin"),
+    (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "marlin", "gptq_marlin"),
+    (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "gptq", "gptq"),
+    (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "awq", "ERROR"),
    # compat: autogptq >=0.8.0 use checkpoint_format: str
    # Model Serialized in Marlin Format should always use Marlin kernel.
-    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"),
-    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"),
-    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"),
-    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"),
+    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"), None, "marlin"),
+    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"), "marlin", "marlin"),
+    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"), "gptq", "marlin"),
+    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"), "awq", "ERROR"),
    # Model Serialized in Exllama Format.
-    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"),
-    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),
-    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),
-    ("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),
+    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), None, "gptq_marlin"),
+    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "marlin", "gptq_marlin"),
+    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "gptq", "gptq"),
+    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "awq", "ERROR"),

    # AUTOAWQ
-    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"),
-    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
-    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "awq_marlin"),
-    ("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),
+    (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), None, "awq_marlin"),
+    (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "awq", "awq"),
+    (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "marlin", "awq_marlin"),
+    (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "gptq", "ERROR"),
 ]



--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -2,22 +2,23 @@
 # Base tests: tests/basic_correctness/test_cpu_offload.py

 import pytest
+import os

 from tests.quantization.utils import is_quant_method_supported

-from ..utils import compare_two_settings
+from ..utils import compare_two_settings, models_path_prefix


 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
 def test_cpu_offload_fp8():
    # Test quantization of an unquantized checkpoint
-    compare_two_settings("meta-llama/Meta-Llama-3-8B-Instruct",
+    compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
                         ["--quantization", "fp8"],
                         ["--quantization", "fp8", "--cpu-offload-gb", "2"],
                         max_wait_seconds=480)
    # Test loading a quantized checkpoint
-    compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [],
+    compare_two_settings(os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), [],
                         ["--cpu-offload-gb", "2"],
                         max_wait_seconds=480)

@@ -26,11 +27,11 @@ def test_cpu_offload_fp8():
                    reason="gptq_marlin is not supported on this GPU type.")
 def test_cpu_offload_gptq():
    # Test GPTQ Marlin
-    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
+    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"), [],
                         ["--cpu-offload-gb", "1"],
                         max_wait_seconds=480)
    # Test GPTQ
-    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
+    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"),
                         ["--quantization", "gptq"],
                         ["--quantization", "gptq", "--cpu-offload-gb", "1"],
                         max_wait_seconds=480)
@@ -40,11 +41,11 @@ def test_cpu_offload_gptq():
                    reason="awq_marlin is not supported on this GPU type.")
 def test_cpu_offload_awq():
    # Test AWQ Marlin
-    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
+    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"), [],
                         ["--cpu-offload-gb", "1"],
                         max_wait_seconds=480)
    # Test AWQ
-    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ",
+    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"),
                         ["--quantization", "awq"],
                         ["--quantization", "awq", "--cpu-offload-gb", "1"],
                         max_wait_seconds=480)
@@ -54,15 +55,15 @@ def test_cpu_offload_awq():
                    reason="gptq_marlin is not supported on this GPU type.")
 def test_cpu_offload_compressed_tensors():
    # Test wNa16
-    compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
+    compare_two_settings(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w4a16-channel-v2"), [],
                         ["--cpu-offload-gb", "1"],
                         max_wait_seconds=480)
    # Test w4a16_marlin24
-    compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+    compare_two_settings(os.path.join(models_path_prefix, "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"),
                         [], ["--cpu-offload-gb", "1"],
                         max_wait_seconds=480)
    # Test w8a8
    compare_two_settings(
-        "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", [],
+        os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), [],
        ["--cpu-offload-gb", "1"],
        max_wait_seconds=480)
--- a/tests/quantization/test_experts_int8.py
+++ b/tests/quantization/test_experts_int8.py
@@ -3,10 +3,12 @@
 doesn't test correctness
 """
 import pytest
+import os

 from tests.quantization.utils import is_quant_method_supported
+from ..utils import models_path_prefix

-MODELS = ["ai21labs/Jamba-tiny-random"]
+MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")]


 @pytest.mark.skipif(not is_quant_method_supported("experts_int8"),

--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -4,17 +4,19 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
 """
 import pytest
 import torch
+import os

 from tests.quantization.utils import is_quant_method_supported
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
                                                         Fp8LinearMethod)
 from vllm.platforms import current_platform
+from ..utils import models_path_prefix

 MODELS = [
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
-    "nm-testing/Phi-3-mini-128k-instruct-FP8",
-    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
+    os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"),
+    os.path.join(models_path_prefix, "nm-testing/Phi-3-mini-128k-instruct-FP8"),
+    os.path.join(models_path_prefix, "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV"),
 ]


@@ -37,9 +39,9 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,

 KV_CACHE_MODELS = [
    # Deprecated AutoFP8 format using .kv_scale
-    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+    os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"),
    # AutoFP8 format using separate .k_scale and .v_scale
-    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
+    os.path.join(models_path_prefix, "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V"),
 ]


@@ -73,7 +75,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
    if force_marlin:
        monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")

-    with vllm_runner("facebook/opt-125m",
+    with vllm_runner(os.path.join(models_path_prefix, "facebook/opt-125m"),
                     quantization="fp8",
                     kv_cache_dtype=kv_cache_dtype) as llm:


--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -6,6 +6,7 @@ from typing import Tuple

 import pytest
 import torch
+import os

 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
 from vllm.model_executor.layers.quantization.gptq_marlin import (
@@ -13,13 +14,14 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
 from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    UnquantizedEmbeddingMethod)
+from ..utils import models_path_prefix

 PROMPT = "On the surface of Mars, we found"

 MODELS_QUANT = [(
-    "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse",
-    True), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False),
-                ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)]
+    os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse"),
+    True), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), False),
+                (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), False)]


 @pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT)

--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -4,6 +4,8 @@ Run `pytest tests/samplers/test_beam_search.py`.
 """

 import pytest
+import os
+from ..utils import models_path_prefix

 # FIXME(zhuohan): The test can not pass if we:
 #   1. Increase max_tokens to 256.
@@ -11,7 +13,7 @@ import pytest
 #   3. Use the model "huggyllama/llama-7b".
 MAX_TOKENS = [64]
 BEAM_WIDTHS = [4]
-MODELS = ["TinyLlama/TinyLlama-1.1B-Chat-v1.0"]
+MODELS = [os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0")]


 @pytest.mark.parametrize("model", MODELS)

--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -4,12 +4,14 @@ Run `pytest tests/samplers/test_ignore_eos.py`.
 """

 import pytest
+import os

 from vllm import SamplingParams
+from ..utils import models_path_prefix

 # We also test with llama because it has generation_config to specify EOS
 # (past regression).
-MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
+MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m"), os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")]


 @pytest.mark.parametrize("model", MODELS)

--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
 import pytest
 import torch
+import os

 from vllm import SamplingParams
+from ..utils import models_path_prefix

-MODELS = ["facebook/opt-125m"]
+MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]


 @pytest.mark.parametrize("model", MODELS)

--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -2,12 +2,14 @@ from typing import List

 import pytest
 import torch
+import os

 from vllm import SamplingParams

 from ..conftest import VllmRunner
+from ..utils import models_path_prefix

-MODELS = ["facebook/opt-125m"]
+MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]


 @pytest.mark.parametrize("model", MODELS)
@@ -130,7 +132,7 @@ def test_get_prompt_logprobs(


 def test_max_logprobs():
-    runner = VllmRunner("facebook/opt-125m", max_logprobs=1)
+    runner = VllmRunner(os.path.join(models_path_prefix, "facebook/opt-125m"), max_logprobs=1)
    vllm_sampling_params = SamplingParams(logprobs=1)
    # should pass
    runner.generate(["Hello world"], sampling_params=vllm_sampling_params)

--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
 import pytest
+import os

 from vllm import SamplingParams
+from ..utils import models_path_prefix

-MODELS = ["facebook/opt-125m"]
+MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]


 @pytest.mark.parametrize("model", MODELS)

--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -7,11 +7,13 @@ import random
 from itertools import combinations

 import pytest
+import os

 from vllm import SamplingParams
 from vllm.model_executor.utils import set_random_seed
+from ..utils import models_path_prefix

-MODEL = "facebook/opt-125m"
+MODEL = os.path.join(models_path_prefix, "facebook/opt-125m")
 RANDOM_SEEDS = list(range(5))