add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...
Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub
3c9817d2 · zhuwenwen · 49204f68 · 3c9817d2 · 3c9817d2 · 3c9817d2
Commit 3c9817d2 authored Nov 27, 2024 by zhuwenwen
20 changed files
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -6,6 +6,8 @@ from pathlib import Path

 import pytest
 import requests
+import os
+from ..utils import models_path_prefix


 def _query_server(prompt: str, max_tokens: int = 5) -> dict:
@@ -30,7 +32,7 @@ def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
        "api_server_async_engine.py").absolute()
    commands = [
        sys.executable, "-u",
-        str(script_path), "--model", "facebook/opt-125m", "--host",
+        str(script_path), "--model", os.path.join(models_path_prefix, "facebook/opt-125m"), "--host",
        "127.0.0.1", "--tokenizer-pool-size",
        str(tokenizer_pool_size)
    ]

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -18,7 +18,8 @@ from vllm.sampling_params import RequestOutputKind

 from ..conftest import cleanup
 from ..utils import wait_for_gpu_memory_to_clear
-
+import os
+from ..utils import models_path_prefix

 @dataclass
 class RequestOutput:
@@ -136,7 +137,7 @@ def start_engine():
    print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")

    return AsyncLLMEngine.from_engine_args(
-        AsyncEngineArgs(model="facebook/opt-125m",
+        AsyncEngineArgs(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
                        enforce_eager=True,
                        num_scheduler_steps=num_scheduler_steps))


--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -16,10 +16,12 @@ from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata

 from ..models.utils import check_outputs_equal
 from ..utils import multi_gpu_test
+import os
+from ..utils import models_path_prefix

 MODELS = [
-    "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-hf",
+    os.path.join(models_path_prefix, "facebook/opt-125m"),
+    os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
 ]

 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@@ -27,7 +29,7 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")

 def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
-    llm = LLM("facebook/opt-125m")
+    llm = LLM(os.path.join(models_path_prefix, "facebook/opt-125m"))
    weak_llm = weakref.ref(llm)
    del llm
    # If there's any circular reference to vllm, this fails
@@ -78,14 +80,14 @@ def test_models(
 # @pytest.mark.parametrize(
 #     "model, distributed_executor_backend, attention_backend, "
 #     "test_suite", [
-#         ("facebook/opt-125m", "ray", "", "L4"),
-#         ("facebook/opt-125m", "mp", "", "L4"),
-#         ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
-#         ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
-#         ("facebook/opt-125m", "ray", "", "A100"),
-#         ("facebook/opt-125m", "mp", "", "A100"),
-#         ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
-#         ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "L4"),
+#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "L4"),
+#         (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "ray", "", "L4"),
+#         (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "mp", "", "L4"),
+#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "A100"),
+#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "A100"),
+#         (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "FLASHINFER", "A100"),
+#         (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray", "FLASHINFER", "A100"),
 #     ])
 # def test_models_distributed(
 #     hf_runner,
@@ -138,7 +140,7 @@ def test_model_with_failure(vllm_runner) -> None:
        with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
                   side_effect=ValueError()):
            with pytest.raises(ValueError) as exc_info:
-                vllm_runner("facebook/opt-125m",
+                vllm_runner(os.path.join(models_path_prefix, "facebook/opt-125m"),
                            dtype="half",
                            enforce_eager=False,
                            gpu_memory_utilization=0.7)

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -13,10 +13,12 @@ import pytest

 from ..models.utils import check_logprobs_close, check_outputs_equal
 from ..utils import multi_gpu_test
+import os
+from ..utils import models_path_prefix

 MODELS = [
-    "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-hf",
+    os.path.join(models_path_prefix, "facebook/opt-125m"),
+    os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
 ]


@@ -207,7 +209,7 @@ def test_with_prefix_caching(
    Checks exact match decode with and without prefix caching
    with chunked prefill enabled.
    """
-    model = "meta-llama/Llama-2-7b-chat-hf"
+    model = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf")
    # The common prompt has 142 tokens with Llama-2 tokenizer.
    common_prompt = "You are a helpful AI assistant " * 20
    unique_prompts = [

--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
-from ..utils import compare_two_settings
+import os
+from ..utils import compare_two_settings, models_path_prefix


 def test_cpu_offload():
-    compare_two_settings("meta-llama/Llama-2-7b-hf", [],
+    compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), [],
                         ["--cpu-offload-gb", "4"])
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -15,9 +15,12 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,

 from ..models.utils import check_outputs_equal

+from ..utils import models_path_prefix
+import os
+
 MODELS = [
-    "facebook/opt-125m",
-]
+    os.path.join(models_path_prefix, "facebook/opt-125m"),
+]


 @pytest.fixture(scope="module", autouse=True)

--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -6,64 +6,66 @@ from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
 from vllm.plugins import set_torch_compile_backend
 from vllm.utils import is_hip
+import os
+from ..utils import models_path_prefix

 TEST_MODELS_SMOKE = [
-    ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
+    (os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"), {
        "quantization": "compressed-tensors"
    }),
-    ("meta-llama/Meta-Llama-3-8B", {}),
+    (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), {}),
 ]

 TEST_MODELS = [
-    ("facebook/opt-125m", {}),
-    ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
+    (os.path.join(models_path_prefix, "facebook/opt-125m"), {}),
+    (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), {
        "dtype": torch.float16,
        "quantization": "compressed-tensors"
    }),
-    ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
+    (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), {
        "dtype": torch.float16,
        "quantization": "fp8"
    }),
-    ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
+    (os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"), {
        "quantization": "compressed-tensors"
    }),
-    ("meta-llama/Meta-Llama-3-8B", {}),
+    (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), {}),
 ]

 # TODO: enable in pytorch 2.5
 if False and is_quant_method_supported("aqlm"):  # noqa: SIM223
-    TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
+    TEST_MODELS.append((os.path.join(models_path_prefix, "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"), {
        "quantization": "aqlm"
    }))

 # TODO: enable in pytorch 2.5
 if False and is_quant_method_supported("gguf"):  # noqa: SIM223
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
+    TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"), {
        "quantization": "gguf"
    }))

 if is_quant_method_supported("gptq"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
+    TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"), {
        "quantization": "gptq"
    }))

 if is_quant_method_supported("gptq_marlin"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
+    TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), {
        "quantization": "gptq_marlin"
    }))

 if is_quant_method_supported("gptq_marlin_24"):
-    TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
+    TEST_MODELS.append((os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"), {
        "quantization": "gptq_marlin_24"
    }))

 if is_quant_method_supported("marlin"):
-    TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
+    TEST_MODELS.append((os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"), {
        "quantization": "marlin"
    }))

 if not is_hip() and is_quant_method_supported("awq"):
-    TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
+    TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"), {
        "quantization": "AWQ"
    }))


--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -37,6 +37,7 @@ from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
                        identity, is_cpu)
+from tests.utils import models_path_prefix

 logger = init_logger(__name__)

@@ -875,8 +876,9 @@ def num_gpus_available():
    return cuda_device_count_stateless()


-temp_dir = tempfile.gettempdir()
-_dummy_path = os.path.join(temp_dir, "dummy_opt")
+# temp_dir = tempfile.gettempdir()
+# _dummy_path = os.path.join(temp_dir, "dummy_opt")
+_dummy_path = os.path.join(models_path_prefix, "facebook/opt-125m")


 @pytest.fixture

--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -5,13 +5,15 @@ import pytest
 from vllm import SamplingParams

 from .conftest import get_token_ids_from_llm_generator
+import os
+from ....utils import models_path_prefix


 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+        "model": os.path.join(models_path_prefix, "facebook/opt-125m"),

        # skip cuda graph creation for fast test.
        "enforce_eager": True,
@@ -89,7 +91,7 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+        "model": os.path.join(models_path_prefix, "facebook/opt-125m"),

        # skip cuda graph creation for fast test.
        "enforce_eager": True,
@@ -156,7 +158,7 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+        "model": os.path.join(models_path_prefix, "facebook/opt-125m"),

        # Our prompts will generate 128 tokens; since the prompts themselves are
        # small, we don't need much KV space beyond 128.
@@ -256,7 +258,7 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
    [
        {
            # Use a small model for a fast test.
-            "model": "facebook/opt-125m",
+            "model": os.path.join(models_path_prefix, "facebook/opt-125m"),

            # skip cuda graph creation for fast test.
            "enforce_eager": True,
@@ -337,7 +339,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+        "model": os.path.join(models_path_prefix, "facebook/opt-125m"),

        # skip cuda graph creation for fast test.
        "enforce_eager": True,
@@ -418,7 +420,7 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+        "model": os.path.join(models_path_prefix, "facebook/opt-125m"),

        # skip cuda graph creation for fast test.
        "enforce_eager": True,
@@ -495,7 +497,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
-        "model": "facebook/opt-125m",
+        "model": os.path.join(models_path_prefix, "facebook/opt-125m"),

        # skip cuda graph creation for fast test.
        "enforce_eager": True,

--- a/tests/core/block/e2e/test_correctness_sliding_window.py
+++ b/tests/core/block/e2e/test_correctness_sliding_window.py
@@ -2,13 +2,15 @@ import random
 from typing import List

 import pytest
+import os

 from vllm import LLM, SamplingParams

 from .conftest import get_text_from_llm_generator
+from ....utils import models_path_prefix

 # relatively small model with 4k sliding window
-MODEL = "bigcode/starcoder2-3b"
+MODEL = os.path.join(models_path_prefix, "bigcode/starcoder2-3b")
 BLOCK_SIZE = 16



--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -13,7 +13,7 @@ from transformers import __version__ as transformers_version

 from vllm.logger import init_logger

-from ..utils import compare_two_settings, fork_new_process_for_each_test
+from ..utils import compare_two_settings, fork_new_process_for_each_test, models_path_prefix

 logger = init_logger("test_pipeline_parallel")

@@ -24,22 +24,22 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
    ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
     "MODEL_NAME, DIST_BACKEND"),
    [
-        (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-        (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-        (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-        (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-        (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
-        (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-        (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-        (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-        (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
-        (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
+        (2, 2, 0, 1, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
+        (2, 2, 1, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
+        (1, 3, 0, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
+        (1, 4, 0, 1, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
+        (1, 4, 1, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
+        (1, 3, 0, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
+        (1, 4, 0, 1, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
+        (1, 4, 1, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
+        (2, 2, 1, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
+        (2, 2, 0, 1, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
        # NOTE: InternVL2 multi-node tests are flaky,
        # use mp backend to skip the multi-node tests
-        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
-        (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
-        (1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
-        (1, 2, 0, 1, 0, "Qwen/Qwen2-VL-2B-Instruct", "mp")
+        (1, 2, 1, 1, 1, os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"), "mp"),
+        (1, 2, 1, 1, 1, os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B"), "mp"),
+        (1, 2, 1, 0, 1, os.path.join(models_path_prefix, "OpenGVLab/InternVL2-4B"), "mp"),
+        (1, 2, 0, 1, 0, os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"), "mp")
    ],
 )
 @fork_new_process_for_each_test

--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
@@ -2,15 +2,15 @@ import os

 import pytest

-from ..utils import compare_two_settings, fork_new_process_for_each_test
+from ..utils import compare_two_settings, fork_new_process_for_each_test, models_path_prefix


 @pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
-    (2, "JackFram/llama-160m"),
+    (2, os.path.join(models_path_prefix, "JackFram/llama-160m")),
 ])
 @pytest.mark.parametrize("ATTN_BACKEND", [
    "FLASH_ATTN",
-    "FLASHINFER",
+    # "FLASHINFER",
 ])
 @fork_new_process_for_each_test
 def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):

--- a/tests/encoder_decoder/test_e2e_correctness.py
+++ b/tests/encoder_decoder/test_e2e_correctness.py
@@ -5,6 +5,7 @@ Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
 from typing import List, Optional, Tuple

 import pytest
+import os
 from transformers import AutoModelForSeq2SeqLM

 from vllm.sequence import SampleLogprobs
@@ -12,6 +13,7 @@ from vllm.utils import is_cpu

 from ..conftest import DecoderPromptType
 from ..models.utils import check_logprobs_close
+from ..utils import models_path_prefix


 def vllm_to_hf_output(
@@ -28,7 +30,7 @@ def vllm_to_hf_output(
    return output_ids, hf_output_str, out_logprobs


-@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/bart-large-cnn")])
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])

--- a/tests/engine/test_computed_prefix_blocks.py
+++ b/tests/engine/test_computed_prefix_blocks.py
@@ -3,9 +3,11 @@ import pytest
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.sampling_params import SamplingParams
+import os
+from ..utils import models_path_prefix


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
 @pytest.mark.parametrize("block_size", [16])
 def test_computed_prefix_blocks(model: str, block_size: int):
    # This test checks if we are able to run the engine to completion

--- a/tests/engine/test_custom_executor.py
+++ b/tests/engine/test_custom_executor.py
@@ -8,6 +8,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
 from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync
 from vllm.sampling_params import SamplingParams
+import os
+from ..utils import models_path_prefix


 class Mock:
@@ -31,7 +33,7 @@ class CustomGPUExecutorAsync(GPUExecutorAsync):
        return await super().execute_model_async(*args, **kwargs)


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
 def test_custom_executor_type_checking(model):
    with pytest.raises(ValueError):
        engine_args = EngineArgs(model=model,
@@ -47,7 +49,7 @@ def test_custom_executor_type_checking(model):
        AsyncLLMEngine.from_engine_args(engine_args)


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
 def test_custom_executor(model, tmpdir):
    cwd = os.path.abspath(".")
    os.chdir(tmpdir)
@@ -67,7 +69,7 @@ def test_custom_executor(model, tmpdir):
        os.chdir(cwd)


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
 def test_custom_executor_async(model, tmpdir):
    cwd = os.path.abspath(".")
    os.chdir(tmpdir)

--- a/tests/engine/test_detokenization.py
+++ b/tests/engine/test_detokenization.py
@@ -2,9 +2,11 @@ import pytest

 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams
+import os
+from ..utils import models_path_prefix


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
 def test_computed_prefix_blocks(model: str):
    # This test checks if the engine generates completions both with and
    # without optional detokenization, that detokenization includes text

--- a/tests/engine/test_skip_tokenizer_init.py
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -2,9 +2,11 @@ import pytest

 from vllm.entrypoints.llm import LLM
 from vllm.sampling_params import SamplingParams
+import os
+from ..utils import models_path_prefix


-@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
 def test_skip_tokenizer_initialization(model: str):
    # This test checks if the flag skip_tokenizer_init skips the initialization
    # of tokenizer and detokenizer. The generated output is expected to contain

--- a/tests/engine/test_stop_reason.py
+++ b/tests/engine/test_stop_reason.py
@@ -7,11 +7,13 @@ Run `pytest tests/engine/test_stop_reason.py`.
 """

 import pytest
+import os
 import transformers

 from vllm import SamplingParams
+from ..utils import models_path_prefix

-MODEL = "facebook/opt-350m"
+MODEL = os.path.join(models_path_prefix, "facebook/opt-350m")
 STOP_STR = "."
 SEED = 42
 MAX_TOKENS = 1024

--- a/tests/engine/test_stop_strings.py
+++ b/tests/engine/test_stop_strings.py
 from typing import Any, List, Optional

 import pytest
+import os

 from vllm import CompletionOutput, LLMEngine, SamplingParams
+from ..utils import models_path_prefix

-MODEL = "meta-llama/llama-2-7b-hf"
+MODEL = os.path.join(models_path_prefix, "meta-llama/llama-2-7b-hf")
 MAX_TOKENS = 200

 IS_ASYNC = False

--- a/tests/entrypoints/llm/test_encode.py
+++ b/tests/entrypoints/llm/test_encode.py
@@ -2,12 +2,14 @@ import weakref
 from typing import List

 import pytest
+import os

 from vllm import LLM, EmbeddingRequestOutput, PoolingParams

 from ...conftest import cleanup
+from ...utils import models_path_prefix

-MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
+MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")

 PROMPTS = [
    "Hello, my name is",