add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path instead of Hugging Face Hub

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path instead of Hugging Face Hub

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path instead of Hugging Face Hub
add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path instead of Hugging Face Hub
3c9817d2 · zhuwenwen · 49204f68 · 3c9817d2 · 3c9817d2 · 3c9817d2
Commit 3c9817d2 authored Nov 27, 2024 by zhuwenwen
20 changed files
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -2,15 +2,16 @@ from typing import List, Optional, Tuple, Type

 import numpy as np
 import pytest
+import os
 from transformers import AutoModel, AutoTokenizer, BatchEncoding

 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

 from ....conftest import HfRunner, VllmRunner
-from ...utils import check_logprobs_close
+from ...utils import check_logprobs_close, models_path_prefix

-MODEL_NAME = "fixie-ai/ultravox-v0_3"
+MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3")

 AudioTuple = Tuple[np.ndarray, int]


--- a/tests/models/decoder_only/language/test_aqlm.py
+++ b/tests/models/decoder_only/language/test_aqlm.py
@@ -4,8 +4,10 @@ Run `pytest tests/models/test_aqlm.py`.
 """

 import pytest
+import os

 from tests.quantization.utils import is_quant_method_supported
+from ...utils import models_path_prefix

 # These ground truth generations were generated using `transformers==4.38.1
 # aqlm==1.1.0 torch==2.2.0`
@@ -40,7 +42,7 @@ ground_truth_generations = [

 @pytest.mark.skipif(not is_quant_method_supported("aqlm"),
                    reason="AQLM is not supported on this GPU type.")
-@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf")])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [16])
 @pytest.mark.parametrize("num_logprobs", [1])

--- a/tests/models/decoder_only/language/test_big_models.py
+++ b/tests/models/decoder_only/language/test_big_models.py
@@ -5,24 +5,26 @@ This tests bigger models and use half precision.
 Run `pytest tests/models/test_big_models.py`.
 """
 import pytest
+import os

 from vllm.platforms import current_platform

 from ...utils import check_outputs_equal
+from ....utils import models_path_prefix

 MODELS = [
-    "meta-llama/Llama-2-7b-hf",
+    os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
    # "mistralai/Mistral-7B-v0.1",  # Tested by test_mistral.py
    # "Deci/DeciLM-7b",  # Broken
    # "tiiuae/falcon-7b",  # Broken
-    "EleutherAI/gpt-j-6b",
+    os.path.join(models_path_prefix, "EleutherAI/gpt-j-6b"),
    # "mosaicml/mpt-7b",  # Broken
    # "Qwen/Qwen1.5-0.5B"  # Broken,
 ]

 if not current_platform.is_cpu():
    # MiniCPM requires fused_moe which is not supported by CPU
-    MODELS.append("openbmb/MiniCPM3-4B")
+    MODELS.append(os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"))

 #TODO: remove this after CPU float16 support ready
 target_dtype = "float" if current_platform.is_cpu() else "half"

--- a/tests/models/decoder_only/language/test_danube3_4b.py
+++ b/tests/models/decoder_only/language/test_danube3_4b.py
@@ -5,10 +5,12 @@ This tests danube3 separately because its head size isn't supported on CPU yet.
 Run `pytest tests/models/test_danube3_4b.py`.
 """
 import pytest
+import os

 from ...utils import check_outputs_equal
+from ....utils import models_path_prefix

-MODELS = ["h2oai/h2o-danube3-4b-base"]
+MODELS = [os.path.join(models_path_prefix, "h2oai/h2o-danube3-4b-base")]

 target_dtype = "half"


--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -11,6 +11,7 @@ from tests.kernels.utils import override_backend_env_variable
 from tests.quantization.utils import is_quant_method_supported

 from ...utils import check_logprobs_close
+from ....utils import models_path_prefix

 os.environ["TOKENIZERS_PARALLELISM"] = "true"

@@ -21,14 +22,14 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
    "kv_cache_dtype,base_model,test_model,scale_path",
    [
        # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
-        ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
+        ("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
+         os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV"), None),
        # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "meta-llama/Meta-Llama-3-8B-Instruct", None),
+        ("fp8_e5m2", os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
+         os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"), None),
        # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
-        ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
-         "meta-llama/Llama-2-7b-chat-hf",
+        ("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
+         os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
         "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
    ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens

--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -12,6 +12,7 @@ from transformers import AutoTokenizer
 from tests.quantization.utils import is_quant_method_supported

 from ...utils import check_logprobs_close
+from ....utils import models_path_prefix

 os.environ["TOKENIZERS_PARALLELISM"] = "true"

@@ -19,16 +20,16 @@ MAX_MODEL_LEN = 1024

 # FIXME: Move this to confest
 MODELS = [
-    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    (os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
     hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                     filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")),
-    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    (os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
     hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF",
                     filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")),
-    ("Qwen/Qwen2-1.5B-Instruct",
+    (os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"),
     hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
                     filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
-    ("Qwen/Qwen2-1.5B-Instruct",
+    (os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"),
     hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
                     filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
 ]

--- a/tests/models/decoder_only/language/test_gptq_marlin.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin.py
@@ -16,6 +16,7 @@ from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT

 from ...utils import check_logprobs_close
+from ....utils import models_path_prefix

 os.environ["TOKENIZERS_PARALLELISM"] = "true"

@@ -23,26 +24,26 @@ MAX_MODEL_LEN = 1024

 MODELS = [
    # act_order==False, group_size=channelwise
-    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
+    (os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"), "main"),
    # act_order==False, group_size=128
-    ("TheBloke/Llama-2-7B-GPTQ", "main"),
+    (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-GPTQ"), "main"),

    # act_order==True, group_size=128
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
+    (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "main"),
    # act_order==True, group_size=64
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
+    (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-4bit-64g-actorder_True"),
    # act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
+    (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-4bit-32g-actorder_True"),

    # 8-bit, act_order==True, group_size=channelwise
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
+    (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit--1g-actorder_True"),
    # 8-bit, act_order==True, group_size=128
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
+    (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit-128g-actorder_True"),
    # 8-bit, act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
+    (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit-32g-actorder_True"),

    # 4-bit, act_order==True, group_size=128
-    ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
+    (os.path.join(models_path_prefix, "TechxGenus/gemma-1.1-2b-it-GPTQ"), "main")
 ]



--- a/tests/models/decoder_only/language/test_gptq_marlin_24.py
+++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py
@@ -9,10 +9,12 @@ Run `pytest tests/models/test_marlin_24.py`.
 from dataclasses import dataclass

 import pytest
+import os

 from tests.quantization.utils import is_quant_method_supported

 from ...utils import check_logprobs_close
+from ....utils import models_path_prefix


 @dataclass
@@ -23,18 +25,18 @@ class ModelPair:

 model_pairs = [
    # 4-bit, group_size == 128
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
-              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
+    ModelPair(model_marlin=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"),
+              model_gptq=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-gptq-4bit-g128")),
    # 4-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
+    ModelPair(model_marlin=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-channelwise"),
+              model_gptq=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-gptq-4bit-channelwise")),

    # 8-bit, group_size == 128
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
-              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
+    ModelPair(model_marlin=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-8bit-g128"),
+              model_gptq=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-gptq-8bit-g128")),
    # 8-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
+    ModelPair(model_marlin=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-8bit-channelwise"),
+              model_gptq=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-gptq-8bit-channelwise")),
 ]



--- a/tests/models/decoder_only/language/test_granite.py
+++ b/tests/models/decoder_only/language/test_granite.py
@@ -3,12 +3,14 @@
 Run `pytest tests/models/test_granite.py`.
 """
 import pytest
+import os
 import transformers

 from ...utils import check_logprobs_close
+from ....utils import models_path_prefix

 MODELS = [
-    "ibm/PowerLM-3b",
+    os.path.join(models_path_prefix, "ibm/PowerLM-3b"),
 ]



--- a/tests/models/decoder_only/language/test_jamba.py
+++ b/tests/models/decoder_only/language/test_jamba.py
 import pytest
+import os

 from vllm.worker.model_runner import _get_graph_batch_size

 from ...utils import check_outputs_equal
+from ....utils import models_path_prefix

-MODELS = ["ai21labs/Jamba-tiny-random"]
+MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")]


 # Fails due to usage of MoE as MLP(E=1_, which is different than the HF impl

--- a/tests/models/decoder_only/language/test_marlin.py
+++ b/tests/models/decoder_only/language/test_marlin.py
@@ -13,10 +13,12 @@ Run `pytest tests/models/test_marlin.py`.
 from dataclasses import dataclass

 import pytest
+import os

 from tests.quantization.utils import is_quant_method_supported

 from ...utils import check_logprobs_close
+from ....utils import models_path_prefix


 @dataclass
@@ -26,12 +28,12 @@ class ModelPair:


 model_pairs = [
-    ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128",
-              model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"),
-    ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin",
-              model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"),
-    ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin",
-              model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq")
+    ModelPair(model_marlin=os.path.join(models_path_prefix, "nm-testing/zephyr-beta-7b-marlin-g128"),
+              model_gptq=os.path.join(models_path_prefix, "nm-testing/zephyr-beta-7b-gptq-g128")),
+    ModelPair(model_marlin=os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-marlin"),
+              model_gptq=os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq")),
+    ModelPair(model_marlin=os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"),
+              model_gptq=os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq"))
 ]



--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
@@ -3,14 +3,16 @@
 Run `pytest tests/models/test_mistral.py`.
 """
 import pytest
+import os

 from vllm import LLM, SamplingParams

 from ...utils import check_logprobs_close
+from ....utils import models_path_prefix

 MODELS = [
-    "mistralai/Mistral-7B-Instruct-v0.1",
-    "mistralai/Mistral-7B-Instruct-v0.3",
+    os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.1"),
+    os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3"),
    # Mistral-Nemo is to big for CI, but passes locally
    # "mistralai/Mistral-Nemo-Instruct-2407"
 ]

--- a/tests/models/decoder_only/language/test_modelopt.py
+++ b/tests/models/decoder_only/language/test_modelopt.py
@@ -10,12 +10,13 @@ from transformers import AutoTokenizer

 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
+from ....utils import models_path_prefix

 os.environ["TOKENIZERS_PARALLELISM"] = "true"

 MAX_MODEL_LEN = 1024

-MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
+MODELS = [os.path.join(models_path_prefix, "nvidia/Llama-3.1-8B-Instruct-FP8")]

 EXPECTED_STRS_MAP = {
    "nvidia/Llama-3.1-8B-Instruct-FP8": [

--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -6,20 +6,21 @@ test_big_models.py because it could use a larger instance to run tests.
 Run `pytest tests/models/test_models.py`.
 """
 import pytest
+import os

-from ...utils import check_outputs_equal
+from ...utils import check_outputs_equal, models_path_prefix

 MODELS = [
-    "facebook/opt-125m",
-    "gpt2",
-    "bigcode/tiny_starcoder_py",
-    "EleutherAI/pythia-70m",
-    "bigscience/bloom-560m",  # Testing alibi slopes.
-    "microsoft/phi-2",
-    "stabilityai/stablelm-3b-4e1t",
+    os.path.join(models_path_prefix, "facebook/opt-125m"),
+    os.path.join(models_path_prefix, "gpt2"),
+    os.path.join(models_path_prefix, "bigcode/tiny_starcoder_py"),
+    os.path.join(models_path_prefix, "EleutherAI/pythia-70m"),
+    os.path.join(models_path_prefix, "bigscience/bloom-560m"),  # Testing alibi slopes.
+    os.path.join(models_path_prefix, "microsoft/phi-2"),
+    os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t"),
    # "allenai/OLMo-1B",  # Broken
-    "bigcode/starcoder2-3b",
-    "google/gemma-1.1-2b-it",
+    os.path.join(models_path_prefix, "bigcode/starcoder2-3b"),
+    os.path.join(models_path_prefix, "google/gemma-1.1-2b-it"),
 ]



--- a/tests/models/decoder_only/language/test_phimoe.py
+++ b/tests/models/decoder_only/language/test_phimoe.py
@@ -4,13 +4,15 @@ Run `pytest tests/models/test_phimoe.py`.
 """
 import pytest
 import torch
+import os

 from vllm.utils import is_cpu

 from ...utils import check_logprobs_close
+from ....utils import models_path_prefix

 MODELS = [
-    "microsoft/Phi-3.5-MoE-instruct",
+    os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"),
 ]



--- a/tests/models/decoder_only/vision_language/test_blip2.py
+++ b/tests/models/decoder_only/vision_language/test_blip2.py
 from typing import List, Optional, Tuple

 import pytest
+import os
 from transformers import AutoModelForVision2Seq, AutoTokenizer

 from vllm.multimodal.utils import rescale_image_size
@@ -8,6 +9,7 @@ from vllm.sequence import SampleLogprobs

 from ....conftest import IMAGE_ASSETS
 from ...utils import check_logprobs_close
+from ....utils import models_path_prefix

 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -33,7 +35,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
    return hf_output_ids, hf_output_str, out_logprobs


-@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"])
+@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")])
 @pytest.mark.parametrize(
    "size_factors",
    [

--- a/tests/models/decoder_only/vision_language/test_broadcast.py
+++ b/tests/models/decoder_only/vision_language/test_broadcast.py
 import pytest
+import os

 from ....utils import multi_gpu_test
+from ....utils import models_path_prefix


 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
 @pytest.mark.parametrize("model", [
-    "llava-hf/llava-1.5-7b-hf",
-    "llava-hf/llava-v1.6-mistral-7b-hf",
-    "facebook/chameleon-7b",
+    os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
+    os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf"),
+    os.path.join(models_path_prefix, "facebook/chameleon-7b"),
 ])
 def test_models(hf_runner, vllm_runner, image_assets,
                distributed_executor_backend, model) -> None:

--- a/tests/models/decoder_only/vision_language/test_chameleon.py
+++ b/tests/models/decoder_only/vision_language/test_chameleon.py
 from typing import List, Optional, Type

 import pytest
+import os
 from transformers import AutoModelForVision2Seq, BatchEncoding

 from vllm.multimodal.utils import rescale_image_size
@@ -8,6 +9,7 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE

 from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 from ...utils import check_outputs_equal
+from ....utils import models_path_prefix

 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -16,7 +18,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "USER: <image>\nWhat is the season?\nASSISTANT:",
 })

-models = ["facebook/chameleon-7b"]
+models = [os.path.join(models_path_prefix, "facebook/chameleon-7b")]


 def run_test(

--- a/tests/models/decoder_only/vision_language/test_fuyu.py
+++ b/tests/models/decoder_only/vision_language/test_fuyu.py
 from typing import List, Optional, Tuple, Type

 import pytest
+import os

 from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
@@ -8,6 +9,7 @@ from vllm.utils import is_cpu

 from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 from ...utils import check_logprobs_close
+from ....utils import models_path_prefix

 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -16,7 +18,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "What is the season?\n",
 })

-models = ["adept/fuyu-8b"]
+models = [os.path.join(models_path_prefix, "adept/fuyu-8b")]


 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,

--- a/tests/models/decoder_only/vision_language/test_glm4.py
+++ b/tests/models/decoder_only/vision_language/test_glm4.py
 from typing import List, Optional, Tuple, Type

 import pytest
+import os

 from vllm.multimodal.utils import rescale_image_size
 from vllm.transformers_utils.tokenizer import patch_padding_side
@@ -8,6 +9,7 @@ from vllm.transformers_utils.tokenizer import patch_padding_side
 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
 from ....utils import large_gpu_test
 from ...utils import check_logprobs_close
+from ....utils import models_path_prefix

 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
@@ -16,7 +18,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "What is the season?",
 })

-models = ["THUDM/glm-4v-9b"]
+models = [os.path.join(models_path_prefix, "THUDM/glm-4v-9b")]
 target_dtype = "bfloat16"