Merge branch 'v0.8.5.post1-dev' into v0.8.5-zero_overhead

bd363067 · lizhigong · 87ef4618 · d36deb1a · bd363067 · bd363067
Commit bd363067 authored Jun 05, 2025 by lizhigong
20 changed files
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -6,10 +6,10 @@ Run `pytest tests/samplers/test_beam_search.py`.

 import pytest
 import os
-from ..utils import models_path_prefix
 from transformers import AutoModelForSeq2SeqLM

 from vllm.assets.audio import AudioAsset
+from ..utils import models_path_prefix


 @pytest.fixture(autouse=True)
@@ -83,7 +83,7 @@ def test_beam_search_passes_multimodal_data(
    # correctly. As such, we just need to check one extra modality to make
    # sure things pass through properly.
    audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate]
-    model = "Qwen/Qwen2-Audio-7B-Instruct"
+    model = os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct")
    audio_seq = "<|audio_bos|><|AUDIO|><|audio_eos|>"
    prompts = [
        f"<|im_start|>user\n{audio_seq}Can you transcribe this?<|im_end|>\n<|im_start|>assistant\n"  #noqa: E501

--- a/tests/spec_decode/e2e/test_compatibility.py
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -19,7 +19,7 @@ from ...utils import models_path_prefix
        {
            # Speculative max model len > overridden max model len should raise.
            "speculative_config": {
-                "model": "JackFram/llama-68m",
+                "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
                "num_speculative_tokens": 5,
                "max_model_len": 129,
            },
@@ -29,7 +29,7 @@ from ...utils import models_path_prefix
            # Speculative max model len > draft max model len should raise.
            # https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
            "speculative_config": {
-                "model": "JackFram/llama-68m",
+                "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
                "num_speculative_tokens": 5,
                "max_model_len": 2048 + 1,
            },
@@ -38,7 +38,7 @@ from ...utils import models_path_prefix
            # Speculative max model len > target max model len should raise.
            # https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
            "speculative_config": {
-                "model": "JackFram/llama-68m",
+                "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
                "num_speculative_tokens": 5,
                "max_model_len": 131072 + 1,
            },

--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
@@ -332,14 +332,14 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
        "dtype": "float16",

        # Main model
-        "model_name": "meta-llama/Llama-2-7b-chat-hf",
+        "model_name": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_config": {
-            "model": "yuhuili/EAGLE-llama2-chat-7B",
+            "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-llama2-chat-7B"),
            "num_speculative_tokens": MAX_SPEC_TOKENS,
        },
    },
@@ -382,14 +382,14 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
        "dtype": "float16",

        # Main model
-        "model_name": "meta-llama/Meta-Llama-3-8B-Instruct",
+        "model_name": os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_config": {
-            "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
+            "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
            "num_speculative_tokens": MAX_SPEC_TOKENS,
        },
    },
@@ -432,14 +432,14 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
        "dtype": "float16",

        # Main model
-        "model_name": "Qwen/Qwen2-7B-Instruct",
+        "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2-7B-Instruct"),
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [
    {
        "speculative_config": {
-            "model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
+            "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-Qwen2-7B-Instruct"),
            "num_speculative_tokens": MAX_SPEC_TOKENS,
        },
    },

--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
@@ -69,7 +69,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
        # Explicitly specify draft model quantization
        {
            "speculative_config": {
-                "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
+                "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
                "num_speculative_tokens": 5,
                "quantization": "gptq",
            },
@@ -77,7 +77,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
        # Explicitly specify GPTQ-based draft model to use marlin quantization
        {
            "speculative_config": {
-                "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
+                "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
                "num_speculative_tokens": 5,
                "quantization": "marlin",
            },
@@ -85,7 +85,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
        # Not explicitly specify draft model quantization
        {
            "speculative_config": {
-                "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
+                "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
                "num_speculative_tokens": 5,
                "quantization": None,
            },
@@ -124,7 +124,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{
    "speculative_config": {
-        "model": "JackFram/llama-68m",
+        "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
        "num_speculative_tokens": 3,
        "disable_mqa_scorer": True,
    },

--- a/tests/spec_decode/e2e/test_mtp_correctness.py
+++ b/tests/spec_decode/e2e/test_mtp_correctness.py
@@ -20,12 +20,14 @@ With those tests, we can say at least, mtp would not break the
 correctness for the target model outputs.
 """

+import os
 import pytest

 from .conftest import run_equality_correctness_test
+from ...utils import models_path_prefix

 # main model
-MAIN_MODEL = "luccafong/deepseek_mtp_main_random"
+MAIN_MODEL = os.path.join(models_path_prefix, "luccafong/deepseek_mtp_main_random")

 # max. number of speculative tokens: this corresponds to
 # num_nextn_predict_layers in the config.json of the speculator model.

--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -334,7 +334,7 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "model_name": "JackFram/llama-68m",
+        "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -484,107 +484,107 @@ def test_multi_step_with_batch_expansion_incorrect_output():
    assert (num_mismatch > 0)


-@torch.inference_mode()
-@pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
-# The choice of backends forces the multi_step_worker to choose between
-# the vanilla model_runner and TP1DraftModelRunner and that we can test
-# both code paths.
-@pytest.mark.parametrize('attn_backend',
-                         [_Backend.XFORMERS, _Backend.FLASH_ATTN])
-def test_multi_step_correct_kvcache(num_steps, attn_backend):
-    """Verify that the KV cache of the draft model 
-    is correctly updated for sequences with bonus token.
-    """
-    seed = 100
-    model_name = "JackFram/llama-68m"
-
-    block_size = 16
-    num_gpu_blocks = 2048 // block_size
-    batch_size = 1
-
-    with global_force_attn_backend_context_manager(attn_backend):
-        dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
-        multi_step_worker = create_worker(MultiStepWorker,
-                                          model_name,
-                                          block_size,
-                                          num_gpu_blocks,
-                                          seed,
-                                          model_runner_cls=TP1DraftModelRunner,
-                                          dtype=dtype)
-        multi_step_worker.set_include_gpu_probs_tensor()
-        worker = create_worker(Worker,
-                               model_name,
-                               block_size,
-                               num_gpu_blocks,
-                               seed,
-                               dtype=dtype)
-
-        prompts = [[0] for _ in range(batch_size)]
-        # Already generate two tokens for the sequence
-        # so that we can simulate the bonus token case
-        multi_step_continuations = [[
-            random.randint(0, 1000),
-            random.randint(0, 1000)
-        ] for _ in prompts]
-        final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
-
-        seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
-        seq_group_metadata_list = create_seq_group_metadata_from_prompts(
-            prompts,
-            num_gpu_blocks,
-            block_size,
-            continuations=multi_step_continuations,
-            final_prompt_lens=final_prompt_lens)
-
-        # Run multi-step.
-        zero_kv_cache(multi_step_worker.cache_engine)
-        multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
-            seq_group_metadata_list=seq_group_metadata_list),
-                                         sample_len=num_steps,
-                                         seq_ids_with_bonus_token_in_last_step=
-                                         seq_ids_with_bonus_token_in_last_step)
-
-        # Run single-step repeatedly.
-        zero_kv_cache(worker.cache_engine)
-        # Generate the kv cache for the bonus token first
-        single_step_continuations = [c[:1] for c in multi_step_continuations]
-        seq_group_metadata_list = create_seq_group_metadata_from_prompts(
-            prompts,
-            num_gpu_blocks,
-            block_size,
-            continuations=single_step_continuations,
-            final_prompt_lens=final_prompt_lens)
-        single_step_output = worker.execute_model(
-            execute_model_req=ExecuteModelRequest(
-                seq_group_metadata_list=seq_group_metadata_list))
-        for _ in range(num_steps):
-            seq_group_metadata_list = create_seq_group_metadata_from_prompts(
-                prompts,
-                num_gpu_blocks,
-                block_size,
-                continuations=multi_step_continuations,
-                final_prompt_lens=final_prompt_lens)
-
-            single_step_output = worker.execute_model(
-                execute_model_req=ExecuteModelRequest(
-                    seq_group_metadata_list=seq_group_metadata_list))
-
-            for i, seq_group_output in enumerate(single_step_output[-1]):
-                multi_step_continuations[i].append(
-                    seq_group_output.samples[0].output_token)
-
-        # Verify that the KV cache of the single-step and
-        # multi-step workers are the same.
-        single_step_gpu_cache = worker.cache_engine[0].gpu_cache
-        multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
-        num_layers = len(single_step_gpu_cache)
-        allclose = lambda a, b: torch.allclose(
-            a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
-        for i in range(num_layers):
-            assert allclose(single_step_gpu_cache[i][0],
-                            multi_step_gpu_cache[i][0])
-            assert allclose(single_step_gpu_cache[i][1],
-                            multi_step_gpu_cache[i][1])
+# @torch.inference_mode()
+# @pytest.mark.parametrize('num_steps', [1, 2, 3, 4])
+# # The choice of backends forces the multi_step_worker to choose between
+# # the vanilla model_runner and TP1DraftModelRunner and that we can test
+# # both code paths.
+# @pytest.mark.parametrize('attn_backend',
+#                          [_Backend.XFORMERS, _Backend.FLASH_ATTN])
+# def test_multi_step_correct_kvcache(num_steps, attn_backend):
+#     """Verify that the KV cache of the draft model 
+#     is correctly updated for sequences with bonus token.
+#     """
+#     seed = 100
+#     model_name = "JackFram/llama-68m"
+
+#     block_size = 16
+#     num_gpu_blocks = 2048 // block_size
+#     batch_size = 1
+
+#     with global_force_attn_backend_context_manager(attn_backend):
+#         dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32'
+#         multi_step_worker = create_worker(MultiStepWorker,
+#                                           model_name,
+#                                           block_size,
+#                                           num_gpu_blocks,
+#                                           seed,
+#                                           model_runner_cls=TP1DraftModelRunner,
+#                                           dtype=dtype)
+#         multi_step_worker.set_include_gpu_probs_tensor()
+#         worker = create_worker(Worker,
+#                                model_name,
+#                                block_size,
+#                                num_gpu_blocks,
+#                                seed,
+#                                dtype=dtype)
+
+#         prompts = [[0] for _ in range(batch_size)]
+#         # Already generate two tokens for the sequence
+#         # so that we can simulate the bonus token case
+#         multi_step_continuations = [[
+#             random.randint(0, 1000),
+#             random.randint(0, 1000)
+#         ] for _ in prompts]
+#         final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts]
+
+#         seq_ids_with_bonus_token_in_last_step = set(range(batch_size))
+#         seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+#             prompts,
+#             num_gpu_blocks,
+#             block_size,
+#             continuations=multi_step_continuations,
+#             final_prompt_lens=final_prompt_lens)
+
+#         # Run multi-step.
+#         zero_kv_cache(multi_step_worker.cache_engine)
+#         multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest(
+#             seq_group_metadata_list=seq_group_metadata_list),
+#                                          sample_len=num_steps,
+#                                          seq_ids_with_bonus_token_in_last_step=
+#                                          seq_ids_with_bonus_token_in_last_step)
+
+#         # Run single-step repeatedly.
+#         zero_kv_cache(worker.cache_engine)
+#         # Generate the kv cache for the bonus token first
+#         single_step_continuations = [c[:1] for c in multi_step_continuations]
+#         seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+#             prompts,
+#             num_gpu_blocks,
+#             block_size,
+#             continuations=single_step_continuations,
+#             final_prompt_lens=final_prompt_lens)
+#         single_step_output = worker.execute_model(
+#             execute_model_req=ExecuteModelRequest(
+#                 seq_group_metadata_list=seq_group_metadata_list))
+#         for _ in range(num_steps):
+#             seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+#                 prompts,
+#                 num_gpu_blocks,
+#                 block_size,
+#                 continuations=multi_step_continuations,
+#                 final_prompt_lens=final_prompt_lens)
+
+#             single_step_output = worker.execute_model(
+#                 execute_model_req=ExecuteModelRequest(
+#                     seq_group_metadata_list=seq_group_metadata_list))
+
+#             for i, seq_group_output in enumerate(single_step_output[-1]):
+#                 multi_step_continuations[i].append(
+#                     seq_group_output.samples[0].output_token)
+
+#         # Verify that the KV cache of the single-step and
+#         # multi-step workers are the same.
+#         single_step_gpu_cache = worker.cache_engine[0].gpu_cache
+#         multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache
+#         num_layers = len(single_step_gpu_cache)
+#         allclose = lambda a, b: torch.allclose(
+#             a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2)
+#         for i in range(num_layers):
+#             assert allclose(single_step_gpu_cache[i][0],
+#                             multi_step_gpu_cache[i][0])
+#             assert allclose(single_step_gpu_cache[i][1],
+#                             multi_step_gpu_cache[i][1])


 @torch.inference_mode()

--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -5,6 +5,7 @@ from collections import defaultdict
 from types import SimpleNamespace
 from unittest.mock import MagicMock

+import os
 import pytest
 import torch

@@ -24,6 +25,7 @@ from vllm.worker.worker import Worker
 from .test_utils import mock_spec_decode_sampler
 from .utils import (create_batch, create_sampler_output_list, create_worker,
                    mock_worker)
+from ..utils import models_path_prefix


 @pytest.mark.parametrize('k', [1, 2, 6])
@@ -918,14 +920,14 @@ def test_correctly_load_weight_for_eagle():
    num_gpu_blocks = 8096 // block_size
    target_worker = create_worker(
        Worker,
-        "JackFram/llama-68m",
+        os.path.join(models_path_prefix, "JackFram/llama-68m"),
        block_size,
        num_gpu_blocks,
        seed,
    )
    draft_worker = create_worker(
        MultiStepWorker,
-        "abhigoyal/vllm-eagle-llama-68m-random",
+        os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-random"),
        block_size,
        num_gpu_blocks,
        seed,

--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -7,6 +7,7 @@ import pathlib
 import subprocess
 from functools import partial
 from unittest.mock import MagicMock, patch
+from typing import List, Tuple, Optional

 import openai
 import pytest
@@ -15,6 +16,7 @@ from huggingface_hub import snapshot_download

 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
+from vllm.lora.request import LoRARequest
 # yapf conflicts with isort for this docstring
 # yapf: disable
 from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
@@ -243,7 +245,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
        EXAMPLES_PATH / "offline_inference/multilora_inference.py",
    )

-    model_ref = "meta-llama/Llama-2-7b-hf"
+    model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
    # lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
    lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
    test_prompts = multilora_inference.create_test_prompts(lora_path)

--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -142,7 +142,7 @@ def test_get_sliding_window():
 @pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Xformers backend is not supported on ROCm.")
 def test_get_pooling_config():
-    model_id = "sentence-transformers/all-MiniLM-L12-v2"
+    model_id = os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2")
    model_config = ModelConfig(
        model_id,
        task="auto",
@@ -164,7 +164,7 @@ def test_get_pooling_config():
 @pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Xformers backend is not supported on ROCm.")
 def test_get_pooling_config_from_args():
-    model_id = "sentence-transformers/all-MiniLM-L12-v2"
+    model_id = os.path.join(models_path_prefix, "sentence-transformers/all-MiniLM-L12-v2")
    model_config = ModelConfig(model_id,
                               task="auto",
                               tokenizer=model_id,
@@ -273,10 +273,10 @@ def test_rope_customization():
 @pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Encoder Decoder models not supported on ROCm.")
 @pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
-    ("facebook/opt-125m", False),
-    ("facebook/bart-base", True),
-    ("meta-llama/Llama-3.2-1B-Instruct", False),
-    ("meta-llama/Llama-3.2-11B-Vision", True),
+    (os.path.join(models_path_prefix, "facebook/opt-125m"), False),
+    (os.path.join(models_path_prefix, "facebook/bart-base"), True),
+    (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), False),
+    (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision"), True),
 ])
 def test_is_encoder_decoder(model_id, is_encoder_decoder):
    config = ModelConfig(
@@ -293,8 +293,8 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder):


 @pytest.mark.parametrize(("model_id", "uses_mrope"), [
-    ("facebook/opt-125m", False),
-    ("Qwen/Qwen2-VL-2B-Instruct", True),
+    (os.path.join(models_path_prefix, "facebook/opt-125m"), False),
+    (os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"), True),
 ])
 def test_uses_mrope(model_id, uses_mrope):
    config = ModelConfig(
@@ -311,7 +311,7 @@ def test_uses_mrope(model_id, uses_mrope):


 def test_generation_config_loading():
-    model_id = "Qwen/Qwen2.5-1.5B-Instruct"
+    model_id = os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct")

    # When set generation_config to "vllm", the default generation config
    # will not be loaded.

--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -5,6 +5,7 @@ It should include tests that are reported by users and making sure they
 will never happen again.

 """
+import os
 import gc

 import pytest
@@ -12,8 +13,9 @@ import torch

 from vllm import LLM, SamplingParams

-from utils import models_path_prefix
-import os
+from .utils import models_path_prefix
+from vllm.utils import SUPPORT_TC, gpuname
+import vllm.envs as envs


 @pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
@@ -23,7 +25,7 @@ def test_duplicated_ignored_sequence_group():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=256)
-    llm = LLM(model="distilbert/distilgpt2",
+    llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
              max_num_batched_tokens=4096,
              tensor_parallel_size=1)
    prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
@@ -36,7 +38,13 @@ def test_max_tokens_none():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=None)
-    llm = LLM(model="distilbert/distilgpt2",
+    if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
+        llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
+                max_num_batched_tokens=4096,
+                tensor_parallel_size=1,
+                block_size=64)
+    else:
+        llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
                max_num_batched_tokens=4096,
                tensor_parallel_size=1)
    prompts = ["Just say hello!"]
@@ -46,7 +54,7 @@ def test_max_tokens_none():


 def test_gc():
-    llm = LLM(model="distilbert/distilgpt2", enforce_eager=True)
+    llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"), enforce_eager=True)
    del llm

    gc.collect()
@@ -63,6 +71,9 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
    # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_MODELSCOPE", "True")
+        if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
+            llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"), block_size=64)
+        else:
            llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"))

        prompts = [

--- a/tests/test_sampling_params.py
+++ b/tests/test_sampling_params.py
@@ -2,13 +2,15 @@
 """Tests for the SamplingParams class.
 """

+import os
 import pytest

 from vllm import SamplingParams
 from vllm.config import ModelConfig
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from utils import models_path_prefix

-MODEL_NAME = "Qwen/Qwen1.5-7B"
+MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B")


 def test_max_tokens_none():

--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -8,6 +8,7 @@ import socket
 from collections.abc import AsyncIterator
 from unittest.mock import patch

+import os
 import pytest
 import torch
 from vllm_test_utils.monitor import monitor

--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -66,7 +66,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "llama": {
        "model":
-        "meta-llama/Meta-Llama-3.1-8B-Instruct",
+        os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3.1-8B-Instruct"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching",
            "--tool-call-parser", "llama3_json", "--chat-template",
@@ -77,7 +77,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "llama3.2": {
        "model":
-        "meta-llama/Llama-3.2-3B-Instruct",
+        os.path.join(models_path_prefix, "meta-llama/Llama-3.2-3B-Instruct"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching",
            "--tool-call-parser", "llama3_json", "--chat-template",
@@ -88,7 +88,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "llama4": {
        "model":
-        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching",
            "--tool-call-parser", "pythonic", "--chat-template",
@@ -103,7 +103,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "llama4_json": {
        "model":
-        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching", "-tp", "4",
            "--distributed-executor-backend", "mp", "--tool-call-parser",
@@ -149,7 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
    # },
    "granite-3.0-8b": {
        "model":
-        "ibm-granite/granite-3.0-8b-instruct",
+        os.path.join(models_path_prefix, "ibm-granite/granite-3.0-8b-instruct"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching",
            "--tool-call-parser", "granite", "--chat-template",
@@ -158,7 +158,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "granite-3.1-8b": {
        "model":
-        "ibm-granite/granite-3.1-8b-instruct",
+        os.path.join(models_path_prefix, "ibm-granite/granite-3.1-8b-instruct"),
        "arguments": [
            "--enforce-eager",
            "--no-enable-prefix-caching",
@@ -170,7 +170,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "internlm": {
        "model":
-        "internlm/internlm2_5-7b-chat",
+        os.path.join(models_path_prefix, "internlm/internlm2_5-7b-chat"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching",
            "--tool-call-parser", "internlm", "--chat-template",
@@ -183,7 +183,7 @@ CONFIGS: dict[str, ServerConfig] = {
    },
    "toolACE": {
        "model":
-        "Team-ACE/ToolACE-8B",
+        os.path.join(models_path_prefix, "Team-ACE/ToolACE-8B"),
        "arguments": [
            "--enforce-eager", "--no-enable-prefix-caching",
            "--tool-call-parser", "pythonic", "--chat-template",

--- a/tests/tpu/test_quantization_accuracy.py
+++ b/tests/tpu/test_quantization_accuracy.py
@@ -4,6 +4,8 @@ from dataclasses import dataclass

 import lm_eval
 import pytest
+import os
+from ..utils import models_path_prefix

 TASK = "gsm8k"
 FILTER = "exact_match,strict-match"
@@ -23,7 +25,7 @@ class GSM8KAccuracyTestConfig:
 # NOTE: Accuracy scores measured on GPUs.
 ACCURACY_CONFIGS = [
    GSM8KAccuracyTestConfig(
-        model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        model_name=os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"),
        excepted_value=0.76),  # no bias
    # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
    # so only one of these tests can run in a single call to pytest. As

--- a/tests/v1/core/__init__.py
+++ b/tests/v1/core/__init__.py
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
 # SPDX-License-Identifier: Apache-2.0

+import os
 import pytest
 import torch

@@ -22,6 +23,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                        KVCacheGroupSpec, KVCacheTensor)
 from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request
+from ...utils import models_path_prefix

 # yapf: enable

@@ -432,8 +434,8 @@ def test_unify_kv_cache_configs():

 @pytest.mark.parametrize(
    ("model_id", "max_model_len", "want_estimated_max_len"), [
-        ("Qwen/Qwen1.5-7B", 16385, 16384),
-        ("Qwen/Qwen1.5-7B", 16383, 16383),
+        (os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 16385, 16384),
+        (os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 16383, 16383),
    ])
 def test_estimate_max_model_len(model_id, max_model_len,
                                want_estimated_max_len):

--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -2,6 +2,7 @@
 from typing import Optional
 from unittest.mock import Mock

+import os
 import pytest
 import torch

@@ -16,12 +17,13 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
+from ...utils import models_path_prefix

 EOS_TOKEN_ID = 50256


 def create_scheduler(
-    model: str = "facebook/opt-125m",
+    model: str = os.path.join(models_path_prefix, "facebook/opt-125m"),
    max_num_seqs: int = 16,
    max_num_batched_tokens: int = 8192,
    enable_prefix_caching: Optional[bool] = None,
@@ -211,7 +213,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],


 def test_schedule_multimodal_requests():
-    scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf")
+    scheduler = create_scheduler(model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"))
    mm_positions = [[PlaceholderRange(offset=i, length=100)]
                    for i in range(10)]
    requests = create_requests(
@@ -243,7 +245,7 @@ def test_schedule_partial_requests():
       there is insufficient encoder budget.
    """
    scheduler = create_scheduler(
-        model="llava-hf/llava-1.5-7b-hf",
+        model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
        max_num_batched_tokens=1024,
    )
    mm_positions = [[PlaceholderRange(offset=100, length=600)]
@@ -303,7 +305,7 @@ def test_schedule_partial_requests():
 def test_no_mm_input_chunking():
    # Disable multimodal input chunking.
    scheduler = create_scheduler(
-        model="llava-hf/llava-1.5-7b-hf",
+        model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
        max_num_batched_tokens=1024,
        disable_chunked_mm_input=True,
        max_model_len=2048,
@@ -347,7 +349,7 @@ def test_no_mm_input_chunking():
    # of a max_num_batched_tokens for the mm input.
    with pytest.raises(ValueError):
        _ = create_scheduler(
-            model="llava-hf/llava-1.5-7b-hf",
+            model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
            max_num_batched_tokens=100,
            disable_chunked_mm_input=True,
        )
@@ -362,7 +364,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):

    """
    scheduler = create_scheduler(
-        model="facebook/opt-125m",
+        model=os.path.join(models_path_prefix, "facebook/opt-125m"),
        max_num_batched_tokens=1024,
        long_prefill_token_threshold=400,
        enable_prefix_caching=enable_prefix_caching,

--- a/tests/v1/core/test_scheduler_e2e.py
+++ b/tests/v1/core/test_scheduler_e2e.py
@@ -4,11 +4,12 @@ import os
 import pytest

 from vllm import LLM
+from ...utils import models_path_prefix

 if os.getenv("VLLM_USE_V1", "0") != "1":
    pytest.skip("Test package requires V1", allow_module_level=True)

-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
 PROMPT = "Hello my name is Robert and I"



--- a/tests/v1/shutdown/__init__.py
+++ b/tests/v1/shutdown/__init__.py