[tests] update v1, sampler, and spec-decode tests to resolve model paths via models_path_prefix

2e3bfb1e · zhuwenwen · 87d06573 · 2e3bfb1e · 2e3bfb1e · 2e3bfb1e
Commit 2e3bfb1e authored Jun 04, 2025 by zhuwenwen
17 changed files
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -6,10 +6,10 @@ Run `pytest tests/samplers/test_beam_search.py`.

 import pytest
 import os
-from ..utils import models_path_prefix
 from transformers import AutoModelForSeq2SeqLM

 from vllm.assets.audio import AudioAsset
+from ..utils import models_path_prefix


 @pytest.fixture(autouse=True)
@@ -83,7 +83,7 @@ def test_beam_search_passes_multimodal_data(
    # correctly. As such, we just need to check one extra modality to make
    # sure things pass through properly.
    audios = [AudioAsset("mary_had_lamb").audio_and_sample_rate]
-    model = "Qwen/Qwen2-Audio-7B-Instruct"
+    model = os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct")
    audio_seq = "<|audio_bos|><|AUDIO|><|audio_eos|>"
    prompts = [
        f"<|im_start|>user\n{audio_seq}Can you transcribe this?<|im_end|>\n<|im_start|>assistant\n"  #noqa: E501
@@ -140,4 +140,4 @@ def test_beam_search_passes_multimodal_data(
                assert filtered_hf_output_ids[-1] == eos_token_id
                filtered_hf_output_ids = filtered_hf_output_ids[:-1]

-            assert filtered_hf_output_ids == filtered_vllm_output_ids
+            assert filtered_hf_output_ids == filtered_vllm_output_ids
\ No newline at end of file
--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
@@ -69,7 +69,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
        # Explicitly specify draft model quantization
        {
            "speculative_config": {
-                "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
+                "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
                "num_speculative_tokens": 5,
                "quantization": "gptq",
            },
@@ -77,7 +77,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
        # Explicitly specify GPTQ-based draft model to use marlin quantization
        {
            "speculative_config": {
-                "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
+                "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
                "num_speculative_tokens": 5,
                "quantization": "marlin",
            },
@@ -85,7 +85,7 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
        # Not explicitly specify draft model quantization
        {
            "speculative_config": {
-                "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit",
+                "model": os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
                "num_speculative_tokens": 5,
                "quantization": None,
            },
@@ -124,7 +124,7 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
 @pytest.mark.parametrize("test_llm_kwargs", [{
    "speculative_config": {
-        "model": "JackFram/llama-68m",
+        "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
        "num_speculative_tokens": 3,
        "disable_mqa_scorer": True,
    },
@@ -151,4 +151,4 @@ def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
                                  batch_size,
                                  max_output_len=output_len,
                                  seed=seed,
-                                  temperature=0.0)
+                                  temperature=0.0)
\ No newline at end of file
--- a/tests/v1/core/__init__.py
+++ b/tests/v1/core/__init__.py
--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
 # SPDX-License-Identifier: Apache-2.0

+import os
 import pytest
 import torch

@@ -22,6 +23,7 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                        KVCacheGroupSpec, KVCacheTensor)
 from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request
+from ...utils import models_path_prefix

 # yapf: enable

@@ -432,8 +434,8 @@ def test_unify_kv_cache_configs():

 @pytest.mark.parametrize(
    ("model_id", "max_model_len", "want_estimated_max_len"), [
-        ("Qwen/Qwen1.5-7B", 16385, 16384),
-        ("Qwen/Qwen1.5-7B", 16383, 16383),
+        (os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 16385, 16384),
+        (os.path.join(models_path_prefix, "Qwen/Qwen1.5-7B"), 16383, 16383),
    ])
 def test_estimate_max_model_len(model_id, max_model_len,
                                want_estimated_max_len):

--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -2,6 +2,7 @@
 from typing import Optional
 from unittest.mock import Mock

+import os
 import pytest
 import torch

@@ -16,12 +17,13 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
+from ...utils import models_path_prefix

 EOS_TOKEN_ID = 50256


 def create_scheduler(
-    model: str = "facebook/opt-125m",
+    model: str = os.path.join(models_path_prefix, "facebook/opt-125m"),
    max_num_seqs: int = 16,
    max_num_batched_tokens: int = 8192,
    enable_prefix_caching: Optional[bool] = None,
@@ -211,7 +213,7 @@ def test_schedule(enable_prefix_caching: Optional[bool],


 def test_schedule_multimodal_requests():
-    scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf")
+    scheduler = create_scheduler(model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"))
    mm_positions = [[PlaceholderRange(offset=i, length=100)]
                    for i in range(10)]
    requests = create_requests(
@@ -243,7 +245,7 @@ def test_schedule_partial_requests():
       there is insufficient encoder budget.
    """
    scheduler = create_scheduler(
-        model="llava-hf/llava-1.5-7b-hf",
+        model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
        max_num_batched_tokens=1024,
    )
    mm_positions = [[PlaceholderRange(offset=100, length=600)]
@@ -303,7 +305,7 @@ def test_schedule_partial_requests():
 def test_no_mm_input_chunking():
    # Disable multimodal input chunking.
    scheduler = create_scheduler(
-        model="llava-hf/llava-1.5-7b-hf",
+        model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
        max_num_batched_tokens=1024,
        disable_chunked_mm_input=True,
        max_model_len=2048,
@@ -347,7 +349,7 @@ def test_no_mm_input_chunking():
    # of a max_num_batched_tokens for the mm input.
    with pytest.raises(ValueError):
        _ = create_scheduler(
-            model="llava-hf/llava-1.5-7b-hf",
+            model=os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
            max_num_batched_tokens=100,
            disable_chunked_mm_input=True,
        )
@@ -362,7 +364,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):

    """
    scheduler = create_scheduler(
-        model="facebook/opt-125m",
+        model=os.path.join(models_path_prefix, "facebook/opt-125m"),
        max_num_batched_tokens=1024,
        long_prefill_token_threshold=400,
        enable_prefix_caching=enable_prefix_caching,
@@ -1241,4 +1243,4 @@ def test_memory_leak():
        scheduler.update_from_output(scheduler_output, model_runner_output)

    # Confirm no memory leak.
-    assert_scheduler_empty(scheduler)
+    assert_scheduler_empty(scheduler)
\ No newline at end of file
--- a/tests/v1/core/test_scheduler_e2e.py
+++ b/tests/v1/core/test_scheduler_e2e.py
@@ -4,11 +4,12 @@ import os
 import pytest

 from vllm import LLM
+from ...utils import models_path_prefix

 if os.getenv("VLLM_USE_V1", "0") != "1":
    pytest.skip("Test package requires V1", allow_module_level=True)

-MODEL = "meta-llama/Llama-3.2-1B"
+MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
 PROMPT = "Hello my name is Robert and I"


@@ -26,4 +27,4 @@ def test_concurrent_partial_prefill(model):
    outputs = model.generate([PROMPT] * 3)
    assert len(outputs) == 3
    for output in outputs:
-        assert len(output.outputs) == 1
+        assert len(output.outputs) == 1
\ No newline at end of file
--- a/tests/v1/shutdown/__init__.py
+++ b/tests/v1/shutdown/__init__.py
--- a/tests/v1/shutdown/test_delete.py
+++ b/tests/v1/shutdown/test_delete.py
 # SPDX-License-Identifier: Apache-2.0
 """Test that we handle a startup Error and shutdown."""

+import os
 import pytest

 from tests.utils import wait_for_gpu_memory_to_clear
@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.sampling_params import RequestOutputKind
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
+from ...utils import models_path_prefix

-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]


 @pytest.mark.asyncio
@@ -94,4 +96,4 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int,
        wait_for_gpu_memory_to_clear(
            devices=list(range(tensor_parallel_size)),
            threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
-        )
+        )
\ No newline at end of file
--- a/tests/v1/shutdown/test_forward_error.py
+++ b/tests/v1/shutdown/test_forward_error.py
@@ -3,6 +3,7 @@

 import asyncio

+import os
 import pytest

 from tests.utils import wait_for_gpu_memory_to_clear
@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.exceptions import EngineDeadError
+from ...utils import models_path_prefix

-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]


 def evil_forward(self, *args, **kwargs):
@@ -126,4 +128,4 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size: int,
        wait_for_gpu_memory_to_clear(
            devices=list(range(tensor_parallel_size)),
            threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
-        )
+        )
\ No newline at end of file
--- a/tests/v1/shutdown/test_processor_error.py
+++ b/tests/v1/shutdown/test_processor_error.py
@@ -3,6 +3,7 @@

 import asyncio

+import os
 import pytest

 from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC
@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt
 from vllm.sampling_params import RequestOutputKind
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.exceptions import EngineGenerateError
+from ...utils import models_path_prefix

-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]


 @pytest.mark.asyncio
@@ -66,4 +68,4 @@ async def test_async_llm_processor_error(model: str) -> None:
        generated_tokens.extend(out.outputs[0].token_ids)
    assert len(generated_tokens) == EXPECTED_TOKENS

-    async_llm.shutdown()
+    async_llm.shutdown()
\ No newline at end of file
--- a/tests/v1/shutdown/test_startup_error.py
+++ b/tests/v1/shutdown/test_startup_error.py
 # SPDX-License-Identifier: Apache-2.0
 """Test that we handle a startup Error and shutdown."""

+import os
 import pytest

 from tests.utils import wait_for_gpu_memory_to_clear
@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
+from ...utils import models_path_prefix

-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]


 def evil_method(self, *args, **kwargs):
@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
    Test profiling (forward()) and load weights failures.
    TODO(andy) - LLM without multiprocessing.
    """
-    if model != "meta-llama/Llama-3.2-1B":
+    if model != os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"):
        pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
    if cuda_device_count_stateless() < tensor_parallel_size:
        pytest.skip(reason="Not enough CUDA devices")
@@ -94,4 +96,4 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
        wait_for_gpu_memory_to_clear(
            devices=list(range(tensor_parallel_size)),
            threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
-        )
+        )
\ No newline at end of file
--- a/tests/v1/spec_decode/__init__.py
+++ b/tests/v1/spec_decode/__init__.py
--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
 # SPDX-License-Identifier: Apache-2.0
 """Test whether spec decoding handles the max model length properly."""

+import os
 import pytest

 from vllm import LLM, SamplingParams
+from ...utils import models_path_prefix

 _PROMPTS = [
    "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1",
@@ -21,7 +23,7 @@ def test_ngram_max_len(
        m.setenv("VLLM_USE_V1", "1")

        llm = LLM(
-            model="facebook/opt-125m",
+            model=os.path.join(models_path_prefix, "facebook/opt-125m"),
            max_model_len=100,
            enforce_eager=True,  # For faster initialization.
            speculative_config={
@@ -44,11 +46,11 @@ def test_eagle_max_len(
        m.setenv("VLLM_USE_V1", "1")

        llm = LLM(
-            model="meta-llama/Meta-Llama-3-8B-Instruct",
+            model=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
            enforce_eager=True,  # For faster initialization.
            speculative_config={
                "method": "eagle",
-                "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
+                "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
                "num_speculative_tokens": num_speculative_tokens,
            },
            max_model_len=100,

--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
 # SPDX-License-Identifier: Apache-2.0

+import os
 import numpy as np

 from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
 from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
                                                _find_subarray_kmp,
                                                _kmp_lps_array)
+from ...utils import models_path_prefix


 def test_kmp_lps_array():
@@ -43,10 +45,10 @@ def test_ngram_proposer():

    def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
        # Dummy model config. Just to set max_model_len.
-        model_config = ModelConfig(model="facebook/opt-125m",
+        model_config = ModelConfig(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
                                   task="generate",
                                   max_model_len=100,
-                                   tokenizer="facebook/opt-125m",
+                                   tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
                                   tokenizer_mode="auto",
                                   dtype="auto",
                                   seed=None,
@@ -86,4 +88,4 @@ def test_ngram_proposer():
    result = ngram_proposer(
        2, 4,
        2).propose(context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]))
-    assert np.array_equal(result, np.array([1, 2]))  # Not [5, 2]
+    assert np.array_equal(result, np.array([1, 2]))  # Not [5, 2]
\ No newline at end of file
--- a/tests/v1/test_async_llm_dp.py
+++ b/tests/v1/test_async_llm_dp.py
@@ -14,9 +14,10 @@ from vllm.platforms import current_platform
 from vllm.sampling_params import RequestOutputKind
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core_client import DPAsyncMPClient
+from ..utils import models_path_prefix

 engine_args = AsyncEngineArgs(
-    model="ibm-research/PowerMoE-3b",
+    model=os.path.join(models_path_prefix, "ibm-research/PowerMoE-3b"),
    enforce_eager=True,
    disable_log_requests=True,
    tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
@@ -106,4 +107,4 @@ async def test_load(output_kind: RequestOutputKind):
            await asyncio.sleep(0.5)

        assert not core_client.engines_running
-        assert not core_client.reqs_in_flight
+        assert not core_client.reqs_in_flight
\ No newline at end of file
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -7,16 +7,17 @@ import vllm.envs as envs
 from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from ..utils import models_path_prefix

 UNSUPPORTED_MODELS_V1 = [
-    "openai/whisper-large-v3",  # transcription
-    "facebook/bart-large-cnn",  # encoder decoder
-    "mistralai/Mamba-Codestral-7B-v0.1",  # mamba
-    "ibm-ai-platform/Bamba-9B",  # hybrid
-    "BAAI/bge-m3",  # embedding
+    os.path.join(models_path_prefix, "openai/whisper-large-v3"),  # transcription
+    os.path.join(models_path_prefix, "facebook/bart-large-cnn"),  # encoder decoder
+    os.path.join(models_path_prefix, "mistralai/Mamba-Codestral-7B-v0.1"),  # mamba
+    os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B"),  # hybrid
+    os.path.join(models_path_prefix, "BAAI/bge-m3"),  # embedding
 ]

-MODEL = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")


 @pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
@@ -160,4 +161,4 @@ def test_reject_using_constructor_directly(monkeypatch):
                           AsyncLLMEngine._get_executor_cls(vllm_config),
                           log_stats=True)

-        m.delenv("VLLM_USE_V1")
+        m.delenv("VLLM_USE_V1")
\ No newline at end of file
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
 # SPDX-License-Identifier: Apache-2.0
+import os
 import pytest

 from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                       SchedulerOutput)
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
+from ...utils import models_path_prefix


 @pytest.fixture
@@ -17,9 +19,9 @@ def model_runner():
        max_model_len=512,
    )
    model_config = ModelConfig(
-        model="facebook/opt-125m",
+        model=os.path.join(models_path_prefix, "facebook/opt-125m"),
        task="generate",
-        tokenizer="facebook/opt-125m",
+        tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
        tokenizer_mode="auto",
        trust_remote_code=True,
        dtype="float16",
@@ -276,4 +278,4 @@ def test_update_states_request_unscheduled(model_runner):
    assert _is_req_scheduled(model_runner, req_ids[0])

    assert _is_req_added(model_runner, req_ids[1])
-    assert not _is_req_scheduled(model_runner, req_ids[1])
+    assert not _is_req_scheduled(model_runner, req_ids[1])
\ No newline at end of file