Merge branch 'v0.8.5.post1-dev' into v0.8.5-zero_overhead

bd363067 · lizhigong · 87ef4618 · d36deb1a · bd363067 · bd363067
Commit bd363067 authored Jun 05, 2025 by lizhigong
20 changed files
--- a/tests/v1/shutdown/test_delete.py
+++ b/tests/v1/shutdown/test_delete.py
 # SPDX-License-Identifier: Apache-2.0
 """Test that we handle a startup Error and shutdown."""

+import os
 import pytest

 from tests.utils import wait_for_gpu_memory_to_clear
@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.sampling_params import RequestOutputKind
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
+from ...utils import models_path_prefix

-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]


 @pytest.mark.asyncio

--- a/tests/v1/shutdown/test_forward_error.py
+++ b/tests/v1/shutdown/test_forward_error.py
@@ -3,6 +3,7 @@

 import asyncio

+import os
 import pytest

 from tests.utils import wait_for_gpu_memory_to_clear
@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.exceptions import EngineDeadError
+from ...utils import models_path_prefix

-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]


 def evil_forward(self, *args, **kwargs):

--- a/tests/v1/shutdown/test_processor_error.py
+++ b/tests/v1/shutdown/test_processor_error.py
@@ -3,6 +3,7 @@

 import asyncio

+import os
 import pytest

 from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC
@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt
 from vllm.sampling_params import RequestOutputKind
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.exceptions import EngineGenerateError
+from ...utils import models_path_prefix

-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]


 @pytest.mark.asyncio

--- a/tests/v1/shutdown/test_startup_error.py
+++ b/tests/v1/shutdown/test_startup_error.py
 # SPDX-License-Identifier: Apache-2.0
 """Test that we handle a startup Error and shutdown."""

+import os
 import pytest

 from tests.utils import wait_for_gpu_memory_to_clear
@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
+from ...utils import models_path_prefix

-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]


 def evil_method(self, *args, **kwargs):
@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
    Test profiling (forward()) and load weights failures.
    TODO(andy) - LLM without multiprocessing.
    """
-    if model != "meta-llama/Llama-3.2-1B":
+    if model != os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"):
        pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
    if cuda_device_count_stateless() < tensor_parallel_size:
        pytest.skip(reason="Not enough CUDA devices")

--- a/tests/v1/spec_decode/__init__.py
+++ b/tests/v1/spec_decode/__init__.py
--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
 # SPDX-License-Identifier: Apache-2.0
 """Test whether spec decoding handles the max model length properly."""

+import os
 import pytest

 from vllm import LLM, SamplingParams
+from ...utils import models_path_prefix

 _PROMPTS = [
    "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1",
@@ -21,7 +23,7 @@ def test_ngram_max_len(
        m.setenv("VLLM_USE_V1", "1")

        llm = LLM(
-            model="facebook/opt-125m",
+            model=os.path.join(models_path_prefix, "facebook/opt-125m"),
            max_model_len=100,
            enforce_eager=True,  # For faster initialization.
            speculative_config={
@@ -44,11 +46,11 @@ def test_eagle_max_len(
        m.setenv("VLLM_USE_V1", "1")

        llm = LLM(
-            model="meta-llama/Meta-Llama-3-8B-Instruct",
+            model=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
            enforce_eager=True,  # For faster initialization.
            speculative_config={
                "method": "eagle",
-                "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
+                "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
                "num_speculative_tokens": num_speculative_tokens,
            },
            max_model_len=100,

--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
 # SPDX-License-Identifier: Apache-2.0

+import os
 import numpy as np

 from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
 from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
                                                _find_subarray_kmp,
                                                _kmp_lps_array)
+from ...utils import models_path_prefix


 def test_kmp_lps_array():
@@ -43,10 +45,10 @@ def test_ngram_proposer():

    def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
        # Dummy model config. Just to set max_model_len.
-        model_config = ModelConfig(model="facebook/opt-125m",
+        model_config = ModelConfig(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
                                   task="generate",
                                   max_model_len=100,
-                                   tokenizer="facebook/opt-125m",
+                                   tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
                                   tokenizer_mode="auto",
                                   dtype="auto",
                                   seed=None,

--- a/tests/v1/test_async_llm_dp.py
+++ b/tests/v1/test_async_llm_dp.py
@@ -14,9 +14,10 @@ from vllm.platforms import current_platform
 from vllm.sampling_params import RequestOutputKind
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core_client import DPAsyncMPClient
+from ..utils import models_path_prefix

 engine_args = AsyncEngineArgs(
-    model="ibm-research/PowerMoE-3b",
+    model=os.path.join(models_path_prefix, "ibm-research/PowerMoE-3b"),
    enforce_eager=True,
    disable_log_requests=True,
    tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),

--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -7,16 +7,17 @@ import vllm.envs as envs
 from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from ..utils import models_path_prefix

 UNSUPPORTED_MODELS_V1 = [
-    "openai/whisper-large-v3",  # transcription
-    "facebook/bart-large-cnn",  # encoder decoder
-    "mistralai/Mamba-Codestral-7B-v0.1",  # mamba
-    "ibm-ai-platform/Bamba-9B",  # hybrid
-    "BAAI/bge-m3",  # embedding
+    os.path.join(models_path_prefix, "openai/whisper-large-v3"),  # transcription
+    os.path.join(models_path_prefix, "facebook/bart-large-cnn"),  # encoder decoder
+    os.path.join(models_path_prefix, "mistralai/Mamba-Codestral-7B-v0.1"),  # mamba
+    os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B"),  # hybrid
+    os.path.join(models_path_prefix, "BAAI/bge-m3"),  # embedding
 ]

-MODEL = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")


 @pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)

--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
--- a/tests/v1/tpu/test_mha_attn.py
+++ b/tests/v1/tpu/test_mha_attn.py
--- a/tests/v1/tpu/test_multimodal.py
+++ b/tests/v1/tpu/test_multimodal.py
--- a/tests/v1/tpu/test_pallas.py
+++ b/tests/v1/tpu/test_pallas.py
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
--- a/tests/v1/tpu/test_sampler.py
+++ b/tests/v1/tpu/test_sampler.py
--- a/tests/v1/tpu/test_topk_topp_sampler.py
+++ b/tests/v1/tpu/test_topk_topp_sampler.py
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
 # SPDX-License-Identifier: Apache-2.0
+import os
 import pytest

 from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                       SchedulerOutput)
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
+from ...utils import models_path_prefix


 @pytest.fixture
@@ -17,9 +19,9 @@ def model_runner():
        max_model_len=512,
    )
    model_config = ModelConfig(
-        model="facebook/opt-125m",
+        model=os.path.join(models_path_prefix, "facebook/opt-125m"),
        task="generate",
-        tokenizer="facebook/opt-125m",
+        tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
        tokenizer_mode="auto",
        trust_remote_code=True,
        dtype="float16",

--- a/tests/weight_loading/__init__.py
+++ b/tests/weight_loading/__init__.py
--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py