Merge branch 'v0.8.5.post1-dev' into v0.8.5-zero_overhead

bd363067 · lizhigong · 87ef4618 · d36deb1a · bd363067 · bd363067
Commit bd363067 authored Jun 05, 2025 by lizhigong
20 changed files
--- a/tests/v1/shutdown/test_delete.py
+++ b/tests/v1/shutdown/test_delete.py
 # SPDX-License-Identifier: Apache-2.0
 """Test that we handle a startup Error and shutdown."""
+import os
 import pytest
 from tests.utils import wait_for_gpu_memory_to_clear
@@ -11,8 +12,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.sampling_params import RequestOutputKind
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
+from ...utils import models_path_prefix
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
 @pytest.mark.asyncio
@@ -94,4 +96,4 @@ def test_llm_delete(monkeypatch, model: str, tensor_parallel_size: int,
        wait_for_gpu_memory_to_clear(
            devices=list(range(tensor_parallel_size)),
            threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
        )
\ No newline at end of file
--- a/tests/v1/shutdown/test_forward_error.py
+++ b/tests/v1/shutdown/test_forward_error.py
@@ -3,6 +3,7 @@
 import asyncio
+import os
 import pytest
 from tests.utils import wait_for_gpu_memory_to_clear
@@ -14,8 +15,9 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.exceptions import EngineDeadError
+from ...utils import models_path_prefix
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
 def evil_forward(self, *args, **kwargs):
@@ -126,4 +128,4 @@ def test_llm_model_error(monkeypatch, tensor_parallel_size: int,
        wait_for_gpu_memory_to_clear(
            devices=list(range(tensor_parallel_size)),
            threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
        )
\ No newline at end of file
--- a/tests/v1/shutdown/test_processor_error.py
+++ b/tests/v1/shutdown/test_processor_error.py
@@ -3,6 +3,7 @@
 import asyncio
+import os
 import pytest
 from tests.v1.shutdown.utils import SHUTDOWN_TEST_TIMEOUT_SEC
@@ -12,8 +13,9 @@ from vllm.inputs.data import TokensPrompt
 from vllm.sampling_params import RequestOutputKind
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.exceptions import EngineGenerateError
+from ...utils import models_path_prefix
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
 @pytest.mark.asyncio
@@ -66,4 +68,4 @@ async def test_async_llm_processor_error(model: str) -> None:
        generated_tokens.extend(out.outputs[0].token_ids)
    assert len(generated_tokens) == EXPECTED_TOKENS
    async_llm.shutdown()
\ No newline at end of file
--- a/tests/v1/shutdown/test_startup_error.py
+++ b/tests/v1/shutdown/test_startup_error.py
 # SPDX-License-Identifier: Apache-2.0
 """Test that we handle a startup Error and shutdown."""
+import os
 import pytest
 from tests.utils import wait_for_gpu_memory_to_clear
@@ -12,8 +13,9 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.utils import cuda_device_count_stateless
 from vllm.v1.engine.async_llm import AsyncLLM
+from ...utils import models_path_prefix
-MODELS = ["meta-llama/Llama-3.2-1B"]
+MODELS = [os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
 def evil_method(self, *args, **kwargs):
@@ -69,7 +71,7 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
    Test profiling (forward()) and load weights failures.
    TODO(andy) - LLM without multiprocessing.
    """
-    if model != "meta-llama/Llama-3.2-1B":
+    if model != os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"):
        pytest.skip(reason="Only test meta-llama/Llama-3.2-1B")
    if cuda_device_count_stateless() < tensor_parallel_size:
        pytest.skip(reason="Not enough CUDA devices")
@@ -94,4 +96,4 @@ def test_llm_startup_error(monkeypatch, model: str, tensor_parallel_size: int,
        wait_for_gpu_memory_to_clear(
            devices=list(range(tensor_parallel_size)),
            threshold_bytes=SHUTDOWN_TEST_THRESHOLD_BYTES,
        )
\ No newline at end of file
--- a/tests/v1/spec_decode/__init__.py
+++ b/tests/v1/spec_decode/__init__.py
--- a/tests/v1/spec_decode/test_max_len.py
+++ b/tests/v1/spec_decode/test_max_len.py
 # SPDX-License-Identifier: Apache-2.0
 """Test whether spec decoding handles the max model length properly."""
+import os
 import pytest
 from vllm import LLM, SamplingParams
+from ...utils import models_path_prefix
 _PROMPTS = [
    "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1",
@@ -21,7 +23,7 @@ def test_ngram_max_len(
        m.setenv("VLLM_USE_V1", "1")
        llm = LLM(
-            model="facebook/opt-125m",
+            model=os.path.join(models_path_prefix, "facebook/opt-125m"),
            max_model_len=100,
            enforce_eager=True,  # For faster initialization.
            speculative_config={
@@ -44,11 +46,11 @@ def test_eagle_max_len(
        m.setenv("VLLM_USE_V1", "1")
        llm = LLM(
-            model="meta-llama/Meta-Llama-3-8B-Instruct",
+            model=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
            enforce_eager=True,  # For faster initialization.
            speculative_config={
                "method": "eagle",
-                "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
+                "model": os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
                "num_speculative_tokens": num_speculative_tokens,
            },
            max_model_len=100,

--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
 # SPDX-License-Identifier: Apache-2.0
+import os
 import numpy as np
 from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
 from vllm.v1.spec_decode.ngram_proposer import (NgramProposer,
                                                _find_subarray_kmp,
                                                _kmp_lps_array)
+from ...utils import models_path_prefix
 def test_kmp_lps_array():
@@ -43,10 +45,10 @@ def test_ngram_proposer():
    def ngram_proposer(min_n: int, max_n: int, k: int) -> NgramProposer:
        # Dummy model config. Just to set max_model_len.
-        model_config = ModelConfig(model="facebook/opt-125m",
+        model_config = ModelConfig(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
                                   task="generate",
                                   max_model_len=100,
-                                   tokenizer="facebook/opt-125m",
+                                   tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
                                   tokenizer_mode="auto",
                                   dtype="auto",
                                   seed=None,
@@ -86,4 +88,4 @@ def test_ngram_proposer():
    result = ngram_proposer(
        2, 4,
        2).propose(context_token_ids=np.array([3, 4, 5, 2, 3, 4, 1, 2, 3, 4]))
    assert np.array_equal(result, np.array([1, 2]))  # Not [5, 2]
\ No newline at end of file
--- a/tests/v1/test_async_llm_dp.py
+++ b/tests/v1/test_async_llm_dp.py
@@ -14,9 +14,10 @@ from vllm.platforms import current_platform
 from vllm.sampling_params import RequestOutputKind
 from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core_client import DPAsyncMPClient
+from ..utils import models_path_prefix
 engine_args = AsyncEngineArgs(
-    model="ibm-research/PowerMoE-3b",
+    model=os.path.join(models_path_prefix, "ibm-research/PowerMoE-3b"),
    enforce_eager=True,
    disable_log_requests=True,
    tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
@@ -106,4 +107,4 @@ async def test_load(output_kind: RequestOutputKind):
            await asyncio.sleep(0.5)
        assert not core_client.engines_running
        assert not core_client.reqs_in_flight
\ No newline at end of file
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -7,16 +7,17 @@ import vllm.envs as envs
 from vllm import LLM
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from ..utils import models_path_prefix
 UNSUPPORTED_MODELS_V1 = [
-    "openai/whisper-large-v3",  # transcription
+    os.path.join(models_path_prefix, "openai/whisper-large-v3"),  # transcription
-    "facebook/bart-large-cnn",  # encoder decoder
+    os.path.join(models_path_prefix, "facebook/bart-large-cnn"),  # encoder decoder
-    "mistralai/Mamba-Codestral-7B-v0.1",  # mamba
+    os.path.join(models_path_prefix, "mistralai/Mamba-Codestral-7B-v0.1"),  # mamba
-    "ibm-ai-platform/Bamba-9B",  # hybrid
+    os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B"),  # hybrid
-    "BAAI/bge-m3",  # embedding
+    os.path.join(models_path_prefix, "BAAI/bge-m3"),  # embedding
 ]
-MODEL = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
 @pytest.mark.parametrize("model", UNSUPPORTED_MODELS_V1)
@@ -160,4 +161,4 @@ def test_reject_using_constructor_directly(monkeypatch):
                           AsyncLLMEngine._get_executor_cls(vllm_config),
                           log_stats=True)
        m.delenv("VLLM_USE_V1")
\ No newline at end of file
--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
--- a/tests/v1/tpu/test_mha_attn.py
+++ b/tests/v1/tpu/test_mha_attn.py
--- a/tests/v1/tpu/test_multimodal.py
+++ b/tests/v1/tpu/test_multimodal.py
--- a/tests/v1/tpu/test_pallas.py
+++ b/tests/v1/tpu/test_pallas.py
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
--- a/tests/v1/tpu/test_sampler.py
+++ b/tests/v1/tpu/test_sampler.py
--- a/tests/v1/tpu/test_topk_topp_sampler.py
+++ b/tests/v1/tpu/test_topk_topp_sampler.py
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
 # SPDX-License-Identifier: Apache-2.0
+import os
 import pytest
 from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
@@ -7,6 +8,7 @@ from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                       SchedulerOutput)
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
+from ...utils import models_path_prefix
 @pytest.fixture
@@ -17,9 +19,9 @@ def model_runner():
        max_model_len=512,
    )
    model_config = ModelConfig(
-        model="facebook/opt-125m",
+        model=os.path.join(models_path_prefix, "facebook/opt-125m"),
        task="generate",
-        tokenizer="facebook/opt-125m",
+        tokenizer=os.path.join(models_path_prefix, "facebook/opt-125m"),
        tokenizer_mode="auto",
        trust_remote_code=True,
        dtype="float16",
@@ -276,4 +278,4 @@ def test_update_states_request_unscheduled(model_runner):
    assert _is_req_scheduled(model_runner, req_ids[0])
    assert _is_req_added(model_runner, req_ids[1])
    assert not _is_req_scheduled(model_runner, req_ids[1])
\ No newline at end of file
--- a/tests/weight_loading/__init__.py
+++ b/tests/weight_loading/__init__.py
--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py