Merge tag 'v0.9.1' into v0.9.1-ori

cc7f22a8 · zhuwenwen · b9ea0c09 · b6553be1 · cc7f22a8 · cc7f22a8
Commit cc7f22a8 authored Jun 11, 2025 by zhuwenwen
20 changed files
--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -2,7 +2,7 @@
 -r common.txt

 ray>=2.9
-cmake>=3.26
+cmake>=3.26.1
 packaging>=24.2
 setuptools-scm>=8
 setuptools>=77.0.3,<80.0.0

--- a/setup.py
+++ b/setup.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import ctypes
 import importlib.util
 import json
 import logging
 import os
+import re
 import subprocess
 import sys
 from pathlib import Path
 from shutil import which

-import regex as re
 import torch
 from packaging.version import Version, parse
 from setuptools import Extension, setup
@@ -251,11 +252,8 @@ class cmake_build_ext(build_ext):

            # CMake appends the extension prefix to the install path,
            # and outdir already contains that prefix, so we need to remove it.
-            # We assume only the final component of extension prefix is added by
-            # CMake, this is currently true for current extensions but may not
-            # always be the case.
            prefix = outdir
-            if '.' in ext.name:
+            for _ in range(ext.name.count('.')):
                prefix = prefix.parent

            # prefix here should actually be the same for all components
@@ -690,6 +688,7 @@ setup(
    ext_modules=ext_modules,
    install_requires=get_requirements(),
    extras_require={
+        "bench": ["pandas", "datasets"],
        "tensorizer": ["tensorizer>=2.9.0"],
        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
        "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],

--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """vllm.entrypoints.api_server with some extra logging for testing."""
 from collections.abc import Iterable
 from typing import Any
@@ -7,6 +8,7 @@ import uvicorn
 from fastapi.responses import JSONResponse, Response

 import vllm.entrypoints.api_server
+import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.utils import FlexibleArgumentParser
@@ -45,9 +47,8 @@ if __name__ == "__main__":
    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngineWithStats.from_engine_args(engine_args)
    vllm.entrypoints.api_server.engine = engine
-    uvicorn.run(
-        app,
-        host=args.host,
-        port=args.port,
-        log_level="debug",
-        timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE)
+    uvicorn.run(app,
+                host=args.host,
+                port=args.port,
+                log_level="debug",
+                timeout_keep_alive=envs.VLLM_HTTP_TIMEOUT_KEEP_ALIVE)
--- a/tests/async_engine/conftest.py
+++ b/tests/async_engine/conftest.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest



--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os
 import subprocess

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import asyncio
 import os
@@ -383,3 +384,25 @@ async def test_delayed_generator(async_engine, stop):
    assert final_output is not None
    assert len(final_output.outputs[0].token_ids) == 10
    assert final_output.finished
+
+
+@pytest.mark.asyncio(scope="module")
+async def test_invalid_argument(async_engine):
+    scheduler_config = await async_engine.get_scheduler_config()
+
+    if scheduler_config.num_scheduler_steps != 1:
+        pytest.skip("no need to test this one with multistep")
+
+    sampling_params = SamplingParams(
+        temperature=0,
+        min_tokens=10,
+        max_tokens=10,
+    )
+
+    # Targeting specific DP rank only supported in v1 multi-instance DP
+    with pytest.raises(ValueError):
+        async for _ in async_engine.generate("test",
+                                             sampling_params,
+                                             request_id=uid(),
+                                             data_parallel_rank=0):
+            pass
--- a/tests/async_engine/test_request_tracker.py
+++ b/tests/async_engine/test_request_tracker.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest


--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the short outputs of HF and vLLM when using greedy sampling.

 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
@@ -60,7 +61,6 @@ def _fix_prompt_embed_outputs(

 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
-@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
@@ -69,7 +69,6 @@ def test_models(
    hf_runner,
    model: str,
    backend: str,
-    dtype: str,
    max_tokens: int,
    enforce_eager: bool,
    enable_prompt_embeds: bool,
@@ -97,7 +96,7 @@ def test_models(
            str(i) for i in range(1024)) + " are:"
        example_prompts = [prompt]

-        with hf_runner(model, dtype=dtype) as hf_model:
+        with hf_runner(model) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
            if enable_prompt_embeds:
                with torch.no_grad():
@@ -106,7 +105,6 @@ def test_models(

        with VllmRunner(model,
                        max_model_len=8192,
-                        dtype=dtype,
                        enforce_eager=enforce_eager,
                        enable_prompt_embeds=enable_prompt_embeds,
                        gpu_memory_utilization=0.7) as vllm_model:
@@ -130,15 +128,21 @@ def test_models(
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
    "model, distributed_executor_backend, attention_backend, "
-    "test_suite", [
-        ("distilbert/distilgpt2", "ray", "", "L4"),
-        ("distilbert/distilgpt2", "mp", "", "L4"),
-        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
-        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
-        ("distilbert/distilgpt2", "ray", "", "A100"),
-        ("distilbert/distilgpt2", "mp", "", "A100"),
-        ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
-        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+    "test_suite, extra_env", [
+        ("distilbert/distilgpt2", "ray", "", "L4", {}),
+        ("distilbert/distilgpt2", "mp", "", "L4", {}),
+        ("distilbert/distilgpt2", "ray", "", "L4", {
+            "VLLM_SLEEP_WHEN_IDLE": "1"
+        }),
+        ("distilbert/distilgpt2", "mp", "", "L4", {
+            "VLLM_SLEEP_WHEN_IDLE": "1"
+        }),
+        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
+        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
+        ("distilbert/distilgpt2", "ray", "", "A100", {}),
+        ("distilbert/distilgpt2", "mp", "", "A100", {}),
+        ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100", {}),
+        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100", {}),
    ])
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models_distributed(
@@ -150,6 +154,7 @@ def test_models_distributed(
    distributed_executor_backend: str,
    attention_backend: str,
    test_suite: str,
+    extra_env: dict[str, str],
    enable_prompt_embeds: bool,
 ) -> None:

@@ -175,6 +180,9 @@ def test_models_distributed(
                attention_backend,
            )

+        for k, v in extra_env.items():
+            monkeypatch_context.setenv(k, v)
+
        dtype = "half"
        max_tokens = 5


--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the outputs of HF and vLLM when using greedy sampling.

 It tests chunked prefill. Chunked prefill can be enabled by

--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from ..utils import compare_two_settings


--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
 import torch

--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Compare the short outputs of HF and vLLM when using greedy sampling.

 VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test.

--- a/tests/benchmarks/test_latency_cli.py
+++ b/tests/benchmarks/test_latency_cli.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess

 import pytest

--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess

 import pytest

--- a/tests/benchmarks/test_throughput_cli.py
+++ b/tests/benchmarks/test_throughput_cli.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess

 import pytest

--- a/tests/build_cython.py
+++ b/tests/build_cython.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import Cython.Compiler.Options
 from Cython.Build import cythonize
 from setuptools import setup

--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from copy import deepcopy
 from typing import Callable, Union

--- a/tests/compile/conftest.py
+++ b/tests/compile/conftest.py
-# SPDX-License-Identifier: Apache-2.0
-import pytest
-
-
-# TEST V1: this should be removed. Right now V1 overrides
-# all the torch compile logic. We should re-enable this
-# as we add torch compile support back to V1.
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    Since this module is V0 only, set VLLM_USE_V1=0 for
-    all tests in the module.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
--- a/tests/compile/piecewise/test_full_cudagraph.py
+++ b/tests/compile/piecewise/test_full_cudagraph.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import contextlib
 import os

@@ -6,6 +7,7 @@ import pytest

 from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig
+from vllm.platforms import current_platform

 MODEL = "Qwen/Qwen2-1.5B-Instruct"

@@ -36,7 +38,7 @@ def full_cudagraph_llm():
            "VLLM_FLASH_ATTN_VERSION": "3"
    }):
        return LLM(model=MODEL,
-                   gpu_memory_utilization=0.2,
+                   gpu_memory_utilization=0.3,
                   compilation_config=CompilationConfig(full_cuda_graph=True))


@@ -47,7 +49,7 @@ def piecewise_llm():
            "VLLM_FLASH_ATTN_VERSION": "3"
    }):
        return LLM(model=MODEL,
-                   gpu_memory_utilization=0.5,
+                   gpu_memory_utilization=0.6,
                   compilation_config=CompilationConfig())


@@ -60,6 +62,8 @@ def generate_text(llm: LLM, batch_size: int, max_tokens: int):
    return llm.generate(prompts, sampling_params)


+@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0),
+                    reason="Only Hopper GPUs support FlashAttention 3")
 @pytest.mark.parametrize(("batch_size", "max_tokens"), [(1, 10), (7, 10),
                                                        (16, 10), (25, 10),
                                                        (32, 10), (45, 10),

--- a/tests/compile/piecewise/test_simple.py
+++ b/tests/compile/piecewise/test_simple.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Test the piecewise compilation with a simple model so that we
 can exactly calculate the expected output and side effects.
@@ -12,6 +13,7 @@ from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
                         set_current_vllm_config)
+from vllm.envs import VLLM_USE_V1
 from vllm.utils import direct_register_custom_op

 global_counter = 0
@@ -74,11 +76,13 @@ class SillyModel(nn.Module):
        return x


-def test_simple_piecewise_compile():
+def _test_simple_piecewise_compile(*, use_inductor):
+    assert VLLM_USE_V1

    vllm_config = VllmConfig(compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        use_cudagraph=True,
+        use_inductor=use_inductor,
        splitting_ops=["silly.attention"],
        cudagraph_copy_inputs=True,
        cudagraph_capture_sizes=[1, 2],
@@ -93,7 +97,7 @@ def test_simple_piecewise_compile():
            num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
            num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
            num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
-            num_cudagraph_caputured=
+            num_cudagraph_captured=
            6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
    ):

@@ -108,3 +112,11 @@ def test_simple_piecewise_compile():
        output = model(input)
        assert global_counter == 2
        assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
+
+
+def test_simple_piecewise_compile_inductor():
+    _test_simple_piecewise_compile(use_inductor=True)
+
+
+def test_simple_piecewise_compile_no_inductor():
+    _test_simple_piecewise_compile(use_inductor=False)