Merge tag 'v0.14.0rc0' into v0.14.0rc0-ori

a810671a · zhuwenwen · 86b5aefe · 6a09612b · a810671a · a810671a
Commit a810671a authored Jan 08, 2026 by zhuwenwen
20 changed files
--- a/docs/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -47,6 +47,8 @@ We currently support the following OpenAI APIs:
 - [Completions API](#completions-api) (`/v1/completions`)
    - Only applicable to [text generation models](../models/generative_models.md).
    - *Note: `suffix` parameter is not supported.*
+- [Responses API](#responses-api) (`/v1/responses`)
+    - Only applicable to [text generation models](../models/generative_models.md).
 - [Chat Completions API](#chat-api) (`/v1/chat/completions`)
    - Only applicable to [text generation models](../models/generative_models.md) with a [chat template](../serving/openai_compatible_server.md#chat-template).
    - *Note: `user` parameter is ignored.*
@@ -229,6 +231,31 @@ The following extra parameters are supported:
    --8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
    ```
+### Responses API
+Our Responses API is compatible with [OpenAI's Responses API](https://platform.openai.com/docs/api-reference/responses);
+you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
+Code example: [examples/online_serving/openai_responses_client_with_tools.py](../../examples/online_serving/openai_responses_client_with_tools.py)
+#### Extra parameters
+The following extra parameters in the request object are supported:
+??? code
+    ```python
+    --8<-- "vllm/entrypoints/openai/protocol.py:responses-extra-params"
+    ```
+The following extra parameters in the response object are supported:
+??? code
+    ```python
+    --8<-- "vllm/entrypoints/openai/protocol.py:responses-response-extra-params"
+    ```
 ### Embeddings API
 Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);

--- a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
+++ b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh
@@ -55,7 +55,6 @@ done
 echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS"
 export RAY_DEDUP_LOGS=0
-export VLLM_ALL2ALL_BACKEND="pplx"
 export VLLM_USE_DEEP_GEMM=1
 vllm serve $MODEL_NAME \
@@ -65,6 +64,7 @@ vllm serve $MODEL_NAME \
    --enforce-eager \
    --enable-expert-parallel \
    --enable-eplb \
+    --all2all-backend pplx \
    --num-redundant-experts $REDUNDANT_EXPERTS \
    --trust-remote-code \
    --host $HOST \

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
    "packaging>=24.2",
    "setuptools>=77.0.3,<81.0.0",
    "setuptools-scm>=8.0",
-    "torch == 2.9.0",
+    "torch == 2.9.1",
    "wheel",
    "jinja2",
 ]

--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -4,7 +4,7 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
-torch==2.9.0
+torch==2.9.1
 wheel
 jinja2>=3.1.6
 regex

--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.12.2 # required for compressed-tensors
+compressed-tensors == 0.13.0 # required for compressed-tensors
 depyf==0.20.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
@@ -50,5 +50,5 @@ ijson # Required for mistral streaming tool parser
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3  # Required for gpt-oss
 anthropic == 0.71.0
-model-hosting-container-standards >= 0.1.9, < 1.0.0
+model-hosting-container-standards >= 0.1.10, < 1.0.0
 mcp
\ No newline at end of file
--- a/requirements/cpu-build.txt
+++ b/requirements/cpu-build.txt
 cmake>=3.26.1
 ninja
 packaging>=24.2
-setuptools>=77.0.3,<81.0.0
+setuptools==77.0.3 # this version can reuse CMake build dir
 setuptools-scm>=8
 torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
 torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"

--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
 # Common dependencies
 -r common.txt
+setuptools==77.0.3 # this version can reuse CMake build dir
 numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
 # Dependencies for CPUs

--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -5,9 +5,9 @@ numba == 0.61.2 # Required for N-gram speculative decoding
 # Dependencies for NVIDIA GPUs
 ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
-torch==2.9.0
+torch==2.9.1
-torchaudio==2.9.0
+torchaudio==2.9.1
 # These must be updated alongside torch
-torchvision==0.24.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision==0.24.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
 flashinfer-python==0.5.3
--- a/requirements/rocm-build.txt
+++ b/requirements/rocm-build.txt
@@ -2,11 +2,11 @@
 -r common.txt
 --extra-index-url https://download.pytorch.org/whl/rocm6.4
-torch==2.9.0
+torch==2.9.1
-torchvision==0.24.0
+torchvision==0.24.1
-torchaudio==2.9.0
+torchaudio==2.9.1
-triton==3.5.0
+triton==3.5.1
 cmake>=3.26.1,<4
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -24,9 +24,9 @@ soundfile # required for audio tests
 jiwer # required for audio tests
 tblib # for pickling test exceptions
 timm >=1.0.17 # required for internvl and gemma3n-mm test
-torch==2.9.0
+torch==2.9.1
-torchaudio==2.9.0
+torchaudio==2.9.1
-torchvision==0.24.0
+torchvision==0.24.1
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
 mistral_common[image,audio] >= 1.8.5 # required for voxtral test

--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -1123,7 +1123,7 @@ tomli==2.2.1
    # via schemathesis
 tomli-w==1.2.0
    # via schemathesis
-torch==2.9.0+cu129
+torch==2.9.1+cu129
    # via
    #   -r requirements/test.in
    #   accelerate
@@ -1152,7 +1152,7 @@ torch==2.9.0+cu129
    #   torchvision
    #   vector-quantize-pytorch
    #   vocos
-torchaudio==2.9.0+cu129
+torchaudio==2.9.1+cu129
    # via
    #   -r requirements/test.in
    #   encodec
@@ -1165,7 +1165,7 @@ torchmetrics==1.7.4
    #   pytorch-lightning
    #   terratorch
    #   torchgeo
-torchvision==0.24.0+cu129
+torchvision==0.24.1+cu129
    # via
    #   -r requirements/test.in
    #   lightly
@@ -1206,7 +1206,7 @@ transformers==4.57.3
    #   transformers-stream-generator
 transformers-stream-generator==0.0.5
    # via -r requirements/test.in
-triton==3.5.0
+triton==3.5.1
    # via torch
 tritonclient==2.51.0
    # via

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -67,7 +67,6 @@ def _fix_prompt_embed_outputs(
 @pytest.mark.parametrize("model_executor", ["uni", "mp"])
 @pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models(
-    monkeypatch: pytest.MonkeyPatch,
    hf_runner,
    model: str,
    backend: str,
@@ -77,48 +76,46 @@ def test_models(
    model_executor: str,
    enable_prompt_embeds: bool,
 ) -> None:
-    with monkeypatch.context() as m:
+    # 5042 tokens for gemma2
-        m.setenv("VLLM_ATTENTION_BACKEND", backend)
+    # gemma2 has alternating sliding window size of 4096
+    # we need a prompt with more than 4096 tokens to test the sliding window
-        # 5042 tokens for gemma2
+    prompt = (
-        # gemma2 has alternating sliding window size of 4096
+        "The following numbers of the sequence "
-        # we need a prompt with more than 4096 tokens to test the sliding window
+        + ", ".join(str(i) for i in range(1024))
-        prompt = (
+        + " are:"
-            "The following numbers of the sequence "
+    )
-            + ", ".join(str(i) for i in range(1024))
+    example_prompts = [prompt]
-            + " are:"
-        )
+    with hf_runner(model) as hf_model:
-        example_prompts = [prompt]
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+        if enable_prompt_embeds:
-        with hf_runner(model) as hf_model:
+            with torch.no_grad():
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+                prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
-            if enable_prompt_embeds:
-                with torch.no_grad():
+    with VllmRunner(
-                    prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
+        model,
+        max_model_len=8192,
-        with VllmRunner(
+        enforce_eager=enforce_eager,
-            model,
+        enable_prompt_embeds=enable_prompt_embeds,
-            max_model_len=8192,
+        gpu_memory_utilization=0.7,
-            enforce_eager=enforce_eager,
+        async_scheduling=async_scheduling,
-            enable_prompt_embeds=enable_prompt_embeds,
+        distributed_executor_backend=model_executor,
-            gpu_memory_utilization=0.7,
+        attention_config={"backend": backend},
-            async_scheduling=async_scheduling,
+    ) as vllm_model:
-            distributed_executor_backend=model_executor,
+        if enable_prompt_embeds:
-        ) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
-            if enable_prompt_embeds:
+            vllm_outputs = _fix_prompt_embed_outputs(
-                vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
+                vllm_outputs, hf_model, example_prompts
-                vllm_outputs = _fix_prompt_embed_outputs(
+            )
-                    vllm_outputs, hf_model, example_prompts
+        else:
-                )
+            vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-            else:
-                vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        check_outputs_equal(
+    check_outputs_equal(
-            outputs_0_lst=hf_outputs,
+        outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
+        outputs_1_lst=vllm_outputs,
-            name_0="hf",
+        name_0="hf",
-            name_1="vllm",
+        name_1="vllm",
-        )
+    )
 @multi_gpu_test(num_gpus=2)
@@ -161,12 +158,6 @@ def test_models_distributed(
        ):  # noqa
            pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
-        if attention_backend:
-            monkeypatch_context.setenv(
-                "VLLM_ATTENTION_BACKEND",
-                attention_backend,
-            )
        for k, v in extra_env.items():
            monkeypatch_context.setenv(k, v)
@@ -178,6 +169,7 @@ def test_models_distributed(
        # if we run HF first, the cuda initialization will be done and it
        # will hurt multiprocessing backend with fork method
        # (the default method).
+        attention_config = {"backend": attention_backend} if attention_backend else None
        with vllm_runner(
            model,
            dtype=dtype,
@@ -185,6 +177,7 @@ def test_models_distributed(
            distributed_executor_backend=distributed_executor_backend,
            enable_prompt_embeds=enable_prompt_embeds,
            gpu_memory_utilization=0.7,
+            attention_config=attention_config,
        ) as vllm_model:
            if enable_prompt_embeds:
                with hf_runner(model, dtype=dtype) as hf_model:

--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -19,21 +19,18 @@ def server():
 @pytest.mark.benchmark
 def test_bench_serve(server):
+    # Test default model detection and input/output len
    command = [
        "vllm",
        "bench",
        "serve",
-        "--model",
-        MODEL_NAME,
        "--host",
        server.host,
        "--port",
        str(server.port),
-        "--dataset-name",
+        "--input-len",
-        "random",
-        "--random-input-len",
        "32",
-        "--random-output-len",
+        "--output-len",
        "4",
        "--num-prompts",
        "5",

--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -208,7 +208,8 @@ def test_attn_quant(
    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+    model_kwargs["attention_config"] = {"backend": backend.name}
    compilation_config = CompilationConfig(
        # Testing properties
@@ -297,7 +298,8 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+    model_kwargs["attention_config"] = {"backend": backend.name}
    compilation_config = CompilationConfig(
        # Testing properties
@@ -409,7 +411,8 @@ def test_tp2_attn_quant_async_tp(
    # To capture subprocess logs, we need to know whether spawn or fork is used.
    # Force spawn as it is more general.
    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+    model_kwargs["attention_config"] = {"backend": backend.name}
    compilation_config = CompilationConfig(
        # Testing properties
@@ -564,7 +567,9 @@ def test_rms_group_quant(
        splitting_ops=splitting_ops,
        # Common
        mode=CompilationMode.VLLM_COMPILE,
-        pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True),
+        pass_config=PassConfig(
+            fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
+        ),
        # Inductor caches custom passes by default as well via uuid
        inductor_compile_config={"force_disable_caches": True},
    )

--- a/tests/compile/fullgraph/test_basic_correctness.py
+++ b/tests/compile/fullgraph/test_basic_correctness.py
@@ -89,7 +89,6 @@ class TestSetting:
    ],
 )
 def test_compile_correctness(
-    monkeypatch: pytest.MonkeyPatch,
    test_setting: TestSetting,
 ):
    # this test is run under multiple suites, with different GPUs.
@@ -107,49 +106,48 @@ def test_compile_correctness(
            f"{cuda_device_count_stateless()}"
        )
-    with monkeypatch.context() as m:
+    final_args = [
-        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+        *model_args,
-        final_args = [
+        "-pp",
-            *model_args,
+        str(pp_size),
-            "-pp",
+        "-tp",
-            str(pp_size),
+        str(tp_size),
-            "-tp",
+        "-cc.cudagraph_mode=none",
-            str(tp_size),
+        f"--attention-backend={attn_backend}",
-            "-cc.cudagraph_mode=none",
+    ]
-        ]
-        all_args: list[list[str]] = []
+    all_args: list[list[str]] = []
-        all_envs: list[dict[str, str] | None] = []
+    all_envs: list[dict[str, str] | None] = []
-        for comp_mode in [
+    for comp_mode in [
-            CompilationMode.STOCK_TORCH_COMPILE,
+        CompilationMode.STOCK_TORCH_COMPILE,
-            CompilationMode.DYNAMO_TRACE_ONCE,
+        CompilationMode.DYNAMO_TRACE_ONCE,
-            CompilationMode.VLLM_COMPILE,
+        CompilationMode.VLLM_COMPILE,
-        ]:
+    ]:
-            for mode in [CompilationMode.NONE, comp_mode]:
+        for mode in [CompilationMode.NONE, comp_mode]:
-                all_args.append(
+            all_args.append(
-                    final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
+                final_args + [f"-cc.mode={mode.name}", "-cc.backend=inductor"]
-                )
-            # inductor will change the output, so we only compare if the output
-            # is close, not exactly the same.
-            compare_all_settings(
-                model,
-                all_args,
-                all_envs,
-                method=method if method != "generate" else "generate_close",
            )
-            all_envs.clear()
-            all_args.clear()
-        for mode in [
+        # inductor will change the output, so we only compare if the output
-            CompilationMode.NONE,
+        # is close, not exactly the same.
-            CompilationMode.STOCK_TORCH_COMPILE,
+        compare_all_settings(
-            CompilationMode.DYNAMO_TRACE_ONCE,
+            model,
-            CompilationMode.VLLM_COMPILE,
+            all_args,
-        ]:
+            all_envs,
-            all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
+            method=method if method != "generate" else "generate_close",
-            all_envs.append({})
+        )
-            all_envs.append({})
+        all_envs.clear()
+        all_args.clear()
+    for mode in [
+        CompilationMode.NONE,
+        CompilationMode.STOCK_TORCH_COMPILE,
+        CompilationMode.DYNAMO_TRACE_ONCE,
+        CompilationMode.VLLM_COMPILE,
+    ]:
+        all_args.append(final_args + [f"-cc.mode={mode.name}", "-cc.backend=eager"])
+        all_envs.append({})
+        all_envs.append({})
-        compare_all_settings(model, all_args * 3, all_envs, method=method)
+    compare_all_settings(model, all_args * 3, all_envs, method=method)
--- a/tests/compile/fullgraph/test_full_cudagraph.py
+++ b/tests/compile/fullgraph/test_full_cudagraph.py
--- a/tests/compile/fullgraph/test_full_graph.py
+++ b/tests/compile/fullgraph/test_full_graph.py
--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py