Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

38d80967 · zhuwenwen · 33650733 · 880c741b · 38d80967 · 38d80967
Commit 38d80967 authored Sep 12, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -683,6 +683,37 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
    )


+# Keye-VL-1.5
+def run_keye_vl1_5(questions: list[str], modality: str) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-1.5-8B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        trust_remote_code=True,
+        limit_mm_per_prompt={modality: 1},
+    )
+
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
+
+    prompts = [
+        (
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{question}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+    )
+
+
 # Kimi-VL
 def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -1648,6 +1679,7 @@ model_example_map = {
    "interns1": run_interns1,
    "internvl_chat": run_internvl,
    "keye_vl": run_keye_vl,
+    "keye_vl1_5": run_keye_vl1_5,
    "kimi_vl": run_kimi_vl,
    "llama4": run_llama4,
    "llava": run_llava,

--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -542,6 +542,43 @@ def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    )


+def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "Kwai-Keye/Keye-VL-1_5-8B"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    image_data = [fetch_image(url) for url in image_urls]
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=image_data,
+    )
+
+
 def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"

@@ -1209,6 +1246,7 @@ model_example_map = {
    "interns1": load_interns1,
    "internvl_chat": load_internvl,
    "keye_vl": load_keye_vl,
+    "keye_vl1_5": load_keye_vl1_5,
    "kimi_vl": load_kimi_vl,
    "llama4": load_llama4,
    "llava": load_llava,

--- a/examples/online_serving/disaggregated_prefill.sh
+++ b/examples/online_serving/disaggregated_prefill.sh
@@ -53,7 +53,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
    --gpu-memory-utilization 0.8 \
    --trust-remote-code \
    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &

 # decoding instance, which is the KV consumer
 CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
@@ -62,7 +62,7 @@ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
    --gpu-memory-utilization 0.8 \
    --trust-remote-code \
    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &

 # wait until prefill and decode instances are ready
 wait_for_server 8100

--- a/examples/online_serving/kv_events_subscriber.py
+++ b/examples/online_serving/kv_events_subscriber.py
@@ -6,6 +6,8 @@ import msgspec
 import zmq
 from msgspec.msgpack import Decoder

+from vllm.v1.core.kv_cache_utils import BlockHash
+

 #
 # Types copied from vllm.distributed.kv_events
@@ -22,8 +24,8 @@ class KVCacheEvent(


 class BlockStored(KVCacheEvent):
-    block_hashes: list[int]
-    parent_block_hash: Optional[int]
+    block_hashes: list[BlockHash]
+    parent_block_hash: Optional[BlockHash]
    token_ids: list[int]
    block_size: int
    lora_id: Optional[int]
@@ -31,7 +33,7 @@ class BlockStored(KVCacheEvent):


 class BlockRemoved(KVCacheEvent):
-    block_hashes: list[int]
+    block_hashes: list[BlockHash]
    medium: Optional[str]



--- a/examples/online_serving/multi-node-serving.sh
+++ b/examples/online_serving/multi-node-serving.sh
@@ -11,7 +11,7 @@
 # Example usage:
 # On the head node machine, start the Ray head node process and run a vLLM server.
 #   ./multi-node-serving.sh leader --ray_port=6379 --ray_cluster_size=<SIZE> [<extra ray args>]  && \
-#   python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2
+#   vllm serve meta-llama/Meta-Llama-3.1-405B-Instruct --port 8080 --tensor-parallel-size 8 --pipeline_parallel_size 2
 # 
 # On each worker node, start the Ray worker node process.
 #   ./multi-node-serving.sh worker --ray_address=<HEAD_NODE_IP> --ray_port=6379 [<extra ray args>]

--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -266,10 +266,52 @@ def run_audio(model: str) -> None:
    print("Chat completion output from base64 encoded audio:", result)


+def run_multi_audio(model: str) -> None:
+    from vllm.assets.audio import AudioAsset
+
+    # Two different audios to showcase batched inference.
+    audio_url = AudioAsset("winning_call").url
+    audio_base64 = encode_base64_content_from_url(audio_url)
+    audio_url2 = AudioAsset("azacinto_foscolo").url
+    audio_base64_2 = encode_base64_content_from_url(audio_url2)
+
+    # OpenAI-compatible schema (`input_audio`)
+    chat_completion_from_base64 = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Are these two audios the same?"},
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_base64,
+                            "format": "wav",
+                        },
+                    },
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            "data": audio_base64_2,
+                            "format": "wav",
+                        },
+                    },
+                ],
+            }
+        ],
+        model=model,
+        max_completion_tokens=64,
+    )
+
+    result = chat_completion_from_base64.choices[0].message.content
+    print("Chat completion output from input audio:", result)
+
+
 example_function_map = {
    "text-only": run_text_only,
    "single-image": run_single_image,
    "multi-image": run_multi_image,
+    "multi-audio": run_multi_audio,
    "video": run_video,
    "audio": run_audio,
 }

--- a/examples/online_serving/prithvi_geospatial_mae.py
+++ b/examples/online_serving/prithvi_geospatial_mae.py
@@ -10,18 +10,19 @@ import requests
 # multimodal data. In this specific case this example will take a geotiff
 # image as input, process it using the multimodal data processor, and
 # perform inference.
-# Reuirements :
+# Requirements :
 # - install plugin at:
 #   https://github.com/christian-pinto/prithvi_io_processor_plugin
 # - start vllm in serving mode with the below args
 #   --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
+#   --model-impl terratorch
 #   --task embed --trust-remote-code
 #   --skip-tokenizer-init --enforce-eager
-#   --io-processor-plugin prithvi_to_tiff_india
+#   --io-processor-plugin prithvi_to_tiff


 def main():
-    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif"  # noqa: E501
+    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff"  # noqa: E501
    server_endpoint = "http://localhost:8000/pooling"

    request_payload_url = {
@@ -33,6 +34,7 @@ def main():
        },
        "priority": 0,
        "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
+        "softmax": False,
    }

    ret = requests.post(server_endpoint, json=request_payload_url)

--- a/examples/online_serving/prometheus_grafana/grafana.json
+++ b/examples/online_serving/prometheus_grafana/grafana.json
@@ -402,7 +402,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
          "fullMetaSearch": false,
          "includeNullMetadata": false,
          "instant": false,
@@ -418,7 +418,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -435,7 +435,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -452,7 +452,7 @@
          },
          "disableTextWrap": false,
          "editorMode": "builder",
-          "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+          "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
          "fullMetaSearch": false,
          "hide": false,
          "includeNullMetadata": false,
@@ -468,7 +468,7 @@
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
-          "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
+          "expr": "rate(vllm:inter_token_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:inter_token_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
          "hide": false,
          "instant": false,
          "legendFormat": "Mean",
@@ -476,7 +476,7 @@
          "refId": "E"
        }
      ],
-      "title": "Time Per Output Token Latency",
+      "title": "Inter Token Latency",
      "type": "timeseries"
    },
    {

--- a/examples/tool_chat_template_phi4_mini.jinja
+++ b/examples/tool_chat_template_phi4_mini.jinja
@@ -9,7 +9,7 @@
 <|system|>
 {{ system_message }}
 {%- if tools %}
-In addition to plain text responses, you can chose to call one or more of the provided functions.
+In addition to plain text responses, you can choose to call one or more of the provided functions.

 Use the following rule to decide when to call a function:
  * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
@@ -19,7 +19,7 @@ If you decide to call functions:
  * prefix function calls with functools marker (no closing marker required)
  * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
  * follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
-  * respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
+  * respect the argument type formatting. E.g., if the type is number and format is float, write value 7 as 7.0
  * make sure you pick the right functions that match the user intent



--- a/pyproject.toml
+++ b/pyproject.toml
@@ -228,6 +228,7 @@ fo = "fo"
 ba = "ba"

 [tool.typos.type.py.extend-words]
+ba = "ba"

 [tool.typos.type.cpp]
 extend-glob = ["*.cu"]
@@ -344,3 +345,6 @@ extend-ignore-re = []
 windo = "windo"

 [tool.typos.type.vimscript.extend-words]
+
+[tool.uv]
+no-build-isolation-package = ["torch"]
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -20,12 +20,11 @@ prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer == 0.11.3
 llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
-outlines_core == 0.2.10 ; platform_machine != "s390x"
-outlines == 0.1.11 ; platform_machine == "s390x"
+outlines_core == 0.2.11
 # required for outlines backend disk cache
 diskcache == 5.6.3
 lark == 1.2.2
-xgrammar == 0.1.21; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
+xgrammar == 0.1.23; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
 partial-json-parser # used for parsing partial JSON outputs

--- a/requirements/neuron.txt
+++ b/requirements/neuron.txt
-# Common dependencies
-r common.txt
-
-# Dependencies for Neuron devices
-packaging>=24.2
-setuptools>=77.0.3,<80.0.0
-torch-neuronx >= 2.5.0
-neuronx-cc>=2.0.0a0
-torchvision # Required for Llama3.2 multimodal image preprocessing
--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9'
 boto3
 botocore
 datasets
-ray>=2.10.0,<2.45.0
+ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
 peft
 pytest-asyncio
 tensorizer==2.10.1

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -21,6 +21,7 @@ ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline paralleli
 sentence-transformers # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
+tblib # for pickling test exceptions
 timm >=1.0.17 # required for internvl and gemma3n-mm test
 torch==2.8.0
 torchaudio==2.8.0
@@ -53,5 +54,5 @@ runai-model-streamer==0.11.0
 runai-model-streamer-s3==0.11.0
 fastsafetensors>=0.1.10
 pydantic>=2.10 # 2.9 leads to error on python 3.10
-terratorch==1.1rc2 # required for PrithviMAE test
 decord==0.6.0
+terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -137,7 +137,7 @@ contourpy==1.3.0
    # via matplotlib
 cramjam==2.9.0
    # via fastparquet
-cupy-cuda12x==13.3.0
+cupy-cuda12x==13.6.0
    # via ray
 cycler==0.12.1
    # via matplotlib
@@ -1032,6 +1032,8 @@ tabledata==1.3.3
    # via pytablewriter
 tabulate==0.9.0
    # via sacrebleu
+tblib==3.1.0
+    # via -r requirements/test.in
 tcolorpy==0.1.6
    # via pytablewriter
 tenacity==9.0.0
@@ -1042,7 +1044,7 @@ tensorboardx==2.6.4
    # via lightning
 tensorizer==2.10.1
    # via -r requirements/test.in
-terratorch==1.1rc2
+terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
    # via -r requirements/test.in
 threadpoolctl==3.5.0
    # via scikit-learn

--- a/requirements/xpu.txt
+++ b/requirements/xpu.txt
@@ -10,10 +10,10 @@ wheel
 jinja2>=3.1.6
 datasets # for benchmark scripts
 numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
--extra-index-url=https://download.pytorch.org/whl/xpu
+nixl==0.3.0 # for PD disaggregation
 torch==2.8.0+xpu
 torchaudio
 torchvision
-pytorch-triton-xpu
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.8.10+xpu
+--extra-index-url=https://download.pytorch.org/whl/xpu
+
+intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.8.10.post0%2Bxpu-cp312-cp312-linux_x86_64.whl
--- a/setup.py
+++ b/setup.py
@@ -413,8 +413,7 @@ def _no_device() -> bool:

 def _is_cuda() -> bool:
    has_cuda = torch.version.cuda is not None
-    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
-            and not (_is_neuron() or _is_tpu()))
+    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda and not _is_tpu())


 def _is_hip() -> bool:
@@ -422,10 +421,6 @@ def _is_hip() -> bool:
            or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None


-def _is_neuron() -> bool:
-    return VLLM_TARGET_DEVICE == "neuron"
-
-
 def _is_tpu() -> bool:
    return VLLM_TARGET_DEVICE == "tpu"

@@ -470,25 +465,6 @@ def get_rocm_version():
        return None


-def get_neuronxcc_version():
-    import sysconfig
-    site_dir = sysconfig.get_paths()["purelib"]
-    version_file = os.path.join(site_dir, "neuronxcc", "version",
-                                "__init__.py")
-
-    # Check if the command was executed successfully
-    with open(version_file) as fp:
-        content = fp.read()
-
-    # Extract the version using a regular expression
-    match = re.search(r"__version__ = '(\S+)'", content)
-    if match:
-        # Return the version string
-        return match.group(1)
-    else:
-        raise RuntimeError("Could not find Neuron version in the output")
-
-
 def get_nvcc_cuda_version() -> Version:
    """Get the CUDA version from nvcc.

@@ -541,12 +517,6 @@ def get_vllm_version() -> str:
        rocm_version = get_rocm_version() or torch.version.hip
        if rocm_version and rocm_version != MAIN_CUDA_VERSION:
            version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
-    elif _is_neuron():
-        # Get the Neuron version
-        neuron_version = str(get_neuronxcc_version())
-        if neuron_version != MAIN_CUDA_VERSION:
-            neuron_version_str = neuron_version.replace(".", "")[:3]
-            version += f"{sep}neuron{neuron_version_str}"
    elif _is_tpu():
        version += f"{sep}tpu"
    elif _is_cpu():
@@ -591,8 +561,6 @@ def get_requirements() -> list[str]:
        requirements = modified_requirements
    elif _is_hip():
        requirements = _read_requirements("rocm.txt")
-    elif _is_neuron():
-        requirements = _read_requirements("neuron.txt")
    elif _is_tpu():
        requirements = _read_requirements("tpu.txt")
    elif _is_cpu():
@@ -601,7 +569,7 @@ def get_requirements() -> list[str]:
        requirements = _read_requirements("xpu.txt")
    else:
        raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
+            "Unsupported platform, please use CUDA, ROCm, or CPU.")
    return requirements


@@ -688,13 +656,15 @@ setup(
        "bench": ["pandas", "datasets"],
        "tensorizer": ["tensorizer==2.10.1"],
        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
-        "runai":
-        ["runai-model-streamer >= 0.13.3", "runai-model-streamer-s3", "boto3"],
+        "runai": [
+            "runai-model-streamer >= 0.14.0", "runai-model-streamer-gcs",
+            "google-cloud-storage", "runai-model-streamer-s3", "boto3"
+        ],
        "audio": ["librosa", "soundfile",
                  "mistral_common[audio]"],  # Required for audio processing
        "video": [],  # Kept for backwards compatibility
        # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.2.14.post1"],
+        "flashinfer": ["flashinfer-python==0.3.0"],
        # Optional deps for AMD FP4 quantization support
        "petit-kernel": ["petit-kernel"],
    },

--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import copyreg
 import os
 import subprocess
 import sys
@@ -10,6 +11,30 @@ from pathlib import Path

 import pytest
 import requests
+import urllib3.exceptions
+
+
+def _pickle_new_connection_error(obj):
+    """Custom pickler for NewConnectionError to fix tblib compatibility."""
+    # Extract the original message by removing the "conn: " prefix
+    full_message = obj.args[0] if obj.args else ""
+    if ': ' in full_message:
+        # Split off the connection part and keep the actual message
+        _, actual_message = full_message.split(': ', 1)
+    else:
+        actual_message = full_message
+    return _unpickle_new_connection_error, (actual_message, )
+
+
+def _unpickle_new_connection_error(message):
+    """Custom unpickler for NewConnectionError."""
+    # Create with None as conn and the actual message
+    return urllib3.exceptions.NewConnectionError(None, message)
+
+
+# Register the custom pickle/unpickle functions for tblib compatibility
+copyreg.pickle(urllib3.exceptions.NewConnectionError,
+               _pickle_new_connection_error)


 def _query_server(prompt: str, max_tokens: int = 5) -> dict:
@@ -52,6 +77,7 @@ def api_server(distributed_executor_backend: str):
    uvicorn_process.terminate()


+@pytest.mark.timeout(300)
 @pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
 def test_api_server(api_server, distributed_executor_backend: str):
    """
@@ -98,7 +124,7 @@ def test_api_server(api_server, distributed_executor_backend: str):
        pool.join()

        # check cancellation stats
-        # give it some times to update the stats
+        # give it some time to update the stats
        time.sleep(1)

        num_aborted_requests = requests.get(

--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -45,3 +45,34 @@ def test_bench_serve(server):
    print(result.stderr)

    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
+
+@pytest.mark.benchmark
+def test_bench_serve_chat(server):
+    command = [
+        "vllm",
+        "bench",
+        "serve",
+        "--model",
+        MODEL_NAME,
+        "--host",
+        server.host,
+        "--port",
+        str(server.port),
+        "--dataset-name",
+        "random",
+        "--random-input-len",
+        "32",
+        "--random-output-len",
+        "4",
+        "--num-prompts",
+        "5",
+        "--endpoint",
+        "/v1/chat/completions",
+        "--endpoint-type",
+        "openai-chat",
+    ]
+    result = subprocess.run(command, capture_output=True, text=True)
+    print(result.stdout)
+    print(result.stderr)
+
+    assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
--- a/tests/compile/piecewise/test_full_cudagraph.py
+++ b/tests/compile/piecewise/test_full_cudagraph.py
@@ -61,6 +61,16 @@ backend_configs = {
                      "cudagraph_mode": "FULL_AND_PIECEWISE",
                  },
                  specific_gpu_arch=(9, 0)),
+    # FlashAttention MLA on Hopper
+    "FlashAttentionMLA":
+    BackendConfig(name="FlashAttentionMLA",
+                  env_vars={
+                      "VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
+                  },
+                  comp_config={
+                      "cudagraph_mode": "FULL_DECODE_ONLY",
+                  },
+                  specific_gpu_arch=(9, 0)),
    # Cutlass MLA on Blackwell
    "CutlassMLA":
    BackendConfig(
@@ -102,7 +112,7 @@ backend_configs = {
 test_params_full_cudagraph = []

 # deepseek-ai/DeepSeek-V2-Lite with MLA
-MLA_backends = ["FlashMLA", "CutlassMLA"]
+MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
 for mla_backend in MLA_backends:
    test_params_full_cudagraph.append(
        pytest.param(