Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

38d80967 · zhuwenwen · 33650733 · 880c741b · 38d80967 · 38d80967
Commit 38d80967 authored Sep 12, 2025 by zhuwenwen
20 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,7 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
-# triton jit 
+# triton jit
 .triton
 # Byte-compiled / optimized / DLL files
@@ -177,6 +177,14 @@ cython_debug/
 # VSCode
 .vscode/
+# Claude
+CLAUDE.md
+.claude/
+# Codex
+AGENTS.md
+.codex/
 # DS Store
 .DS_Store
@@ -209,4 +217,4 @@ shellcheck*/
 csrc/moe/marlin_moe_wna16/kernel_*
 # Ignore ep_kernels_workspace folder
 ep_kernels_workspace/
\ No newline at end of file
--- a/.yapfignore
+++ b/.yapfignore
 collect_env.py
+vllm/model_executor/layers/fla/ops/*.py
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -2,7 +2,6 @@ include LICENSE
 include requirements/common.txt
 include requirements/cuda.txt
 include requirements/rocm.txt
-include requirements/neuron.txt
 include requirements/cpu.txt
 include CMakeLists.txt

--- a/README.md
+++ b/README.md
@@ -14,19 +14,24 @@ Easy, fast, and cheap LLM serving for everyone
 | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
+---
+Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
 ---
 *Latest News* 🔥
+- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
+- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
 - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 <details>
 <summary>Previous News</summary>
+- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
+- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
 - [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
 - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).

--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -95,6 +95,24 @@ become available.
      <td style="text-align: center;">✅</td>
      <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
    </tr>
+    <tr>
+      <td><strong>HuggingFace-MTBench</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>philschmid/mt-bench</code></td>
+    </tr>
+    <tr>
+      <td><strong>HuggingFace-Blazedit</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>vdaita/edit_5k_char</code>, <code>vdaita/edit_10k_char</code></td>
+    </tr>
+    <tr>
+      <td><strong>Spec Bench</strong></td>
+      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">✅</td>
+      <td><code>wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl</code></td>
+    </tr>
    <tr>
      <td><strong>Custom</strong></td>
      <td style="text-align: center;">✅</td>
@@ -110,7 +128,12 @@ become available.
 🚧: to be supported
-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
+**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
+For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
+```bash
+--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
+```
 ## 🚀 Example - Online Benchmark
@@ -234,6 +257,43 @@ vllm bench serve \
    --num-prompts 2048
 ```
+### Spec Bench Benchmark with Speculative Decoding
+``` bash
+VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
+```
+[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
+Run all categories:
+``` bash
+# Download the dataset using:
+# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name spec_bench \ 
+    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+    --num-prompts -1
+```
+Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
+Run only a specific category like "summarization":
+``` bash
+vllm bench serve \
+    --model meta-llama/Meta-Llama-3-8B-Instruct \
+    --dataset-name spec_bench \ 
+    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
+    --num-prompts -1
+    --spec-bench-category "summarization"
+```
 ### Other HuggingFaceDataset Examples
 ```bash
@@ -290,6 +350,18 @@ vllm bench serve \
    --num-prompts 80
 ```
+`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
+``` bash
+vllm bench serve \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path vdaita/edit_5k_char \
+    --num-prompts 90 \
+    --blazedit-min-distance 0.01 \
+    --blazedit-max-distance 0.99
+```
 ### Running With Sampling Parameters
 When using OpenAI-compatible backends such as `vllm`, optional sampling
@@ -689,7 +761,7 @@ python -m vllm.entrypoints.openai.api_server \
 Send requests with images:
 ```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dataset-name sharegpt \
@@ -716,7 +788,7 @@ python -m vllm.entrypoints.openai.api_server \
 Send requests with videos:
 ```bash
-python benchmarks/benchmark_serving.py \
+vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dataset-name sharegpt \

--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@@ -31,6 +31,12 @@ cd vllm
 You must set the following variables at the top of the script before execution.
+   Note: You can also override the default values below via environment variables when running the script.
+```bash
+MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
+```
 | Variable | Description | Example Value |
 | --- | --- | --- |
 | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |

--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@@ -5,25 +5,41 @@
 TAG=$(date +"%Y_%m_%d_%H_%M")
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-BASE="$SCRIPT_DIR/../../.."
+VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
-MODEL="meta-llama/Llama-3.1-8B-Instruct"
+BASE=${BASE:-"$SCRIPT_DIR/../../.."}
-SYSTEM="TPU"
+MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
-TP=1
+SYSTEM=${SYSTEM:-"TPU"}
-DOWNLOAD_DIR=""
+TP=${TP:-1}
-INPUT_LEN=4000
+DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
-OUTPUT_LEN=16
+INPUT_LEN=${INPUT_LEN:-4000}
-MAX_MODEL_LEN=4096
+OUTPUT_LEN=${OUTPUT_LEN:-16}
-MIN_CACHE_HIT_PCT=0
+MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
-MAX_LATENCY_ALLOWED_MS=100000000000
+MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
-NUM_SEQS_LIST="128 256"
+MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
-NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
+NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
+NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
 PROFILE_PATH="$LOG_FOLDER/profile"
-echo "result file: $RESULT"
+echo "====================== AUTO TUNE PARAMETERS ===================="
-echo "model: $MODEL"
+echo "SCRIPT_DIR=$SCRIPT_DIR"
+echo "BASE=$BASE"
+echo "MODEL=$MODEL"
+echo "SYSTEM=$SYSTEM"
+echo "TP=$TP"
+echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
+echo "INPUT_LEN=$INPUT_LEN"
+echo "OUTPUT_LEN=$OUTPUT_LEN"
+echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
+echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
+echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
+echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
+echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
+echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
+echo "RESULT_FILE=$RESULT"
+echo "====================== AUTO TUNEPARAMETERS ===================="
 rm -rf $LOG_FOLDER
 rm -rf $PROFILE_PATH
@@ -213,7 +229,7 @@ run_benchmark() {
    pkill -if vllm
    sleep 10
-    printf '=%.0s' $(seq 1 20)
+    echo "===================="
    return 0
 }

--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@@ -57,7 +57,7 @@ def invoke_main() -> None:
        "--num-iteration",
        type=int,
        default=1000,
-        help="Number of iterations to run to stablize final data readings",
+        help="Number of iterations to run to stabilize final data readings",
    )
    parser.add_argument(
        "--allocate-blocks",

--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -403,7 +403,7 @@ class RandomDataset(BenchmarkDataset):
            # [6880, 6881] -> ['Ġcalls', 'here'] ->
            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
            # To avoid uncontrolled change of the prompt length,
-            # the encoded sequence is truncated before being decode again.
+            # the encoded sequence is truncated before being decoded again.
            total_input_len = prefix_len + int(input_lens[i])
            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
                :total_input_len

--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark the latency of processing a single batch of requests."""
+import sys
-import argparse
+if __name__ == "__main__":
-import dataclasses
+    print("""DEPRECATED: This script has been moved to the vLLM CLI.
-import json
-import os
-import time
-from typing import Any, Optional
-import numpy as np
-from tqdm import tqdm
-from typing_extensions import deprecated
-import vllm.envs as envs
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
-from vllm.inputs import PromptType
-from vllm.sampling_params import BeamSearchParams
-from vllm.utils import FlexibleArgumentParser
-def save_to_pytorch_benchmark_format(
-    args: argparse.Namespace, results: dict[str, Any]
-) -> None:
-    pt_records = convert_to_pytorch_benchmark_format(
-        args=args,
-        metrics={"latency": results["latencies"]},
-        extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
-    )
-    if pt_records:
-        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
-        write_to_json(pt_file, pt_records)
-@deprecated(
-    "benchmark_latency.py is deprecated and will be removed in a "
-    "future version. Please use 'vllm bench latency' instead.",
-)
-def main(args: argparse.Namespace):
-    print(args)
-    engine_args = EngineArgs.from_cli_args(args)
-    # NOTE(woosuk): If the request cannot be processed in a single batch,
-    # the engine will automatically process the request in multiple batches.
-    llm = LLM(**dataclasses.asdict(engine_args))
-    assert llm.llm_engine.model_config.max_model_len >= (
-        args.input_len + args.output_len
-    ), (
-        "Please ensure that max_model_len is greater than"
-        " the sum of input_len and output_len."
-    )
-    sampling_params = SamplingParams(
-        n=args.n,
-        temperature=1.0,
-        top_p=1.0,
-        ignore_eos=True,
-        max_tokens=args.output_len,
-        detokenize=not args.disable_detokenize,
-    )
-    print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(
-        10000, size=(args.batch_size, args.input_len)
-    )
-    dummy_prompts: list[PromptType] = [
-        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
-    ]
-    def llm_generate():
-        if not args.use_beam_search:
-            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
-        else:
-            llm.beam_search(
-                dummy_prompts,
-                BeamSearchParams(
-                    beam_width=args.n,
-                    max_tokens=args.output_len,
-                    ignore_eos=True,
-                ),
-            )
-    def run_to_completion(profile_dir: Optional[str] = None):
-        if profile_dir:
-            llm.start_profile()
-            llm_generate()
-            llm.stop_profile()
-        else:
-            start_time = time.perf_counter()
-            llm_generate()
-            end_time = time.perf_counter()
-            latency = end_time - start_time
-            return latency
-    print("Warming up...")
-    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
-        run_to_completion(profile_dir=None)
-    if args.profile:
-        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
-        print(f"Profiling (results will be saved to '{profile_dir}')...")
-        run_to_completion(profile_dir=profile_dir)
-        return
-    # Benchmark.
-    latencies = []
-    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
-        latencies.append(run_to_completion(profile_dir=None))
-    latencies = np.array(latencies)
-    percentages = [10, 25, 50, 75, 90, 99]
-    percentiles = np.percentile(latencies, percentages)
-    print(f"Avg latency: {np.mean(latencies)} seconds")
-    for percentage, percentile in zip(percentages, percentiles):
-        print(f"{percentage}% percentile latency: {percentile} seconds")
-    # Output JSON results if specified
-    if args.output_json:
-        results = {
-            "avg_latency": np.mean(latencies),
-            "latencies": latencies.tolist(),
-            "percentiles": dict(zip(percentages, percentiles.tolist())),
-        }
-        with open(args.output_json, "w") as f:
-            json.dump(results, f, indent=4)
-        save_to_pytorch_benchmark_format(args, results)
-def create_argument_parser():
-    parser = FlexibleArgumentParser(
-        description="Benchmark the latency of processing a single batch of "
-        "requests till completion."
-    )
-    parser.add_argument("--input-len", type=int, default=32)
-    parser.add_argument("--output-len", type=int, default=128)
-    parser.add_argument("--batch-size", type=int, default=8)
-    parser.add_argument(
-        "--n",
-        type=int,
-        default=1,
-        help="Number of generated sequences per prompt.",
-    )
-    parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument(
-        "--num-iters-warmup",
-        type=int,
-        default=10,
-        help="Number of iterations to run for warmup.",
-    )
-    parser.add_argument(
-        "--num-iters", type=int, default=30, help="Number of iterations to run."
-    )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="profile the generation process of a single batch",
-    )
-    parser.add_argument(
-        "--output-json",
-        type=str,
-        default=None,
-        help="Path to save the latency results in JSON format.",
-    )
-    parser.add_argument(
-        "--disable-detokenize",
-        action="store_true",
-        help=(
-            "Do not detokenize responses (i.e. do not include "
-            "detokenization time in the latency measurement)"
-        ),
-    )
-    parser = EngineArgs.add_cli_args(parser)
-    # V1 enables prefix caching by default which skews the latency
-    # numbers. We need to disable prefix caching by default.
-    parser.set_defaults(enable_prefix_caching=False)
-    return parser
+Please use the following command instead:
+    vllm bench latency
+For help with the new command, run:
+    vllm bench latency --help
-if __name__ == "__main__":
+Alternatively, you can run the new command directly with:
-    parser = create_argument_parser()
+    python -m vllm.entrypoints.cli.main bench latency --help
-    args = parser.parse_args()
+""")
-    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
+    sys.exit(1)
-        raise OSError(
-            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
-            "Please set it to a valid path to use torch profiler."
-        )
-    main(args)
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -77,7 +77,7 @@ def invoke_main() -> None:
        "--num-iteration",
        type=int,
        default=100,
-        help="Number of iterations to run to stablize final data readings",
+        help="Number of iterations to run to stabilize final data readings",
    )
    parser.add_argument(
        "--num-req", type=int, default=128, help="Number of requests in the batch"

--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -998,7 +998,7 @@ def create_argument_parser():
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentiles. "
        "This argument specifies the metrics to report percentiles. "
        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
        'Default value is "ttft,tpot,itl".',

--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@@ -62,7 +62,7 @@ benchmark() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
  CUDA_VISIBLE_DEVICES=1 python3 \
@@ -72,7 +72,7 @@ benchmark() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
  wait_for_server 8100
  wait_for_server 8200

--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@@ -69,7 +69,7 @@ launch_disagg_prefill() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
  CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
@@ -78,7 +78,7 @@ launch_disagg_prefill() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
  wait_for_server 8100
  wait_for_server 8200

--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@@ -4,7 +4,10 @@
 import torch
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_block_fp8_matmul,
+    apply_w8a8_block_fp8_linear,
+)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    CUTLASS_BLOCK_FP8_SUPPORTED,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton as vllm_triton
@@ -29,7 +32,7 @@ DEEPSEEK_V3_SHAPES = [
 ]
-def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
+def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
    """Build runner function for w8a8 block fp8 matmul."""
    factor_for_scale = 1e-2
@@ -37,37 +40,54 @@ def build_w8a8_block_fp8_runner(M, N, K, block_size, device):
    fp8_max, fp8_min = fp8_info.max, fp8_info.min
    # Create random FP8 tensors
-    A_fp32 = (torch.rand(M, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+    A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
-    A = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
-    B_fp32 = (torch.rand(N, K, dtype=torch.float32, device=device) - 0.5) * 2 * fp8_max
+    B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
-    B = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
    # Create scales
    block_n, block_k = block_size[0], block_size[1]
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k
-    As = torch.rand(M, k_tiles, dtype=torch.float32, device=device) * factor_for_scale
    Bs = (
        torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
        * factor_for_scale
    )
+    # SM90 CUTLASS requires row-major format for scales
+    if use_cutlass and current_platform.is_device_capability(90):
+        Bs = Bs.T.contiguous()
    def run():
-        return w8a8_block_fp8_matmul(A, B, As, Bs, block_size, torch.bfloat16)
+        if use_cutlass:
+            return apply_w8a8_block_fp8_linear(
+                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
+            )
+        else:
+            return apply_w8a8_block_fp8_linear(
+                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
+            )
    return run
+# Determine available providers
+available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
+plot_title = "BF16 vs W8A8 Block FP8 GEMMs"
+if CUTLASS_BLOCK_FP8_SUPPORTED:
+    available_providers.append("w8a8-block-fp8-cutlass")
 @vllm_triton.testing.perf_report(
    vllm_triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
-        line_vals=["torch-bf16", "w8a8-block-fp8"],
+        line_vals=available_providers,
-        line_names=["torch-bf16", "w8a8-block-fp8"],
+        line_names=available_providers,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs W8A8 Block FP8 GEMMs",
        args={},
@@ -85,11 +105,22 @@ def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
        )
-    else:  # w8a8-block-fp8
+    elif provider == "w8a8-block-fp8-triton":
-        run_w8a8 = build_w8a8_block_fp8_runner(M, N, K, block_size, device)
+        run_w8a8_triton = build_w8a8_block_fp8_runner(
+            M, N, K, block_size, device, use_cutlass=False
+        )
+        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
+            lambda: run_w8a8_triton(), quantiles=quantiles
+        )
+    elif provider == "w8a8-block-fp8-cutlass":
+        run_w8a8_cutlass = build_w8a8_block_fp8_runner(
+            M, N, K, block_size, device, use_cutlass=True
+        )
        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
-            lambda: run_w8a8(), quantiles=quantiles
+            lambda: run_w8a8_cutlass(), quantiles=quantiles
        )
+    else:
+        raise ValueError(f"Unknown provider: {provider}")
    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)

--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -637,7 +637,7 @@ def bench_optype(
    # Clear LoRA optimization hash-maps.
    _LORA_A_PTR_DICT.clear()
    _LORA_B_PTR_DICT.clear()
-    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
+    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
    for kwargs in kwargs_list:
        op_type.bench_fn()(**kwargs)
    torch.cuda.synchronize()