raw_vllm

fbeb8a6f · raojy · 2ca8867f · fbeb8a6f · fbeb8a6f · fbeb8a6f
Commit fbeb8a6f authored Mar 27, 2026 by raojy
20 changed files
--- a/benchmarks/attention_benchmarks/__init__.py
+++ b/benchmarks/attention_benchmarks/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""vLLM Attention Benchmarking Suite."""
+from .batch_spec import (
+    BatchRequest,
+    format_batch_spec,
+    get_batch_stats,
+    parse_batch_spec,
+    reorder_for_flashinfer,
+    split_by_type,
+)
+from .common import (
+    BenchmarkConfig,
+    BenchmarkResult,
+    MockLayer,
+    ResultsFormatter,
+    get_attention_scale,
+    is_mla_backend,
+    setup_mla_dims,
+)
+__all__ = [
+    # Batch specification
+    "BatchRequest",
+    "parse_batch_spec",
+    "format_batch_spec",
+    "reorder_for_flashinfer",
+    "split_by_type",
+    "get_batch_stats",
+    # Benchmarking infrastructure
+    "BenchmarkConfig",
+    "BenchmarkResult",
+    "ResultsFormatter",
+    # Mock objects
+    "MockLayer",
+    # Utilities
+    "setup_mla_dims",
+    "get_attention_scale",
+    "is_mla_backend",
+]
--- a/benchmarks/attention_benchmarks/batch_spec.py
+++ b/benchmarks/attention_benchmarks/batch_spec.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Simplified batch specification grammar for attention benchmarks.
+Grammar (underscore-separated segments):
+  Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
+  - count: Number of identical requests (optional, default=1)
+  - q_len: Query length (number of new tokens)
+  - seq_len: Total sequence length (optional, defaults to q_len for prefill)
+  - 'k' suffix: Multiplies value by 1024
+Common patterns:
+  - Prefill:  q_len == seq_len  (e.g., "q2k" → 2048 new tokens, 2048 seq)
+  - Decode:   q_len == 1        (e.g., "q1s1k" → 1 token, 1024 seq length)
+  - Extend:   q_len < seq_len   (e.g., "q4s1k" → 4 tokens, 1024 seq length)
+Examples:
+  q2k              -> [(2048, 2048)]           # Prefill: 2048 tokens
+  q1s1k            -> [(1, 1024)]              # Decode: 1 token, 1K sequence
+  8q1s1k           -> [(1, 1024)] * 8          # 8 decode requests
+  q4s1k            -> [(4, 1024)]              # 4-token extend (spec decode)
+  2q1k_32q1s1k     -> [(1024, 1024)] * 2 + [(1, 1024)] * 32  # Mixed batch
+  16q4s1k          -> [(4, 1024)] * 16         # 16 spec decode requests
+"""
+from collections import Counter
+from dataclasses import dataclass
+import regex as re
+@dataclass
+class BatchRequest:
+    """Represents a single request in a batch."""
+    q_len: int  # Query length (number of new tokens)
+    kv_len: int  # Total KV cache length
+    @property
+    def is_decode(self) -> bool:
+        """True if this is a decode request (q_len == 1)."""
+        return self.q_len == 1
+    @property
+    def is_prefill(self) -> bool:
+        """True if this is a pure prefill (q_len == kv_len)."""
+        return self.q_len == self.kv_len
+    @property
+    def is_extend(self) -> bool:
+        """True if this is context extension (q_len > 1, kv_len > q_len)."""
+        return self.q_len > 1 and self.kv_len > self.q_len
+    @property
+    def context_len(self) -> int:
+        """Context length (KV cache - query)."""
+        return self.kv_len - self.q_len
+    def as_tuple(self) -> tuple[int, int]:
+        """Return as (q_len, kv_len) tuple for compatibility."""
+        return (self.q_len, self.kv_len)
+def _parse_size(size_str: str, k_suffix: str) -> int:
+    """Parse size string with optional 'k' suffix."""
+    size = int(size_str)
+    return size * 1024 if k_suffix == "k" else size
+def parse_batch_spec(spec: str) -> list[BatchRequest]:
+    """
+    Parse batch specification string into list of BatchRequest objects.
+    Grammar: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
+    Args:
+        spec: Batch specification string (see module docstring for grammar)
+    Returns:
+        List of BatchRequest objects
+    Raises:
+        ValueError: If spec format is invalid
+    """
+    requests = []
+    for seg in spec.split("_"):
+        # Unified pattern: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
+        m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
+        if m:
+            cnt = int(m.group(1)) if m.group(1) else 1
+            q_len = _parse_size(m.group(2), m.group(3))
+            kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len
+            requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt)
+            continue
+        raise ValueError(f"Invalid batch spec segment: '{seg}'")
+    return requests
+def format_batch_spec(requests: list[BatchRequest]) -> str:
+    """
+    Format list of BatchRequest into human-readable string.
+    Groups requests by type and provides counts and sizes.
+    Args:
+        requests: List of BatchRequest objects
+    Returns:
+        Formatted string describing the batch
+    """
+    kinds = {
+        "prefill": [],
+        "extend": [],
+        "decode": [],
+    }
+    for req in requests:
+        tup = (req.q_len, req.kv_len)
+        if req.is_prefill:
+            kinds["prefill"].append(tup)
+        elif req.is_extend:
+            kinds["extend"].append(tup)
+        elif req.is_decode:
+            kinds["decode"].append(tup)
+    parts = []
+    for kind in ["prefill", "extend", "decode"]:
+        lst = kinds[kind]
+        if not lst:
+            continue
+        cnt_total = len(lst)
+        ctr = Counter(lst)
+        inner = []
+        for (q, kv), cnt in ctr.items():
+            if kind == "prefill":
+                size = f"{q // 1024}k" if q % 1024 == 0 else str(q)
+                inner.append(f"{cnt}x{size}")
+            elif kind == "decode":
+                size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
+                inner.append(f"{cnt}x{size}")
+            else:  # extend
+                qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q)
+                kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
+                inner.append(f"{cnt}xq{qstr}kv{kstr}")
+        parts.append(f"{cnt_total} {kind} ({', '.join(inner)})")
+    return ", ".join(parts)
+def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]:
+    """
+    Reorder requests for FlashInfer: decode first, then prefill.
+    FlashInfer expects decode requests before prefill requests for
+    optimal performance.
+    Args:
+        requests: Original list of BatchRequest
+    Returns:
+        Reordered list with decode requests first
+    """
+    decodes = [r for r in requests if r.is_decode]
+    non_decodes = [r for r in requests if not r.is_decode]
+    return decodes + non_decodes
+def split_by_type(
+    requests: list[BatchRequest],
+) -> dict[str, list[BatchRequest]]:
+    """
+    Split requests by type for analysis.
+    Args:
+        requests: List of BatchRequest
+    Returns:
+        Dict with keys: 'decode', 'prefill', 'extend'
+    """
+    result = {
+        "decode": [],
+        "prefill": [],
+        "extend": [],
+    }
+    for req in requests:
+        if req.is_decode:
+            result["decode"].append(req)
+        elif req.is_prefill:
+            result["prefill"].append(req)
+        elif req.is_extend:
+            result["extend"].append(req)
+    return result
+def get_batch_stats(requests: list[BatchRequest]) -> dict:
+    """
+    Compute statistics about a batch.
+    Args:
+        requests: List of BatchRequest
+    Returns:
+        Dict with batch statistics
+    """
+    by_type = split_by_type(requests)
+    return {
+        "total_requests": len(requests),
+        "num_decode": len(by_type["decode"]),
+        "num_prefill": len(by_type["prefill"]),
+        "num_extend": len(by_type["extend"]),
+        "total_tokens": sum(r.q_len for r in requests),
+        "total_kv_cache": sum(r.kv_len for r in requests),
+        "max_q_len": max((r.q_len for r in requests), default=0),
+        "max_kv_len": max((r.kv_len for r in requests), default=0),
+        "avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0,
+        "avg_kv_len": (
+            sum(r.kv_len for r in requests) / len(requests) if requests else 0
+        ),
+    }
+def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
+    """
+    Classify a batch spec into a type string.
+    Args:
+        batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
+        spec_decode_threshold: Max q_len to be considered spec-decode vs extend
+    Returns:
+        Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)"
+    """
+    requests = parse_batch_spec(batch_spec)
+    # Classify each request
+    types_present = set()
+    for req in requests:
+        if req.is_decode:
+            types_present.add("decode")
+        elif req.is_prefill:
+            types_present.add("prefill")
+        elif req.is_extend:
+            # Distinguish spec-decode (small q_len) from extend (chunked prefill)
+            if req.q_len <= spec_decode_threshold:
+                types_present.add("spec-decode")
+            else:
+                types_present.add("extend")
+    if len(types_present) == 1:
+        return types_present.pop()
+    elif len(types_present) > 1:
+        # Sort for consistent output
+        sorted_types = sorted(types_present)
+        return f"mixed ({'+'.join(sorted_types)})"
+    else:
+        return "unknown"
--- a/benchmarks/attention_benchmarks/benchmark.py
+++ b/benchmarks/attention_benchmarks/benchmark.py
--- a/benchmarks/attention_benchmarks/common.py
+++ b/benchmarks/attention_benchmarks/common.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Common utilities for attention benchmarking."""
+import csv
+import json
+import math
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+import torch
+from batch_spec import get_batch_type, parse_batch_spec
+from rich.console import Console
+from rich.table import Table
+def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
+    """
+    Extract sorting key from batch spec: (batch_size, max_q_len, max_kv_len).
+    This ensures results are sorted by batch size first, then query length,
+    then sequence length, rather than alphabetically.
+    """
+    try:
+        requests = parse_batch_spec(spec)
+        batch_size = len(requests)
+        max_q_len = max(r.q_len for r in requests) if requests else 0
+        max_kv_len = max(r.kv_len for r in requests) if requests else 0
+        return (batch_size, max_q_len, max_kv_len)
+    except Exception:
+        # Fallback for unparseable specs
+        return (0, 0, 0)
+# Mock classes for vLLM attention infrastructure
+class MockHfConfig:
+    """Mock HuggingFace config that satisfies vLLM's requirements."""
+    def __init__(self, mla_dims: dict, index_topk: int | None = None):
+        self.num_attention_heads = mla_dims["num_q_heads"]
+        self.num_key_value_heads = mla_dims["num_kv_heads"]
+        self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
+        self.model_type = "deepseek_v2"
+        self.is_encoder_decoder = False
+        self.kv_lora_rank = mla_dims["kv_lora_rank"]
+        self.qk_nope_head_dim = mla_dims["qk_nope_head_dim"]
+        self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
+        self.v_head_dim = mla_dims["v_head_dim"]
+        self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
+        if index_topk is not None:
+            self.index_topk = index_topk
+    def get_text_config(self):
+        return self
+# Import AttentionLayerBase at module level to avoid circular dependencies
+try:
+    from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
+except ImportError:
+    AttentionLayerBase = object  # Fallback
+class MockKVBProj:
+    """Mock KV projection layer for MLA prefill mode.
+    Mimics ColumnParallelLinear behavior for kv_b_proj in MLA backends.
+    Projects kv_c_normed to [qk_nope_head_dim + v_head_dim] per head.
+    """
+    def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int):
+        self.num_heads = num_heads
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.v_head_dim = v_head_dim
+        self.out_dim = qk_nope_head_dim + v_head_dim
+    def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
+        """
+        Project kv_c_normed to output space.
+        Args:
+            x: Input tensor [num_tokens, kv_lora_rank]
+        Returns:
+            Tuple containing output tensor
+                [num_tokens, num_heads, qk_nope_head_dim + v_head_dim]
+        """
+        num_tokens = x.shape[0]
+        result = torch.randn(
+            num_tokens,
+            self.num_heads,
+            self.out_dim,
+            device=x.device,
+            dtype=x.dtype,
+        )
+        return (result,)  # Return as tuple to match ColumnParallelLinear API
+class MockIndexer:
+    """Mock Indexer for sparse MLA backends.
+    Provides topk_indices_buffer that sparse MLA backends use to determine
+    which KV cache slots to attend to for each token.
+    """
+    def __init__(
+        self,
+        max_num_tokens: int,
+        topk_tokens: int,
+        device: torch.device,
+    ):
+        self.topk_tokens = topk_tokens
+        self.topk_indices_buffer = torch.zeros(
+            (max_num_tokens, topk_tokens),
+            dtype=torch.int32,
+            device=device,
+        )
+    def fill_random_indices(self, num_tokens: int, max_kv_len: int):
+        """Fill topk_indices_buffer with random valid indices for benchmarking."""
+        indices = torch.randint(
+            0,
+            max_kv_len,
+            (num_tokens, self.topk_tokens),
+            dtype=torch.int32,
+            device=self.topk_indices_buffer.device,
+        )
+        self.topk_indices_buffer[:num_tokens] = indices
+class MockLayer(AttentionLayerBase):
+    """Mock attention layer with scale parameters and impl.
+    Inherits from AttentionLayerBase so it passes isinstance checks
+    in get_layers_from_vllm_config when FlashInfer prefill is enabled.
+    """
+    def __init__(self, device: torch.device, impl=None, kv_cache_spec=None):
+        # Don't call super().__init__() as AttentionLayerBase doesn't have __init__
+        self._k_scale = torch.tensor(1.0, device=device)
+        self._v_scale = torch.tensor(1.0, device=device)
+        self._q_scale = torch.tensor(1.0, device=device)
+        # Scalar floats for kernels that need them
+        self._k_scale_float = float(self._k_scale.item())
+        self._v_scale_float = float(self._v_scale.item())
+        self._q_scale_float = float(self._q_scale.item())
+        # AttentionImpl for metadata builders to query
+        self.impl = impl
+        # KV cache spec for get_kv_cache_spec
+        self._kv_cache_spec = kv_cache_spec
+    def get_attn_backend(self):
+        """Get the attention backend class (required by AttentionLayerBase)."""
+        # Return None as this is just a mock layer for benchmarking
+        return None
+    def get_kv_cache_spec(self):
+        """Get the KV cache spec (required by AttentionLayerBase)."""
+        return self._kv_cache_spec
+@dataclass
+class ParameterSweep:
+    """Configuration for sweeping a backend parameter."""
+    param_name: str  # Name of the backend parameter to sweep
+    values: list[Any]  # List of values to test
+    include_auto: bool = False  # Also test with param unset (auto mode)
+    label_format: str = "{backend}_{param_name}_{value}"  # Result label template
+    def get_label(self, backend: str, value: Any) -> str:
+        """Generate a label for a specific parameter value."""
+        return self.label_format.format(
+            backend=backend, param_name=self.param_name, value=value
+        )
+@dataclass
+class ModelParameterSweep:
+    """Configuration for sweeping a model configuration parameter."""
+    param_name: str  # Name of the model config parameter to sweep (e.g., "num_q_heads")
+    values: list[Any]  # List of values to test
+    label_format: str = "{backend}_{param_name}_{value}"  # Result label template
+    def get_label(self, backend: str, value: Any) -> str:
+        """Generate a label for a specific parameter value."""
+        return self.label_format.format(
+            backend=backend, param_name=self.param_name, value=value
+        )
+@dataclass
+class BenchmarkConfig:
+    """Configuration for a single benchmark run.
+    Fields without a default are required; field order defines the
+    positional constructor signature.
+    """
+    backend: str  # Backend name
+    batch_spec: str  # Batch specification string (batch_spec grammar)
+    num_layers: int
+    head_dim: int
+    num_q_heads: int
+    num_kv_heads: int
+    block_size: int
+    device: str  # Device string, e.g. "cuda:0"
+    dtype: torch.dtype = torch.float16
+    repeats: int = 1
+    warmup_iters: int = 3
+    profile_memory: bool = False
+    use_cuda_graphs: bool = False
+    # MLA-specific (None when not set)
+    kv_lora_rank: int | None = None
+    qk_nope_head_dim: int | None = None
+    qk_rope_head_dim: int | None = None
+    v_head_dim: int | None = None
+    # Backend-specific tuning (None when not set)
+    num_kv_splits: int | None = None  # CUTLASS MLA
+    reorder_batch_threshold: int | None = None  # FlashAttn MLA, FlashMLA
+@dataclass
+class BenchmarkResult:
+    """Results from a single benchmark run."""
+    config: BenchmarkConfig
+    mean_time: float  # seconds
+    std_time: float  # seconds
+    min_time: float  # seconds
+    max_time: float  # seconds
+    throughput_tokens_per_sec: float | None = None
+    memory_allocated_mb: float | None = None
+    memory_reserved_mb: float | None = None
+    error: str | None = None
+    @property
+    def success(self) -> bool:
+        """Whether benchmark completed successfully."""
+        return self.error is None
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for serialization."""
+        return {
+            "config": asdict(self.config),
+            "mean_time": self.mean_time,
+            "std_time": self.std_time,
+            "min_time": self.min_time,
+            "max_time": self.max_time,
+            "throughput_tokens_per_sec": self.throughput_tokens_per_sec,
+            "memory_allocated_mb": self.memory_allocated_mb,
+            "memory_reserved_mb": self.memory_reserved_mb,
+            "error": self.error,
+        }
+class ResultsFormatter:
+    """Format and display benchmark results."""
+    def __init__(self, console: Console | None = None):
+        self.console = console or Console()
+    def print_table(
+        self,
+        results: list[BenchmarkResult],
+        backends: list[str],
+        compare_to_fastest: bool = True,
+    ):
+        """
+        Print results as a rich table.
+        Args:
+            results: List of BenchmarkResult
+            backends: List of backend names being compared
+            compare_to_fastest: Show percentage comparison to fastest
+        """
+        # Group by batch spec, preserving first-occurrence order
+        by_spec = {}
+        specs_order = []
+        for r in results:
+            spec = r.config.batch_spec
+            if spec not in by_spec:
+                by_spec[spec] = {}
+                specs_order.append(spec)
+            by_spec[spec][r.config.backend] = r
+        # Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
+        specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
+        # Create shortened backend names for display
+        def shorten_backend_name(name: str) -> str:
+            """Shorten long backend names for table display."""
+            # Remove common prefixes
+            name = name.replace("flashattn_mla", "famla")
+            name = name.replace("flashinfer_mla", "fimla")
+            name = name.replace("flashmla", "fmla")
+            name = name.replace("cutlass_mla", "cmla")
+            name = name.replace("numsplits", "ns")
+            return name
+        table = Table(title="Attention Benchmark Results")
+        table.add_column("Batch\nSpec", no_wrap=True)
+        table.add_column("Type", no_wrap=True)
+        table.add_column("Batch\nSize", justify="right", no_wrap=True)
+        multi = len(backends) > 1
+        for backend in backends:
+            short_name = shorten_backend_name(backend)
+            # Time column
+            col_time = f"{short_name}\nTime (s)"
+            table.add_column(col_time, justify="right", no_wrap=False)
+            if multi and compare_to_fastest:
+                # Relative performance column
+                col_rel = f"{short_name}\nvs Best"
+                table.add_column(col_rel, justify="right", no_wrap=False)
+        # Add rows
+        for spec in specs_order:
+            spec_results = by_spec[spec]
+            times = {b: r.mean_time for b, r in spec_results.items() if r.success}
+            best_time = min(times.values()) if times else 0.0
+            batch_type = get_batch_type(spec)
+            batch_size = len(parse_batch_spec(spec))
+            row = [spec, batch_type, str(batch_size)]
+            for backend in backends:
+                if backend in spec_results:
+                    r = spec_results[backend]
+                    if r.success:
+                        row.append(f"{r.mean_time:.6f}")
+                        if multi and compare_to_fastest:
+                            pct = (
+                                (r.mean_time / best_time * 100) if best_time > 0 else 0
+                            )
+                            pct_str = f"{pct:.1f}%"
+                            if r.mean_time == best_time:
+                                pct_str = f"[bold green]{pct_str}[/]"
+                            row.append(pct_str)
+                    else:
+                        row.append("[red]ERROR[/]")
+                        if multi and compare_to_fastest:
+                            row.append("-")
+                else:
+                    row.append("-")
+                    if multi and compare_to_fastest:
+                        row.append("-")
+            table.add_row(*row)
+        self.console.print(table)
+    def save_csv(self, results: list[BenchmarkResult], path: str):
+        """Save results to CSV file."""
+        if not results:
+            return
+        path_obj = Path(path)
+        path_obj.parent.mkdir(parents=True, exist_ok=True)
+        with open(path, "w", newline="") as f:
+            writer = csv.DictWriter(
+                f,
+                fieldnames=[
+                    "backend",
+                    "batch_spec",
+                    "num_layers",
+                    "mean_time",
+                    "std_time",
+                    "throughput",
+                    "memory_mb",
+                ],
+            )
+            writer.writeheader()
+            for r in results:
+                writer.writerow(
+                    {
+                        "backend": r.config.backend,
+                        "batch_spec": r.config.batch_spec,
+                        "num_layers": r.config.num_layers,
+                        "mean_time": r.mean_time,
+                        "std_time": r.std_time,
+                        "throughput": r.throughput_tokens_per_sec or 0,
+                        "memory_mb": r.memory_allocated_mb or 0,
+                    }
+                )
+        self.console.print(f"[green]Saved CSV results to {path}[/]")
+    def save_json(self, results: list[BenchmarkResult], path: str):
+        """Save results to JSON file."""
+        path_obj = Path(path)
+        path_obj.parent.mkdir(parents=True, exist_ok=True)
+        data = [r.to_dict() for r in results]
+        with open(path, "w") as f:
+            json.dump(data, f, indent=2, default=str)
+        self.console.print(f"[green]Saved JSON results to {path}[/]")
+def setup_mla_dims(model_name: str = "deepseek-v3") -> dict:
+    """
+    Get MLA dimensions for known models.
+    Args:
+        model_name: Model identifier
+    Returns:
+        Dict with MLA dimension configuration
+    """
+    configs = {
+        "deepseek-v2": {
+            "kv_lora_rank": 512,
+            "qk_nope_head_dim": 128,
+            "qk_rope_head_dim": 64,
+            "v_head_dim": 128,
+            "num_q_heads": 128,
+            "num_kv_heads": 1,
+            "head_dim": 576,
+        },
+        "deepseek-v3": {
+            "kv_lora_rank": 512,
+            "qk_nope_head_dim": 128,
+            "qk_rope_head_dim": 64,
+            "v_head_dim": 128,
+            "num_q_heads": 128,
+            "num_kv_heads": 1,
+            "head_dim": 576,
+        },
+        "deepseek-v2-lite": {
+            "kv_lora_rank": 512,
+            "qk_nope_head_dim": 128,
+            "qk_rope_head_dim": 64,
+            "v_head_dim": 128,
+            "num_q_heads": 16,
+            "num_kv_heads": 1,
+            "head_dim": 576,
+        },
+    }
+    if model_name not in configs:
+        raise ValueError(
+            f"Unknown model '{model_name}'. Known models: {list(configs.keys())}"
+        )
+    return configs[model_name]
+def get_attention_scale(head_dim: int) -> float:
+    """Compute attention scale factor (1/sqrt(d))."""
+    return 1.0 / math.sqrt(head_dim)
+def is_mla_backend(backend: str) -> bool:
+    """
+    Check if backend is an MLA backend using the AttentionBackendEnum.
+    Args:
+        backend: Backend name matching AttentionBackendEnum exactly
+        (e.g., "FLASHMLA_SPARSE")
+    Returns:
+        True if the backend is an MLA backend, False otherwise
+    """
+    from vllm.v1.attention.backends.registry import AttentionBackendEnum
+    try:
+        backend_enum = AttentionBackendEnum[backend]
+        backend_class = backend_enum.get_class()
+        return backend_class.is_mla()
+    except (KeyError, ValueError, ImportError, AttributeError):
+        return False
--- a/benchmarks/attention_benchmarks/configs/mla_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_decode.yaml
+# MLA decode-only benchmark configuration
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128  # Base value, can be swept for TP simulation
+  num_kv_heads: 1  # MLA uses single latent KV
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+  block_size: 128  # CUTLASS MLA and FlashAttn MLA use 128
+# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
+# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
+model_parameter_sweep:
+  param_name: "num_q_heads"
+  values: [128, 64, 32, 16]
+  label_format: "{backend}_{value}h"
+batch_specs:
+  # Small batches, varying sequence lengths
+  - "16q1s512"     # 16 requests, 512 KV cache
+  - "16q1s1k"      # 16 requests, 1k KV cache
+  - "16q1s2k"      # 16 requests, 2k KV cache
+  - "16q1s4k"      # 16 requests, 4k KV cache
+  # Medium batches
+  - "32q1s1k"      # 32 requests, 1k KV cache
+  - "32q1s2k"      # 32 requests, 2k KV cache
+  - "32q1s4k"      # 32 requests, 4k KV cache
+  - "32q1s8k"      # 32 requests, 8k KV cache
+  # Large batches
+  - "64q1s1k"      # 64 requests, 1k KV cache
+  - "64q1s2k"      # 64 requests, 2k KV cache
+  - "64q1s4k"      # 64 requests, 4k KV cache
+  - "64q1s8k"      # 64 requests, 8k KV cache
+  # Very large batches
+  - "128q1s1k"     # 128 requests, 1k KV cache
+  - "128q1s2k"     # 128 requests, 2k KV cache
+  - "128q1s4k"     # 128 requests, 4k KV cache
+  - "128q1s8k"     # 128 requests, 8k KV cache
+  # Long context
+  - "32q1s16k"     # 32 requests, 16k KV cache
+  - "32q1s32k"     # 32 requests, 32k KV cache
+backends:
+  - CUTLASS_MLA
+  - FLASHINFER_MLA
+  - FLASH_ATTN_MLA  # Hopper only
+  - FLASHMLA        # Hopper only
+device: "cuda:0"
+repeats: 100
+warmup_iters: 10
+profile_memory: true
+# Backend-specific tuning
+CUTLASS_MLA:
+  num_kv_splits: auto  # or specific value like 4, 8, 16
+FLASH_ATTN_MLA:
+  reorder_batch_threshold: 512
+FLASHMLA:
+  reorder_batch_threshold: 1
--- a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
+# MLA mixed batch benchmark (prefill + decode)
+# Tests chunked prefill performance
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128
+  num_kv_heads: 1
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+  block_size: 128
+batch_specs:
+  # Small prefill + decode
+  - "1q1k_8q1s1k"           # 1 prefill + 8 decode
+  - "2q2k_16q1s1k"          # 2 prefill + 16 decode
+  - "4q1k_32q1s2k"          # 4 prefill + 32 decode
+  # Medium prefill + decode
+  - "2q4k_32q1s2k"          # 2 medium prefill + 32 decode
+  - "4q4k_64q1s2k"          # 4 medium prefill + 64 decode
+  - "8q2k_64q1s4k"          # 8 prefill + 64 decode
+  # Large prefill + decode (chunked prefill stress test)
+  - "2q8k_32q1s1k"          # 2 large prefill + 32 decode
+  - "1q16k_16q1s2k"         # 1 very large prefill + 16 decode
+  - "2q16k_32q1s4k"         # 2 very large prefill + 32 decode
+  # Context extension + decode
+  # (the grammar separates q_len and seq_len with "s"; "kv" is rejected by parse_batch_spec)
+  - "2q1ks2k_16q1s1k"        # 2 extend (1k new tokens, 2k seq) + 16 decode
+  - "4q2ks4k_32q1s2k"        # 4 extend (2k new tokens, 4k seq) + 32 decode
+  - "2q1ks8k_32q1s2k"        # 2 large extend (1k new tokens, 8k seq) + 32 decode
+  # Large prefills (the spec grammar has no explicit chunking hint)
+  - "q8k"           # 1 prefill of 8k tokens
+  - "q16k"          # 1 prefill of 16k tokens
+  - "2q8k_32q1s2k"    # 2 prefill of 8k tokens + 32 decode
+  # High decode ratio (realistic serving)
+  - "1q2k_63q1s1k"          # 1 prefill + 63 decode
+  - "2q2k_62q1s2k"          # 2 prefill + 62 decode
+  - "4q4k_60q1s4k"          # 4 prefill + 60 decode
+backends:
+  - CUTLASS_MLA
+  - FLASHINFER_MLA
+  - FLASH_ATTN_MLA   # Hopper only
+  - FLASHMLA         # Hopper only
+device: "cuda:0"
+repeats: 5
+warmup_iters: 3
+profile_memory: true
+# Analyze chunked prefill workspace size impact
+chunked_prefill:
+  test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]
--- a/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml
+# MLA prefill-only benchmark configuration for sparse backends
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128
+  num_kv_heads: 1
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+  block_size: 128
+# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
+# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
+model_parameter_sweep:
+  param_name: "num_q_heads"
+  values: [128, 64, 32, 16]
+  label_format: "{backend}_{value}h"
+batch_specs:
+  # Pure prefill
+  - "1q512"
+  - "1q1k"
+  - "1q2k"
+  - "1q4k"
+  - "1q8k"
+  # Batched pure prefill
+  - "2q512"
+  - "2q1k"
+  - "2q2k"
+  - "2q4k"
+  - "2q8k"
+  - "4q512"
+  - "4q1k"
+  - "4q2k"
+  - "4q4k"
+  - "4q8k"
+  - "8q512"
+  - "8q1k"
+  - "8q2k"
+  - "8q4k"
+  - "8q8k"
+  # Extend
+  - "1q512s4k"
+  - "1q512s8k"
+  - "1q1ks8k"
+  - "1q2ks8k"
+  - "1q2ks16k"
+  - "1q4ks16k"
+backends:
+  - FLASHMLA_SPARSE
+  - FLASHINFER_MLA_SPARSE
+device: "cuda:0"
+repeats: 10
+warmup_iters: 3
+profile_memory: true
--- a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml
+++ b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml
+# Study 4: What is optimal reorder_batch_threshold for MLA backends supporting query length > 1?
+# Question: At what query length does prefill pipeline become faster than decode pipeline?
+# Methodology: For each query length, compare decode vs prefill performance to find crossover point
+# Applies to: FlashAttn MLA, FlashMLA
+description: "Decode vs Prefill pipeline crossover analysis"
+# Test FlashAttn MLA
+backend: FLASH_ATTN_MLA
+# Mode: decode_vs_prefill comparison (special sweep mode)
+# For each batch spec, we'll test both decode and prefill pipelines
+mode: "decode_vs_prefill"
+# Query lengths to test (from old benchmark_mla_threshold.py methodology)
+# Each query length will be tested with BOTH decode and prefill pipelines:
+#   - decode: threshold >= query_length (forces decode pipeline)
+#   - prefill: threshold < query_length (forces prefill pipeline)
+#
+# We use q<N>s1k format which creates q_len=N, seq_len=1024 requests
+# This tests different query lengths with fixed sequence length context
+#
+# Using batch_spec_ranges for automatic generation:
+batch_spec_ranges:
+  - template: "q{q_len}s1k"
+    q_len:
+      start: 1
+      stop: 16
+      step: 1
+      end_inclusive: false
+  - template: "q{q_len}s1k"
+    q_len:
+      start: 16
+      stop: 64
+      step: 2
+      end_inclusive: false
+  - template: "q{q_len}s1k"
+    q_len:
+      start: 64
+      stop: 1024
+      step: 4
+      end_inclusive: true
+# Batch sizes to test (from old script)
+batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
+# Model configuration (DeepSeek V2/V3 defaults)
+model:
+  num_layers: 10
+  head_dim: 576
+  num_q_heads: 128
+  num_kv_heads: 1
+  block_size: 128
+# Benchmark settings
+device: "cuda:0"
+repeats: 15          # More repeats for spec decode variance
+warmup_iters: 5
+profile_memory: false
+# Output
+output:
+  csv: "reorder_threshold_results.csv"
+  json: "reorder_threshold_results.json"
+# Expected outcome (reproduces old benchmark_mla_threshold.py study):
+# - For each batch size, find the crossover point where prefill becomes faster than decode
+# - Show decode vs prefill performance across all query lengths
+# - Determine optimal reorder_batch_threshold based on last query length where decode is faster
+# - Understand how crossover point varies with batch size
+# - Provide data-driven guidance for default threshold value
+#
+# Methodology (from old script):
+# - Each query length tested with BOTH pipelines:
+#     * decode: threshold >= query_length (forces decode pipeline)
+#     * prefill: threshold < query_length (forces prefill pipeline)
+# - Compare which is faster to find crossover point
+#
--- a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml
+# Speculative decoding benchmark configuration
+# Tests reorder_batch_threshold optimization
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128
+  num_kv_heads: 1
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+batch_specs:
+  # Pure speculative decode (K-token verification)
+  - "q2s1k"      # 2-token spec, 1k KV
+  - "q4s1k"      # 4-token spec, 1k KV
+  - "q8s1k"      # 8-token spec, 1k KV
+  - "q16s1k"     # 16-token spec, 1k KV
+  # Speculative with different context lengths
+  - "q4s2k"      # 4-token spec, 2k KV
+  - "q4s4k"      # 4-token spec, 4k KV
+  - "q8s2k"      # 8-token spec, 2k KV
+  - "q8s4k"      # 8-token spec, 4k KV
+  # Mixed: speculative + regular decode
+  - "32q4s1k"                    # 32 spec requests
+  - "16q4s1k_16q1s1k"              # 16 spec + 16 regular
+  - "8q8s2k_24q1s2k"               # 8 spec (8-tok) + 24 regular
+  # Mixed: speculative + prefill + decode
+  - "2q1k_16q4s1k_16q1s1k"         # 2 prefill + 16 spec + 16 decode
+  - "4q2k_32q4s2k_32q1s2k"         # 4 prefill + 32 spec + 32 decode
+  # Large batches with speculation
+  - "64q4s1k"                    # 64 spec requests
+  - "32q8s2k"                    # 32 spec (8-token)
+  - "16q16s4k"                   # 16 spec (16-token)
+# Backends that support query length > 1
+backends:
+  - FLASH_ATTN_MLA    # reorder_batch_threshold = 512
+  - FLASHMLA          # reorder_batch_threshold = 1 (tunable)
+# FlashInfer-MLA also supports uniform spec-as-decode, but with a different mechanism
+# - FLASHINFER_MLA
+# Benchmark settings
+device: "cuda:0"
+repeats: 10  # More repeats for statistical significance
+warmup_iters: 5
+profile_memory: false
+# Test these threshold values for optimization
+parameter_sweep:
+  param_name: "reorder_batch_threshold"
+  values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
+  include_auto: false
+  label_format: "{backend}_threshold_{value}"
--- a/benchmarks/attention_benchmarks/configs/standard_attention.yaml
+++ b/benchmarks/attention_benchmarks/configs/standard_attention.yaml
+# Standard attention backend benchmark configuration
+model:
+  num_layers: 32
+  num_q_heads: 32
+  num_kv_heads: 8  # GQA with 4:1 ratio
+  head_dim: 128
+  block_size: 16
+batch_specs:
+  # Pure prefill
+  - "q512"      # Small prefill (512 tokens)
+  - "q2k"       # Medium prefill (2048 tokens)
+  - "q4k"       # Large prefill (4096 tokens)
+  - "q8k"       # Very large prefill (8192 tokens)
+  # Pure decode
+  - "8q1s1k"      # 8 requests, 1k KV cache each
+  - "16q1s2k"     # 16 requests, 2k KV cache each
+  - "32q1s1k"     # 32 requests, 1k KV cache each
+  - "64q1s4k"     # 64 requests, 4k KV cache each
+  # Mixed prefill/decode
+  - "2q2k_8q1s1k"      # 2 prefill + 8 decode
+  - "4q1k_16q1s2k"     # 4 prefill + 16 decode
+  - "2q4k_32q1s1k"     # 2 large prefill + 32 decode
+  # Speculative decode (q <= 8)
+  - "16q2s1k"         # 16 requests, 2 spec tokens, 1k KV cache
+  - "16q4s1k"         # 16 requests, 4 spec tokens, 1k KV cache
+  - "16q8s1k"         # 16 requests, 8 spec tokens, 1k KV cache
+  - "32q4s2k"         # 32 requests, 4 spec tokens, 2k KV cache
+  - "8q8s4k"          # 8 requests, 8 spec tokens, 4k KV cache
+  # Context extension (chunked prefill)
+  - "q1ks2k"          # 1k query, 2k sequence
+  - "2q1ks4k"         # 2 requests: 1k query, 4k sequence
+# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
+backends:
+  - FLASH_ATTN
+  - TRITON_ATTN
+  - FLASHINFER
+device: "cuda:0"
+repeats: 5
+warmup_iters: 3
+profile_memory: false
--- a/benchmarks/attention_benchmarks/mla_runner.py
+++ b/benchmarks/attention_benchmarks/mla_runner.py
--- a/benchmarks/attention_benchmarks/runner.py
+++ b/benchmarks/attention_benchmarks/runner.py
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
+# Automated vLLM Server Parameter Tuning
+This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate.
+## Table of Contents
+- [Prerequisites](#prerequisites)
+- [Configuration](#configuration)
+- [How to Run](#how-to-run)
+- [Example Use Cases](#example-use-cases)
+- [Output](#output)
+- [How It Works](#how-it-works)
+## Prerequisites
+Before running the script, please ensure the following steps are completed:
+1. **Clone vLLM & Set Up Branch**: Clone the vLLM repository and check out your desired branch.
+```bash
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+# git checkout <your-branch>
+```
+2. **Install Environment**: Install or update the correct running environment. For TPU usage, activate your `conda` environment and install the corresponding `torch` and `torch_xla` versions.
+3. **Model Configuration**: If you are using a customized model, ensure its configuration files are correctly placed and accessible.
+## Configuration
+You must set the following variables at the top of the script before execution.
+   Note: You can also override the default values below via environment variables when running the script.
+```bash
+MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
+```
+| Variable | Description | Example Value |
+| --- | --- | --- |
+| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
+| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` |
+| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (Saving profiles may not be supported on other systems.) | `"TPU"` |
+| `TP` | **Required.** The tensor-parallelism size. | `1` |
+| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
+| `INPUT_LEN` | **Required.** Request input length. | `4000` |
+| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
+| `MAX_MODEL_LEN` | **Required.** Max model length. | `4096` |
+| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
+| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
+| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
+| `NUM_BATCHED_TOKENS_LIST` | A space-separated string of `max-num-batched-tokens` values to test. | `"1024 2048 4096"` |
+**Note**: The default `NUM_SEQS_LIST` and `NUM_BATCHED_TOKENS_LIST` are set for medium-sized inputs/outputs. For very short contexts (e.g., 20 input, 20 output tokens), you may need to test larger values for `max-num-seqs`.
+## How to Run
+1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section.
+2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost.
+```bash
+cd <FOLDER_OF_THIS_SCRIPT>
+bash auto_tune.sh
+```
+    Please note that the command used to launch the script (e.g., `bash auto_tune.sh`) must not contain a full or partial path that includes the keyword `vllm`; otherwise, the `pkill -f vllm` command will also kill the script itself.
+## Example Use Cases
+Here are a few examples of how to configure the script for different goals:
+### 1. Maximize Throughput (No Latency Constraint)
+- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens.
+- **Configuration**:
+```bash
+INPUT_LEN=1800
+OUTPUT_LEN=20
+MAX_MODEL_LEN=2048
+MIN_CACHE_HIT_PCT=0
+MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
+```
+### 2. Maximize Throughput with a Latency Requirement
+- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
+- **Configuration**:
+```bash
+INPUT_LEN=1800
+OUTPUT_LEN=20
+MAX_MODEL_LEN=2048
+MIN_CACHE_HIT_PCT=0
+MAX_LATENCY_ALLOWED_MS=500
+```
+### 3. Maximize Throughput with Prefix Caching and Latency Requirements
+- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
+- **Configuration**:
+```bash
+INPUT_LEN=1800
+OUTPUT_LEN=20
+MAX_MODEL_LEN=2048
+MIN_CACHE_HIT_PCT=60
+MAX_LATENCY_ALLOWED_MS=500
+```
+## Output
+After the script finishes, you will find the results in a new, timestamped directory created inside `$BASE/auto-benchmark/`.
+- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
+    - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
+    - `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
+- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
+```text
+# Example result.txt content
+hash:a1b2c3d4...
+max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8
+max_num_seqs: 128, max_num_batched_tokens: 4096 does not meet latency requirement 500
+...
+best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, profile saved in: /home/user/vllm/auto-benchmark/2024_08_01_10_30/profile
+```
+  If it cannot find the best parameters, the final row will be `best_max_num_seqs: 0, best_num_batched_tokens: 0, best_throughput: 0`. This can be due to either the server not starting properly, or the latency requirement being too strict.
+- **Profiler Trace**: A directory named `profile` is created inside the log directory. It contains the profiler trace file (e.g., `.xplane.pb` for TPU or a `.json` trace for GPU) from the single best-performing run.
+## How It Works
+The script follows a systematic process to find the optimal parameters:
+1. **Find Max GPU Memory Utilization**: The script first determines the highest safe `gpu-memory-utilization` (starting from 0.98 and decreasing) that does not cause an Out-Of-Memory (OOM) error when launching the server. This ensures the benchmark runs use the maximum available memory without crashing.
+2. **Iterate and Benchmark**: It then enters a nested loop, iterating through every combination of `max-num-seqs` and `max-num-batched-tokens` provided in the configuration lists.
+3. **Latency-Aware Throughput Search**: For each parameter combination:
+    - The vLLM server is started.
+    - A benchmark is first run with an infinite request rate (`--request-rate inf`).
+    - If the resulting P99 E2E latency is within the `MAX_LATENCY_ALLOWED_MS` limit, this throughput is considered the maximum for this configuration.
+    - If the latency is too high, the script performs a search by iteratively decreasing the request rate until the latency constraint is met. This finds the highest sustainable throughput for the given parameters and latency requirement.
+4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
+5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
+## Batched `auto_tune`
+The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
+### Prerequisites
+- **jq**: This script requires `jq` to parse the JSON configuration file.
+- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
+### How to Run
+1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
+2. **Execute the script**:
+    ```bash
+    bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
+    ```
+    - `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
+    - `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
+### Configuration File
+The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
+Here is an example `runs_config.json` with two benchmark configurations. The `system` field may be `TPU` or `GPU`; JSON does not allow comments, so keep the file comment-free:
+```json
+[
+  {
+    "base": "/home/user",
+    "model": "meta-llama/Llama-3.1-8B-Instruct",
+    "system": "TPU",
+    "tp": 8,
+    "input_len": 128,
+    "output_len": 2048,
+    "max_model_len": 2300,
+    "num_seqs_list": "128 256",
+    "num_batched_tokens_list": "8192 16384"
+  },
+  {
+    "base": "/home/user",
+    "model": "meta-llama/Llama-3.1-70B-Instruct",
+    "system": "TPU",
+    "tp": 8,
+    "input_len": 4000,
+    "output_len": 16,
+    "max_model_len": 4096,
+    "num_seqs_list": "64 128",
+    "num_batched_tokens_list": "4096 8192",
+    "max_latency_allowed_ms": 500
+  }
+]
+```
+### Output
+The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
+- `run_id`: A unique identifier for the run, derived from the timestamp.
+- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
+- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
+- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
+A summary of successful and failed runs is also printed to the console upon completion.
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
--- a/benchmarks/auto_tune/batch_auto_tune.sh
+++ b/benchmarks/auto_tune/batch_auto_tune.sh
+#!/bin/bash
+#
+# Run several auto_tune.sh experiments sequentially from one JSON config file.
+#
+# Usage: batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
+#
+# The input file must contain a JSON array of objects. For each object, every
+# key is upper-cased and passed to auto_tune.sh as an environment variable.
+# After each run the object is extended in place with run_id, status, results
+# and gcs_results, and the input file is rewritten with the progress so far.
+INPUT_JSON="$1"
+GCS_PATH="$2" # Optional GCS path for uploading results for each run
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
+
+# --- Argument and dependency checks ---------------------------------------
+if [[ -z "$INPUT_JSON" ]]; then
+  echo "Error: Input JSON file not provided."
+  echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
+  exit 1
+fi
+if [[ ! -f "$INPUT_JSON" ]]; then
+  echo "Error: File not found at '$INPUT_JSON'"
+  exit 1
+fi
+if ! command -v jq &> /dev/null; then
+    echo "Error: 'jq' command not found. Please install jq to process the JSON input."
+    exit 1
+fi
+if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
+    echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
+    exit 1
+fi
+
+SUCCESS_COUNT=0
+FAILURE_COUNT=0
+FAILED_RUNS=()
+SCRIPT_START_TIME=$(date +%s)
+json_content=$(cat "$INPUT_JSON")
+# 'arrays' emits nothing for non-array input, so with '-e' jq exits non-zero
+# instead of reporting the key count of an object or the length of a string.
+if ! num_runs=$(echo "$json_content" | jq -e 'arrays | length'); then
+  echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
+  exit 1
+fi
+echo "Found $num_runs benchmark configurations in $INPUT_JSON."
+echo "Starting benchmark runs..."
+echo "--------------------------------------------------"
+# Arithmetic loop rather than 'seq': 'seq 0 -1' counts downwards on BSD/macOS,
+# which would run bogus iterations for an empty array.
+for ((i = 0; i < num_runs; i++)); do
+  run_object=$(echo "$json_content" | jq ".[$i]")
+  RUN_START_TIME=$(date +%s)
+  ENV_VARS_ARRAY=()
+  # Dynamically create env vars from the JSON object's keys.
+  # Keys are read line by line (no word splitting) and passed to jq via --arg
+  # so that keys which are not plain identifiers cannot break the jq program.
+  while IFS= read -r key; do
+    value=$(echo "$run_object" | jq -r --arg k "$key" '.[$k]')
+    var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
+    if [[ -z "$var_name" ]]; then
+      # An empty name would make 'env' fail on the '=value' argument.
+      echo "Warning: skipping key '$key' (no valid environment variable name)." >&2
+      continue
+    fi
+    ENV_VARS_ARRAY+=("${var_name}=${value}")
+  done < <(echo "$run_object" | jq -r 'keys_unsorted[]')
+  echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
+  # Execute auto_tune.sh and capture output.
+  # A real pipeline is used instead of '> >(tee ...)': bash waits for every
+  # pipeline member, so the capture file is complete before it is read below.
+  RUN_OUTPUT_FILE=$(mktemp)
+  env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" 2>&1 | tee -a "$RUN_OUTPUT_FILE"
+  # PIPESTATUS must be read immediately; index 0 is auto_tune.sh, not tee.
+  run_status=${PIPESTATUS[0]}
+  if [[ $run_status -eq 0 ]]; then
+    STATUS="SUCCESS"
+    ((SUCCESS_COUNT++))
+  else
+    STATUS="FAILURE"
+    ((FAILURE_COUNT++))
+    FAILED_RUNS+=("Run #$((i+1)): $(echo "$run_object" | jq -c .)")
+  fi
+  RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
+  rm -f "$RUN_OUTPUT_FILE"
+  # Parse results and optionally upload them to GCS
+  RUN_ID=""
+  RESULTS=""
+  GCS_RESULTS_URL=""
+  if [[ "$STATUS" == "SUCCESS" ]]; then
+    # Take the last RESULT_FILE= line; '-f2-' keeps paths that contain '='.
+    RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2- | tr -s '/' || true)
+    if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
+      # The run id is the name of the directory that holds the result file.
+      RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
+      RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
+      RESULTS=$(cat "$RESULT_FILE_PATH")
+      if [[ -n "$GCS_PATH" ]]; then
+        GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
+        echo "Uploading results to GCS..."
+        # An upload failure is reported but does not change the run status.
+        if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
+          echo "GCS upload successful."
+        else
+          echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
+        fi
+      fi
+    else
+      echo "Warning: Could not find result file for a successful run."
+      STATUS="WARNING_NO_RESULT_FILE"
+    fi
+  fi
+  # Add the results back into the JSON object for this run
+  json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
+    '.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
+  RUN_END_TIME=$(date +%s)
+  echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
+  echo "--------------------------------------------------"
+  # Save intermediate progress back to the file (write to a temp file, then
+  # rename, so an interrupted write never leaves a truncated input file).
+  echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
+done
+SCRIPT_END_TIME=$(date +%s)
+echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
+echo
+echo "====================== SUMMARY ======================"
+echo "Successful runs: $SUCCESS_COUNT"
+echo "Failed runs:     $FAILURE_COUNT"
+echo "==================================================="
+if [[ $FAILURE_COUNT -gt 0 ]]; then
+  echo "Details of failed runs (see JSON file for full parameters):"
+  for failed in "${FAILED_RUNS[@]}"; do
+    echo "  - $failed"
+  done
+fi
+echo "Updated results have been saved to '$INPUT_JSON'."
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
--- a/benchmarks/benchmark_batch_invariance.py
+++ b/benchmarks/benchmark_batch_invariance.py
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
--- a/benchmarks/benchmark_hash.py
+++ b/benchmarks/benchmark_hash.py
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import sys
+if __name__ == "__main__":
+    print("""DEPRECATED: This script has been moved to the vLLM CLI.
+Please use the following command instead:
+    vllm bench latency
+For help with the new command, run:
+    vllm bench latency --help
+Alternatively, you can run the new command directly with:
+    python -m vllm.entrypoints.cli.main bench latency --help
+""")
+    sys.exit(1)