Commit fbeb8a6f authored by raojy

raw_vllm

parent 2ca8867f
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vLLM Attention Benchmarking Suite."""
from .batch_spec import (
BatchRequest,
format_batch_spec,
get_batch_stats,
parse_batch_spec,
reorder_for_flashinfer,
split_by_type,
)
from .common import (
BenchmarkConfig,
BenchmarkResult,
MockLayer,
ResultsFormatter,
get_attention_scale,
is_mla_backend,
setup_mla_dims,
)
__all__ = [
# Batch specification
"BatchRequest",
"parse_batch_spec",
"format_batch_spec",
"reorder_for_flashinfer",
"split_by_type",
"get_batch_stats",
# Benchmarking infrastructure
"BenchmarkConfig",
"BenchmarkResult",
"ResultsFormatter",
# Mock objects
"MockLayer",
# Utilities
"setup_mla_dims",
"get_attention_scale",
"is_mla_backend",
]
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Simplified batch specification grammar for attention benchmarks.
Grammar (underscore-separated segments):
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
- count: Number of identical requests (optional, default=1)
- q_len: Query length (number of new tokens)
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
- 'k' suffix: Multiplies value by 1024
Common patterns:
- Prefill: q_len == seq_len (e.g., "q2k" → 2048 new tokens, 2048 seq)
- Decode: q_len == 1 (e.g., "q1s1k" → 1 token, 1024 seq length)
- Extend: q_len < seq_len (e.g., "q4s1k" → 4 tokens, 1024 seq length)
Examples:
q2k -> [(2048, 2048)] # Prefill: 2048 tokens
q1s1k -> [(1, 1024)] # Decode: 1 token, 1K sequence
8q1s1k -> [(1, 1024)] * 8 # 8 decode requests
q4s1k -> [(4, 1024)] # 4-token extend (spec decode)
2q1k_32q1s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch
16q4s1k -> [(4, 1024)] * 16 # 16 spec decode requests
"""
from collections import Counter
from dataclasses import dataclass
import regex as re
@dataclass
class BatchRequest:
"""Represents a single request in a batch."""
q_len: int # Query length (number of new tokens)
kv_len: int # Total KV cache length
@property
def is_decode(self) -> bool:
"""True if this is a decode request (q_len == 1)."""
return self.q_len == 1
@property
def is_prefill(self) -> bool:
"""True if this is a pure prefill (q_len == kv_len)."""
return self.q_len == self.kv_len
@property
def is_extend(self) -> bool:
"""True if this is context extension (q_len > 1, kv_len > q_len)."""
return self.q_len > 1 and self.kv_len > self.q_len
@property
def context_len(self) -> int:
"""Context length (KV cache - query)."""
return self.kv_len - self.q_len
def as_tuple(self) -> tuple[int, int]:
"""Return as (q_len, kv_len) tuple for compatibility."""
return (self.q_len, self.kv_len)
def _parse_size(size_str: str, k_suffix: str) -> int:
"""Parse size string with optional 'k' suffix."""
size = int(size_str)
return size * 1024 if k_suffix == "k" else size
def parse_batch_spec(spec: str) -> list[BatchRequest]:
"""
Parse batch specification string into list of BatchRequest objects.
Grammar: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
Args:
spec: Batch specification string (see module docstring for grammar)
Returns:
List of BatchRequest objects
Raises:
ValueError: If spec format is invalid
"""
requests = []
for seg in spec.split("_"):
# Unified pattern: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
if m:
cnt = int(m.group(1)) if m.group(1) else 1
q_len = _parse_size(m.group(2), m.group(3))
kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len
requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt)
continue
raise ValueError(f"Invalid batch spec segment: '{seg}'")
return requests
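# Illustrative usage (sketch, not executed anywhere in this module): each
# underscore-separated segment expands into `count` identical requests, e.g.
#   [r.as_tuple() for r in parse_batch_spec("q2k_2q1s1k")]
#   -> [(2048, 2048), (1, 1024), (1, 1024)]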
def format_batch_spec(requests: list[BatchRequest]) -> str:
"""
Format list of BatchRequest into human-readable string.
Groups requests by type and provides counts and sizes.
Args:
requests: List of BatchRequest objects
Returns:
Formatted string describing the batch
"""
kinds = {
"prefill": [],
"extend": [],
"decode": [],
}
for req in requests:
tup = (req.q_len, req.kv_len)
if req.is_prefill:
kinds["prefill"].append(tup)
elif req.is_extend:
kinds["extend"].append(tup)
elif req.is_decode:
kinds["decode"].append(tup)
parts = []
for kind in ["prefill", "extend", "decode"]:
lst = kinds[kind]
if not lst:
continue
cnt_total = len(lst)
ctr = Counter(lst)
inner = []
for (q, kv), cnt in ctr.items():
if kind == "prefill":
size = f"{q // 1024}k" if q % 1024 == 0 else str(q)
inner.append(f"{cnt}x{size}")
elif kind == "decode":
size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
inner.append(f"{cnt}x{size}")
else: # extend
qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q)
kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
inner.append(f"{cnt}xq{qstr}kv{kstr}")
parts.append(f"{cnt_total} {kind} ({', '.join(inner)})")
return ", ".join(parts)
def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]:
"""
Reorder requests for FlashInfer: decode first, then prefill.
FlashInfer expects decode requests before prefill requests for
optimal performance.
Args:
requests: Original list of BatchRequest
Returns:
Reordered list with decode requests first
"""
decodes = [r for r in requests if r.is_decode]
non_decodes = [r for r in requests if not r.is_decode]
return decodes + non_decodes
def split_by_type(
requests: list[BatchRequest],
) -> dict[str, list[BatchRequest]]:
"""
Split requests by type for analysis.
Args:
requests: List of BatchRequest
Returns:
Dict with keys: 'decode', 'prefill', 'extend'
"""
result = {
"decode": [],
"prefill": [],
"extend": [],
}
for req in requests:
if req.is_decode:
result["decode"].append(req)
elif req.is_prefill:
result["prefill"].append(req)
elif req.is_extend:
result["extend"].append(req)
return result
def get_batch_stats(requests: list[BatchRequest]) -> dict:
"""
Compute statistics about a batch.
Args:
requests: List of BatchRequest
Returns:
Dict with batch statistics
"""
by_type = split_by_type(requests)
return {
"total_requests": len(requests),
"num_decode": len(by_type["decode"]),
"num_prefill": len(by_type["prefill"]),
"num_extend": len(by_type["extend"]),
"total_tokens": sum(r.q_len for r in requests),
"total_kv_cache": sum(r.kv_len for r in requests),
"max_q_len": max((r.q_len for r in requests), default=0),
"max_kv_len": max((r.kv_len for r in requests), default=0),
"avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0,
"avg_kv_len": (
sum(r.kv_len for r in requests) / len(requests) if requests else 0
),
}
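# Illustrative stats (sketch) for a small mixed batch:
#   get_batch_stats(parse_batch_spec("q2k_2q1s1k"))
#   -> {"total_requests": 3, "num_prefill": 1, "num_decode": 2, "num_extend": 0,
#       "total_tokens": 2050, "total_kv_cache": 4096,
#       "max_q_len": 2048, "max_kv_len": 2048, ...}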
def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
"""
Classify a batch spec into a type string.
Args:
batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
spec_decode_threshold: Max q_len to be considered spec-decode vs extend
Returns:
Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)"
"""
requests = parse_batch_spec(batch_spec)
# Classify each request
types_present = set()
for req in requests:
if req.is_decode:
types_present.add("decode")
elif req.is_prefill:
types_present.add("prefill")
elif req.is_extend:
# Distinguish spec-decode (small q_len) from extend (chunked prefill)
if req.q_len <= spec_decode_threshold:
types_present.add("spec-decode")
else:
types_present.add("extend")
if len(types_present) == 1:
return types_present.pop()
elif len(types_present) > 1:
# Sort for consistent output
sorted_types = sorted(types_present)
return f"mixed ({'+'.join(sorted_types)})"
else:
return "unknown"
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Universal vLLM Attention Benchmark
Benchmark any attention backend with the extended grammar.
Supports standard attention (Flash/Triton/FlashInfer) and MLA backends.
Examples:
# Standard attention
python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k"
# MLA backends
python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k"
# Parameter sweep (CLI)
python benchmark.py --backend cutlass_mla \
--batch-specs "64q1s1k" \
--sweep-param num_kv_splits \
--sweep-values 1 4 8 16
# Parameter sweep (YAML config - recommended)
python benchmark.py --config configs/cutlass_numsplits.yaml
"""
import argparse
import sys
from dataclasses import replace
from pathlib import Path
import yaml
from rich.console import Console
from tqdm import tqdm
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from batch_spec import parse_batch_spec
from common import (
BenchmarkConfig,
BenchmarkResult,
ModelParameterSweep,
ParameterSweep,
ResultsFormatter,
batch_spec_sort_key,
is_mla_backend,
)
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
from runner import run_attention_benchmark
return run_attention_benchmark(config)
def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""Run MLA benchmark with appropriate backend."""
from mla_runner import run_mla_benchmark as run_mla
return run_mla(config.backend, config, **kwargs)
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""
Run a single benchmark with proper backend selection.
Args:
config: BenchmarkConfig with backend, batch_spec, and model params
**kwargs: Additional arguments passed to MLA benchmarks
Returns:
BenchmarkResult (may have error field set on failure)
"""
try:
if is_mla_backend(config.backend):
return run_mla_benchmark(config, **kwargs)
else:
return run_standard_attention_benchmark(config)
except Exception as e:
return BenchmarkResult(
config=config,
mean_time=float("inf"),
std_time=0,
min_time=float("inf"),
max_time=float("inf"),
error=str(e),
)
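# Usage sketch (illustrative; assumes a CUDA device and a working vLLM install,
# field values are arbitrary examples):
#   config = BenchmarkConfig(
#       backend="FLASH_ATTN", batch_spec="q2k", num_layers=10, head_dim=128,
#       num_q_heads=32, num_kv_heads=8, block_size=16, device="cuda:0",
#   )
#   result = run_benchmark(config)
#   if not result.success:
#       print(result.error)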
def run_model_parameter_sweep(
backends: list[str],
batch_specs: list[str],
base_config_args: dict,
sweep: ModelParameterSweep,
console: Console,
) -> list[BenchmarkResult]:
"""
Run model parameter sweep for given backends and batch specs.
Args:
backends: List of backend names
batch_specs: List of batch specifications
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
sweep: ModelParameterSweep configuration
console: Rich console for output
Returns:
List of BenchmarkResult objects
"""
all_results = []
console.print(
f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]"
)
total = len(backends) * len(batch_specs) * len(sweep.values)
with tqdm(total=total, desc="Benchmarking") as pbar:
for backend in backends:
for spec in batch_specs:
for value in sweep.values:
# Create config with modified model parameter
config_args = base_config_args.copy()
config_args[sweep.param_name] = value
# Create config with original backend for running
clean_config = BenchmarkConfig(
backend=backend, batch_spec=spec, **config_args
)
# Run benchmark
result = run_benchmark(clean_config)
# Replace backend with labeled version for display
backend_label = sweep.get_label(backend, value)
labeled_config = replace(result.config, backend=backend_label)
result = replace(result, config=labeled_config)
all_results.append(result)
if not result.success:
console.print(
f"[red]Error {backend} {spec} {sweep.param_name}="
f"{value}: {result.error}[/]"
)
pbar.update(1)
# Display sweep results - create separate table for each parameter value
console.print("\n[bold green]Model Parameter Sweep Results:[/]")
formatter = ResultsFormatter(console)
# Group results by parameter value and extract backend mapping
by_param_value = {}
backend_mapping = {} # Maps labeled backend -> original backend
for r in all_results:
# Extract original backend and param value from labeled backend
# The label format is: {backend}_{param_name}_{value}
# We need to reverse engineer this
labeled_backend = r.config.backend
# Try each backend to find which one this result belongs to
for backend in backends:
for value in sweep.values:
expected_label = sweep.get_label(backend, value)
if labeled_backend == expected_label:
backend_mapping[labeled_backend] = backend
param_value = str(value)
if param_value not in by_param_value:
by_param_value[param_value] = []
by_param_value[param_value].append(r)
break
# Create a table for each parameter value
sorted_param_values = sorted(
by_param_value.keys(), key=lambda x: int(x) if x.isdigit() else x
)
for param_value in sorted_param_values:
console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]")
param_results = by_param_value[param_value]
# Create modified results with original backend names
modified_results = []
for r in param_results:
# Get the original backend name from our mapping
original_backend = backend_mapping[r.config.backend]
modified_config = replace(r.config, backend=original_backend)
modified_result = replace(r, config=modified_config)
modified_results.append(modified_result)
# Print table with original backend names
formatter.print_table(modified_results, backends, compare_to_fastest=True)
# Show optimal backend for each (param_value, batch_spec) combination
console.print(
f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]"
)
# Group by (param_value, batch_spec)
by_param_and_spec = {}
for r in all_results:
if r.success:
# Find which (backend, value) this result corresponds to
labeled_backend = r.config.backend
for backend in backends:
for value in sweep.values:
expected_label = sweep.get_label(backend, value)
if labeled_backend == expected_label:
param_value = str(value)
spec = r.config.batch_spec
key = (param_value, spec)
if key not in by_param_and_spec:
by_param_and_spec[key] = []
by_param_and_spec[key].append(r)
break
# Sort by param value then spec (batch_size, q_len, kv_len)
sorted_keys = sorted(
by_param_and_spec.keys(),
key=lambda x: (
int(x[0]) if x[0].isdigit() else x[0],
batch_spec_sort_key(x[1]),
),
)
current_param_value = None
for param_value, spec in sorted_keys:
# Print header when param value changes
if param_value != current_param_value:
console.print(f"\n [bold]{sweep.param_name}={param_value}:[/]")
current_param_value = param_value
results = by_param_and_spec[(param_value, spec)]
best = min(results, key=lambda r: r.mean_time)
# Extract original backend name using the mapping
backend_name = backend_mapping[best.config.backend]
# Show all backends' times for comparison
times_str = " | ".join(
[
f"{backend_mapping[r.config.backend]}: {r.mean_time:.6f}s"
for r in sorted(results, key=lambda r: r.mean_time)
]
)
console.print(
f" {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})"
)
return all_results
def run_parameter_sweep(
backends: list[str],
batch_specs: list[str],
base_config_args: dict,
sweep: ParameterSweep,
console: Console,
) -> list[BenchmarkResult]:
"""
Run parameter sweep for given backends and batch specs.
Args:
backends: List of backend names
batch_specs: List of batch specifications
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
sweep: ParameterSweep configuration
console: Rich console for output
Returns:
List of BenchmarkResult objects
"""
all_results = []
# Build list of values to sweep (including auto if requested)
sweep_values = list(sweep.values)
if sweep.include_auto:
sweep_values.append("auto")
console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]")
total = len(backends) * len(batch_specs) * len(sweep_values)
with tqdm(total=total, desc="Benchmarking") as pbar:
for backend in backends:
for spec in batch_specs:
for value in sweep_values:
# Create config with original backend for running
config = BenchmarkConfig(
backend=backend, batch_spec=spec, **base_config_args
)
# Prepare kwargs for benchmark runner
kwargs = {}
if value != "auto":
kwargs[sweep.param_name] = value
# Run benchmark
result = run_benchmark(config, **kwargs)
# Replace backend with labeled version for display
backend_label = sweep.get_label(backend, value)
labeled_config = replace(result.config, backend=backend_label)
result = replace(result, config=labeled_config)
all_results.append(result)
if not result.success:
console.print(
f"[red]Error {backend} {spec} {sweep.param_name}="
f"{value}: {result.error}[/]"
)
pbar.update(1)
# Display sweep results
console.print("\n[bold green]Sweep Results:[/]")
backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values]
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backend_labels)
# Show optimal values
console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]")
by_spec = {}
for r in all_results:
if r.success:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = []
by_spec[spec].append(r)
for spec in sorted(by_spec.keys(), key=batch_spec_sort_key):
results = by_spec[spec]
best = min(results, key=lambda r: r.mean_time)
console.print(
f" {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)"
)
return all_results
def load_config_from_yaml(config_path: str) -> dict:
"""Load configuration from YAML file."""
with open(config_path) as f:
return yaml.safe_load(f)
def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
"""
Generate batch specs from range specifications.
Args:
ranges: List of range specifications, each containing:
- template: Batch spec template (e.g., "q{q_len}kv1k")
- q_len: Dict with start, stop, step, end_inclusive (optional)
- Other parameters can also be ranges
Returns:
List of generated batch spec strings
Example:
ranges = [
{
"template": "q{q_len}kv1k",
"q_len": {
"start": 1,
"stop": 16,
"step": 1,
"end_inclusive": true # Optional, defaults to true
}
}
]
Returns: ["q1kv1k", "q2kv1k", ..., "q16kv1k"]
"""
all_specs = []
for range_spec in ranges:
template = range_spec.get("template")
if not template:
raise ValueError("Range specification must include 'template'")
# Extract all range parameters from the spec
range_params = {}
for key, value in range_spec.items():
if key == "template":
continue
if isinstance(value, dict) and "start" in value:
# This is a range specification
start = value["start"]
stop = value["stop"]
step = value.get("step", 1)
# Check if end should be inclusive (default: True)
end_inclusive = value.get("end_inclusive", True)
# Adjust stop based on end_inclusive
if end_inclusive:
range_params[key] = list(range(start, stop + 1, step))
else:
range_params[key] = list(range(start, stop, step))
else:
# This is a fixed value
range_params[key] = [value]
# Generate all combinations (Cartesian product)
if range_params:
import itertools
param_names = list(range_params.keys())
param_values = [range_params[name] for name in param_names]
for values in itertools.product(*param_values):
params = dict(zip(param_names, values))
spec = template.format(**params)
all_specs.append(spec)
else:
# No parameters, just use template as-is
all_specs.append(template)
return all_specs
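# Illustrative expansion (sketch): a single inclusive q_len range
#   generate_batch_specs_from_ranges(
#       [{"template": "q{q_len}s1k", "q_len": {"start": 1, "stop": 4}}]
#   )
#   -> ["q1s1k", "q2s1k", "q3s1k", "q4s1k"]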
def main():
parser = argparse.ArgumentParser(
description="Universal vLLM attention benchmark",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
# Config file
parser.add_argument(
"--config",
help="Path to YAML config file (overrides other args)",
)
# Backend selection
parser.add_argument(
"--backends",
nargs="+",
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
"flashinfer_mla, flashattn_mla, flashmla)",
)
parser.add_argument(
"--backend",
help="Single backend (alternative to --backends)",
)
# Batch specifications
parser.add_argument(
"--batch-specs",
nargs="+",
default=["q2k", "8q1s1k"],
help="Batch specifications using extended grammar",
)
# Model config
parser.add_argument("--num-layers", type=int, default=10, help="Number of layers")
parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads")
parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads")
parser.add_argument("--block-size", type=int, default=16, help="Block size")
# Benchmark settings
parser.add_argument("--device", default="cuda:0", help="Device")
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
# Parameter sweep (use YAML config for advanced sweeps)
parser.add_argument(
"--sweep-param",
help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)",
)
parser.add_argument(
"--sweep-values",
type=int,
nargs="+",
help="Values to sweep for the parameter",
)
# Output
parser.add_argument("--output-csv", help="Save to CSV")
parser.add_argument("--output-json", help="Save to JSON")
args = parser.parse_args()
console = Console()
console.print("[bold cyan]vLLM Attention Benchmark[/]")
# Load config from YAML if provided
if args.config:
console.print(f"[yellow]Loading config from: {args.config}[/]")
yaml_config = load_config_from_yaml(args.config)
# Show description if available
if "description" in yaml_config:
console.print(f"[dim]{yaml_config['description']}[/]")
# Override args with YAML values, but CLI args take precedence
# Check if CLI provided backends (they would be non-None and not default)
cli_backends_provided = args.backends is not None or args.backend is not None
# Backend(s) - only use YAML if CLI didn't specify
if not cli_backends_provided:
if "backend" in yaml_config:
args.backend = yaml_config["backend"]
args.backends = None
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
# Check for special modes
if "mode" in yaml_config:
args.mode = yaml_config["mode"]
else:
args.mode = None
# Batch specs and sizes
# Support both explicit batch_specs and generated batch_spec_ranges
if "batch_spec_ranges" in yaml_config:
# Generate batch specs from ranges
generated_specs = generate_batch_specs_from_ranges(
yaml_config["batch_spec_ranges"]
)
# Combine with any explicit batch_specs
if "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"] + generated_specs
else:
args.batch_specs = generated_specs
console.print(
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
)
elif "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"]
if "batch_sizes" in yaml_config:
args.batch_sizes = yaml_config["batch_sizes"]
else:
args.batch_sizes = None
# Model config
if "model" in yaml_config:
model = yaml_config["model"]
args.num_layers = model.get("num_layers", args.num_layers)
args.head_dim = model.get("head_dim", args.head_dim)
args.num_q_heads = model.get("num_q_heads", args.num_q_heads)
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
args.block_size = model.get("block_size", args.block_size)
# Benchmark settings (top-level keys)
if "device" in yaml_config:
args.device = yaml_config["device"]
if "repeats" in yaml_config:
args.repeats = yaml_config["repeats"]
if "warmup_iters" in yaml_config:
args.warmup_iters = yaml_config["warmup_iters"]
if "profile_memory" in yaml_config:
args.profile_memory = yaml_config["profile_memory"]
# Parameter sweep configuration
if "parameter_sweep" in yaml_config:
sweep_config = yaml_config["parameter_sweep"]
args.parameter_sweep = ParameterSweep(
param_name=sweep_config["param_name"],
values=sweep_config["values"],
include_auto=sweep_config.get("include_auto", False),
label_format=sweep_config.get(
"label_format", "{backend}_{param_name}_{value}"
),
)
else:
args.parameter_sweep = None
# Model parameter sweep configuration
if "model_parameter_sweep" in yaml_config:
sweep_config = yaml_config["model_parameter_sweep"]
args.model_parameter_sweep = ModelParameterSweep(
param_name=sweep_config["param_name"],
values=sweep_config["values"],
label_format=sweep_config.get(
"label_format", "{backend}_{param_name}_{value}"
),
)
else:
args.model_parameter_sweep = None
# Output
if "output" in yaml_config:
output = yaml_config["output"]
if "csv" in output and not args.output_csv:
args.output_csv = output["csv"]
if "json" in output and not args.output_json:
args.output_json = output["json"]
console.print()
# Handle CLI-based parameter sweep (if not from YAML)
if (
(not hasattr(args, "parameter_sweep") or args.parameter_sweep is None)
and args.sweep_param
and args.sweep_values
):
args.parameter_sweep = ParameterSweep(
param_name=args.sweep_param,
values=args.sweep_values,
include_auto=False,
label_format="{backend}_{param_name}_{value}",
)
# Determine backends
backends = args.backends or ([args.backend] if args.backend else ["flash"])
console.print(f"Backends: {', '.join(backends)}")
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
console.print()
# Run benchmarks
all_results = []
# Handle special mode: decode_vs_prefill comparison
if hasattr(args, "mode") and args.mode == "decode_vs_prefill":
console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]")
console.print(
"[dim]For each query length, testing both decode and prefill pipelines[/]"
)
console.print("[dim]Using batched execution for optimal performance[/]")
# Extract batch sizes from config
batch_sizes = getattr(args, "batch_sizes", None) or [1]
backend = backends[0] # Use first backend (should only be one)
# Calculate total benchmarks
total = len(batch_sizes)
with tqdm(total=total, desc="Benchmarking") as pbar:
for batch_size in batch_sizes:
# Prepare all configs for this batch size
configs_with_thresholds = []
for spec in args.batch_specs:
# Parse the batch spec to get query length
requests = parse_batch_spec(spec)
if not requests:
console.print(
f"[red]Error: Could not parse batch spec '{spec}'[/]"
)
continue
# Get query length from first request
query_length = requests[0].q_len
# Create batch spec for this batch size
# For batch_size > 1, we need to prepend the count
batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec
# Create base config (without backend name)
base_config = BenchmarkConfig(
backend=backend, # Will be overridden later
batch_spec=batch_spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
# Add decode pipeline config
decode_threshold = query_length
config_decode = replace(
base_config,
backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}",
)
configs_with_thresholds.append((config_decode, decode_threshold))
# Add prefill pipeline config if query_length > 1
if query_length > 1:
prefill_threshold = query_length - 1
config_prefill = replace(
base_config,
backend=f"{backend}_prefill_qlen{query_length}"
f"_bs{batch_size}",
)
configs_with_thresholds.append(
(config_prefill, prefill_threshold)
)
# Run all benchmarks for this batch size in one go (batched mode)
try:
from mla_runner import run_mla_benchmark as run_mla
# Use batched API: pass list of (config, threshold) tuples
timing_results = run_mla(backend, configs_with_thresholds)
# Create BenchmarkResult objects from timing results
for (config, _), timing in zip(
configs_with_thresholds, timing_results
):
result = BenchmarkResult(
config=config,
mean_time=timing["mean"],
std_time=timing["std"],
min_time=timing["min"],
max_time=timing["max"],
throughput_tokens_per_sec=timing.get("throughput", None),
)
all_results.append(result)
except Exception as e:
import traceback
console.print(
f"[red]Error running batched benchmarks for "
f"batch_size={batch_size}: {e}[/]"
)
console.print("[red]Traceback:[/]")
traceback.print_exc()
# Add error results for all configs
for config, _ in configs_with_thresholds:
result = BenchmarkResult(
config=config,
mean_time=float("inf"),
std_time=0,
min_time=float("inf"),
max_time=float("inf"),
error=str(e),
)
all_results.append(result)
pbar.update(1)
# Display decode vs prefill results
console.print("\n[bold green]Decode vs Prefill Results:[/]")
# Group by batch size
by_batch_size = {}
for r in all_results:
if r.success:
# Extract batch size from backend name
parts = r.config.backend.split("_")
bs_part = [p for p in parts if p.startswith("bs")]
if bs_part:
bs = int(bs_part[0][2:])
if bs not in by_batch_size:
by_batch_size[bs] = []
by_batch_size[bs].append(r)
# For each batch size, analyze crossover point
for bs in sorted(by_batch_size.keys()):
console.print(f"\n[bold cyan]Batch size: {bs}[/]")
results = by_batch_size[bs]
# Group by query length
by_qlen = {}
for r in results:
parts = r.config.backend.split("_")
qlen_part = [p for p in parts if p.startswith("qlen")]
if qlen_part:
qlen = int(qlen_part[0][4:])
if qlen not in by_qlen:
by_qlen[qlen] = {}
pipeline = "decode" if "decode" in r.config.backend else "prefill"
by_qlen[qlen][pipeline] = r
# Find crossover point
last_decode_faster = None
for qlen in sorted(by_qlen.keys()):
pipelines = by_qlen[qlen]
if "decode" in pipelines and "prefill" in pipelines:
decode_time = pipelines["decode"].mean_time
prefill_time = pipelines["prefill"].mean_time
faster = "decode" if decode_time < prefill_time else "prefill"
speedup = (
prefill_time / decode_time
if decode_time < prefill_time
else decode_time / prefill_time
)
console.print(
f" qlen={qlen:3d}: decode={decode_time:.6f}s, "
f"prefill={prefill_time:.6f}s -> "
f"[bold]{faster}[/] ({speedup:.2f}x)"
)
if faster == "decode":
last_decode_faster = qlen
if last_decode_faster is not None:
optimal_threshold = last_decode_faster
console.print(
f"\n [bold green]Optimal threshold for batch_size={bs}: "
f"{optimal_threshold}[/]"
)
console.print(
f" [dim](Use decode pipeline for query_length <= "
f"{optimal_threshold})[/]"
)
else:
console.print(
f"\n [yellow]Prefill always faster for batch_size={bs}[/]"
)
# Handle model parameter sweep mode
elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep:
# Model parameter sweep
base_config_args = {
"num_layers": args.num_layers,
"head_dim": args.head_dim,
"num_q_heads": args.num_q_heads,
"num_kv_heads": args.num_kv_heads,
"block_size": args.block_size,
"device": args.device,
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
}
all_results = run_model_parameter_sweep(
backends,
args.batch_specs,
base_config_args,
args.model_parameter_sweep,
console,
)
# Handle parameter sweep mode (unified)
elif hasattr(args, "parameter_sweep") and args.parameter_sweep:
# Unified parameter sweep
base_config_args = {
"num_layers": args.num_layers,
"head_dim": args.head_dim,
"num_q_heads": args.num_q_heads,
"num_kv_heads": args.num_kv_heads,
"block_size": args.block_size,
"device": args.device,
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
}
all_results = run_parameter_sweep(
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
)
else:
# Normal mode: compare backends
total = len(backends) * len(args.batch_specs)
with tqdm(total=total, desc="Benchmarking") as pbar:
for spec in args.batch_specs:
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
result = run_benchmark(config)
all_results.append(result)
if not result.success:
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
pbar.update(1)
# Display results
console.print("\n[bold green]Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backends)
# Save results
if all_results:
formatter = ResultsFormatter(console)
if args.output_csv:
formatter.save_csv(all_results, args.output_csv)
if args.output_json:
formatter.save_json(all_results, args.output_json)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Common utilities for attention benchmarking."""
import csv
import json
import math
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
import torch
from batch_spec import get_batch_type, parse_batch_spec
from rich.console import Console
from rich.table import Table
def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
"""
Extract sorting key from batch spec: (batch_size, max_q_len, max_kv_len).
This ensures results are sorted by batch size first, then query length,
then sequence length, rather than alphabetically.
"""
try:
requests = parse_batch_spec(spec)
batch_size = len(requests)
max_q_len = max(r.q_len for r in requests) if requests else 0
max_kv_len = max(r.kv_len for r in requests) if requests else 0
return (batch_size, max_q_len, max_kv_len)
except Exception:
# Fallback for unparseable specs
return (0, 0, 0)
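# Illustrative keys (sketch): sorting specs with this key orders them by batch
# size first, then query length, then KV length:
#   batch_spec_sort_key("q2k")     -> (1, 2048, 2048)
#   batch_spec_sort_key("8q1s1k")  -> (8, 1, 1024)
#   batch_spec_sort_key("64q1s4k") -> (64, 1, 4096)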
# Mock classes for vLLM attention infrastructure
class MockHfConfig:
"""Mock HuggingFace config that satisfies vLLM's requirements."""
def __init__(self, mla_dims: dict, index_topk: int | None = None):
self.num_attention_heads = mla_dims["num_q_heads"]
self.num_key_value_heads = mla_dims["num_kv_heads"]
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
self.model_type = "deepseek_v2"
self.is_encoder_decoder = False
self.kv_lora_rank = mla_dims["kv_lora_rank"]
self.qk_nope_head_dim = mla_dims["qk_nope_head_dim"]
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
self.v_head_dim = mla_dims["v_head_dim"]
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
if index_topk is not None:
self.index_topk = index_topk
def get_text_config(self):
return self
# Import AttentionLayerBase at module level to avoid circular dependencies
try:
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
except ImportError:
AttentionLayerBase = object # Fallback
class MockKVBProj:
"""Mock KV projection layer for MLA prefill mode.
Mimics ColumnParallelLinear behavior for kv_b_proj in MLA backends.
Projects kv_c_normed to [qk_nope_head_dim + v_head_dim] per head.
"""
def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int):
self.num_heads = num_heads
self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim
self.out_dim = qk_nope_head_dim + v_head_dim
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
"""
Project kv_c_normed to output space.
Args:
x: Input tensor [num_tokens, kv_lora_rank]
Returns:
Tuple containing output tensor
[num_tokens, num_heads, qk_nope_head_dim + v_head_dim]
"""
num_tokens = x.shape[0]
result = torch.randn(
num_tokens,
self.num_heads,
self.out_dim,
device=x.device,
dtype=x.dtype,
)
return (result,) # Return as tuple to match ColumnParallelLinear API
class MockIndexer:
"""Mock Indexer for sparse MLA backends.
Provides topk_indices_buffer that sparse MLA backends use to determine
which KV cache slots to attend to for each token.
"""
def __init__(
self,
max_num_tokens: int,
topk_tokens: int,
device: torch.device,
):
self.topk_tokens = topk_tokens
self.topk_indices_buffer = torch.zeros(
(max_num_tokens, topk_tokens),
dtype=torch.int32,
device=device,
)
def fill_random_indices(self, num_tokens: int, max_kv_len: int):
"""Fill topk_indices_buffer with random valid indices for benchmarking."""
indices = torch.randint(
0,
max_kv_len,
(num_tokens, self.topk_tokens),
dtype=torch.int32,
device=self.topk_indices_buffer.device,
)
self.topk_indices_buffer[:num_tokens] = indices
class MockLayer(AttentionLayerBase):
"""Mock attention layer with scale parameters and impl.
Inherits from AttentionLayerBase so it passes isinstance checks
in get_layers_from_vllm_config when FlashInfer prefill is enabled.
"""
def __init__(self, device: torch.device, impl=None, kv_cache_spec=None):
# Don't call super().__init__() as AttentionLayerBase doesn't have __init__
self._k_scale = torch.tensor(1.0, device=device)
self._v_scale = torch.tensor(1.0, device=device)
self._q_scale = torch.tensor(1.0, device=device)
# Scalar floats for kernels that need them
self._k_scale_float = float(self._k_scale.item())
self._v_scale_float = float(self._v_scale.item())
self._q_scale_float = float(self._q_scale.item())
# AttentionImpl for metadata builders to query
self.impl = impl
# KV cache spec for get_kv_cache_spec
self._kv_cache_spec = kv_cache_spec
def get_attn_backend(self):
"""Get the attention backend class (required by AttentionLayerBase)."""
# Return None as this is just a mock layer for benchmarking
return None
def get_kv_cache_spec(self):
"""Get the KV cache spec (required by AttentionLayerBase)."""
return self._kv_cache_spec
@dataclass
class ParameterSweep:
"""Configuration for sweeping a backend parameter."""
param_name: str # Name of the backend parameter to sweep
values: list[Any] # List of values to test
include_auto: bool = False # Also test with param unset (auto mode)
label_format: str = "{backend}_{param_name}_{value}" # Result label template
def get_label(self, backend: str, value: Any) -> str:
"""Generate a label for a specific parameter value."""
return self.label_format.format(
backend=backend, param_name=self.param_name, value=value
)
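# Illustrative label (sketch) with the default label_format:
#   ParameterSweep(param_name="num_kv_splits", values=[1, 4, 8]).get_label(
#       "cutlass_mla", 4
#   )
#   -> "cutlass_mla_num_kv_splits_4"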
@dataclass
class ModelParameterSweep:
"""Configuration for sweeping a model configuration parameter."""
param_name: str # Name of the model config parameter to sweep (e.g., "num_q_heads")
values: list[Any] # List of values to test
label_format: str = "{backend}_{param_name}_{value}" # Result label template
def get_label(self, backend: str, value: Any) -> str:
"""Generate a label for a specific parameter value."""
return self.label_format.format(
backend=backend, param_name=self.param_name, value=value
)
@dataclass
class BenchmarkConfig:
"""Configuration for a single benchmark run."""
backend: str
batch_spec: str
num_layers: int
head_dim: int
num_q_heads: int
num_kv_heads: int
block_size: int
device: str
dtype: torch.dtype = torch.float16
repeats: int = 1
warmup_iters: int = 3
profile_memory: bool = False
use_cuda_graphs: bool = False
# MLA-specific
kv_lora_rank: int | None = None
qk_nope_head_dim: int | None = None
qk_rope_head_dim: int | None = None
v_head_dim: int | None = None
# Backend-specific tuning
num_kv_splits: int | None = None # CUTLASS MLA
reorder_batch_threshold: int | None = None # FlashAttn MLA, FlashMLA
@dataclass
class BenchmarkResult:
"""Results from a single benchmark run."""
config: BenchmarkConfig
mean_time: float # seconds
std_time: float # seconds
min_time: float # seconds
max_time: float # seconds
throughput_tokens_per_sec: float | None = None
memory_allocated_mb: float | None = None
memory_reserved_mb: float | None = None
error: str | None = None
@property
def success(self) -> bool:
"""Whether benchmark completed successfully."""
return self.error is None
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for serialization."""
return {
"config": asdict(self.config),
"mean_time": self.mean_time,
"std_time": self.std_time,
"min_time": self.min_time,
"max_time": self.max_time,
"throughput_tokens_per_sec": self.throughput_tokens_per_sec,
"memory_allocated_mb": self.memory_allocated_mb,
"memory_reserved_mb": self.memory_reserved_mb,
"error": self.error,
}
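# Illustrative result handling (sketch; timings are made-up numbers, `cfg` is
# any BenchmarkConfig):
#   result = BenchmarkResult(config=cfg, mean_time=1.2e-3, std_time=1e-5,
#                            min_time=1.1e-3, max_time=1.4e-3)
#   result.success                 -> True (error is None)
#   result.to_dict()["mean_time"]  -> 0.0012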
class ResultsFormatter:
"""Format and display benchmark results."""
def __init__(self, console: Console | None = None):
self.console = console or Console()
def print_table(
self,
results: list[BenchmarkResult],
backends: list[str],
compare_to_fastest: bool = True,
):
"""
Print results as a rich table.
Args:
results: List of BenchmarkResult
backends: List of backend names being compared
compare_to_fastest: Show percentage comparison to fastest
"""
# Group results by batch spec
by_spec = {}
specs_order = []
for r in results:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = {}
specs_order.append(spec)
by_spec[spec][r.config.backend] = r
# Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
# Create shortened backend names for display
def shorten_backend_name(name: str) -> str:
"""Shorten long backend names for table display."""
# Remove common prefixes
name = name.replace("flashattn_mla", "famla")
name = name.replace("flashinfer_mla", "fimla")
name = name.replace("flashmla", "fmla")
name = name.replace("cutlass_mla", "cmla")
name = name.replace("numsplits", "ns")
return name
table = Table(title="Attention Benchmark Results")
table.add_column("Batch\nSpec", no_wrap=True)
table.add_column("Type", no_wrap=True)
table.add_column("Batch\nSize", justify="right", no_wrap=True)
multi = len(backends) > 1
for backend in backends:
short_name = shorten_backend_name(backend)
# Time column
col_time = f"{short_name}\nTime (s)"
table.add_column(col_time, justify="right", no_wrap=False)
if multi and compare_to_fastest:
# Relative performance column
col_rel = f"{short_name}\nvs Best"
table.add_column(col_rel, justify="right", no_wrap=False)
# Add rows
for spec in specs_order:
spec_results = by_spec[spec]
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
best_time = min(times.values()) if times else 0.0
batch_type = get_batch_type(spec)
batch_size = len(parse_batch_spec(spec))
row = [spec, batch_type, str(batch_size)]
for backend in backends:
if backend in spec_results:
r = spec_results[backend]
if r.success:
row.append(f"{r.mean_time:.6f}")
if multi and compare_to_fastest:
pct = (
(r.mean_time / best_time * 100) if best_time > 0 else 0
)
pct_str = f"{pct:.1f}%"
if r.mean_time == best_time:
pct_str = f"[bold green]{pct_str}[/]"
row.append(pct_str)
else:
row.append("[red]ERROR[/]")
if multi and compare_to_fastest:
row.append("-")
else:
row.append("-")
if multi and compare_to_fastest:
row.append("-")
table.add_row(*row)
self.console.print(table)
def save_csv(self, results: list[BenchmarkResult], path: str):
"""Save results to CSV file."""
if not results:
return
path_obj = Path(path)
path_obj.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w", newline="") as f:
writer = csv.DictWriter(
f,
fieldnames=[
"backend",
"batch_spec",
"num_layers",
"mean_time",
"std_time",
"throughput",
"memory_mb",
],
)
writer.writeheader()
for r in results:
writer.writerow(
{
"backend": r.config.backend,
"batch_spec": r.config.batch_spec,
"num_layers": r.config.num_layers,
"mean_time": r.mean_time,
"std_time": r.std_time,
"throughput": r.throughput_tokens_per_sec or 0,
"memory_mb": r.memory_allocated_mb or 0,
}
)
self.console.print(f"[green]Saved CSV results to {path}[/]")
def save_json(self, results: list[BenchmarkResult], path: str):
"""Save results to JSON file."""
path_obj = Path(path)
path_obj.parent.mkdir(parents=True, exist_ok=True)
data = [r.to_dict() for r in results]
with open(path, "w") as f:
json.dump(data, f, indent=2, default=str)
self.console.print(f"[green]Saved JSON results to {path}[/]")
def setup_mla_dims(model_name: str = "deepseek-v3") -> dict:
"""
Get MLA dimensions for known models.
Args:
model_name: Model identifier
Returns:
Dict with MLA dimension configuration
"""
configs = {
"deepseek-v2": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 128,
"num_kv_heads": 1,
"head_dim": 576,
},
"deepseek-v3": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 128,
"num_kv_heads": 1,
"head_dim": 576,
},
"deepseek-v2-lite": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 16,
"num_kv_heads": 1,
"head_dim": 576,
},
}
if model_name not in configs:
raise ValueError(
f"Unknown model '{model_name}'. Known models: {list(configs.keys())}"
)
return configs[model_name]
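# Illustrative lookup (sketch): DeepSeek-V3 MLA dimensions
#   dims = setup_mla_dims("deepseek-v3")
#   (dims["kv_lora_rank"], dims["qk_rope_head_dim"], dims["num_q_heads"])
#   -> (512, 64, 128)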
def get_attention_scale(head_dim: int) -> float:
"""Compute attention scale factor (1/sqrt(d))."""
return 1.0 / math.sqrt(head_dim)
def is_mla_backend(backend: str) -> bool:
"""
Check if backend is an MLA backend using the AttentionBackendEnum.
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASHMLA_SPARSE")
Returns:
True if the backend is an MLA backend, False otherwise
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
return backend_class.is_mla()
except (KeyError, ValueError, ImportError, AttributeError):
return False
# MLA decode-only benchmark configuration
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128 # Base value, can be swept for TP simulation
num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache
- "16q1s1k" # 16 requests, 1k KV cache
- "16q1s2k" # 16 requests, 2k KV cache
- "16q1s4k" # 16 requests, 4k KV cache
# Medium batches
- "32q1s1k" # 32 requests, 1k KV cache
- "32q1s2k" # 32 requests, 2k KV cache
- "32q1s4k" # 32 requests, 4k KV cache
- "32q1s8k" # 32 requests, 8k KV cache
# Large batches
- "64q1s1k" # 64 requests, 1k KV cache
- "64q1s2k" # 64 requests, 2k KV cache
- "64q1s4k" # 64 requests, 4k KV cache
- "64q1s8k" # 64 requests, 8k KV cache
# Very large batches
- "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache
- "128q1s4k" # 128 requests, 4k KV cache
- "128q1s8k" # 128 requests, 8k KV cache
# Long context
- "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache
backends:
- CUTLASS_MLA
- FLASHINFER_MLA
- FLASH_ATTN_MLA # Hopper only
- FLASHMLA # Hopper only
device: "cuda:0"
repeats: 100
warmup_iters: 10
profile_memory: true
# Backend-specific tuning
CUTLASS_MLA:
num_kv_splits: auto # or specific value like 4, 8, 16
FLASH_ATTN_MLA:
reorder_batch_threshold: 512
FLASHMLA:
reorder_batch_threshold: 1
# MLA mixed batch benchmark (prefill + decode)
# Tests chunked prefill performance
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
batch_specs:
# Small prefill + decode
- "1q1k_8q1s1k" # 1 prefill + 8 decode
- "2q2k_16q1s1k" # 2 prefill + 16 decode
- "4q1k_32q1s2k" # 4 prefill + 32 decode
# Medium prefill + decode
- "2q4k_32q1s2k" # 2 medium prefill + 32 decode
- "4q4k_64q1s2k" # 4 medium prefill + 64 decode
- "8q2k_64q1s4k" # 8 prefill + 64 decode
# Large prefill + decode (chunked prefill stress test)
- "2q8k_32q1s1k" # 2 large prefill + 32 decode
- "1q16k_16q1s2k" # 1 very large prefill + 16 decode
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
# Context extension + decode
- "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
- "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
- "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
# Explicitly chunked prefill
- "q8k" # 8k prefill with chunking hint
- "q16k" # 16k prefill with chunking hint
- "2q8k_32q1s2k" # 2 chunked prefill + 32 decode
# High decode ratio (realistic serving)
- "1q2k_63q1s1k" # 1 prefill + 63 decode
- "2q2k_62q1s2k" # 2 prefill + 62 decode
- "4q4k_60q1s4k" # 4 prefill + 60 decode
backends:
- CUTLASS_MLA
- FLASHINFER_MLA
- FLASH_ATTN_MLA # Hopper only
- FLASHMLA # Hopper only
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: true
# Analyze chunked prefill workspace size impact
chunked_prefill:
test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]
# MLA prefill-only benchmark configuration for sparse backends
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Pure prefill
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
# Batched pure prefill
- "2q512"
- "2q1k"
- "2q2k"
- "2q4k"
- "2q8k"
- "4q512"
- "4q1k"
- "4q2k"
- "4q4k"
- "4q8k"
- "8q512"
- "8q1k"
- "8q2k"
- "8q4k"
- "8q8k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true
# Study 4: What is the optimal reorder_batch_threshold for MLA backends that support query length > 1?
# Question: At what query length does prefill pipeline become faster than decode pipeline?
# Methodology: For each query length, compare decode vs prefill performance to find crossover point
# Applies to: FlashAttn MLA, FlashMLA
description: "Decode vs Prefill pipeline crossover analysis"
# Test FlashAttn MLA
backend: FLASH_ATTN_MLA
# Mode: decode_vs_prefill comparison (special sweep mode)
# For each batch spec, we'll test both decode and prefill pipelines
mode: "decode_vs_prefill"
# Query lengths to test (from old benchmark_mla_threshold.py methodology)
# Each query length will be tested with BOTH decode and prefill pipelines:
# - decode: threshold >= query_length (forces decode pipeline)
# - prefill: threshold < query_length (forces prefill pipeline)
#
# We use q<N>s1k format which creates q_len=N, seq_len=1024 requests
# This tests different query lengths with fixed sequence length context
#
# Using batch_spec_ranges for automatic generation:
batch_spec_ranges:
- template: "q{q_len}s1k"
q_len:
start: 1
stop: 16
step: 1
end_inclusive: false
- template: "q{q_len}s1k"
q_len:
start: 16
stop: 64
step: 2
end_inclusive: false
- template: "q{q_len}s1k"
q_len:
start: 64
stop: 1024
step: 4
end_inclusive: true
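# The ranges above expand (end-exclusive except the last range) to:
#   q1s1k..q15s1k (step 1), q16s1k..q62s1k (step 2), q64s1k..q1024s1k (step 4)
# for 280 generated specs in total.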
# Batch sizes to test (from old script)
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
# Model configuration (DeepSeek V2/V3 defaults)
model:
num_layers: 10
head_dim: 576
num_q_heads: 128
num_kv_heads: 1
block_size: 128
# Benchmark settings
device: "cuda:0"
repeats: 15 # More repeats for spec decode variance
warmup_iters: 5
profile_memory: false
# Output
output:
csv: "reorder_threshold_results.csv"
json: "reorder_threshold_results.json"
# Expected outcome (reproduces old benchmark_mla_threshold.py study):
# - For each batch size, find the crossover point where prefill becomes faster than decode
# - Show decode vs prefill performance across all query lengths
# - Determine optimal reorder_batch_threshold based on last query length where decode is faster
# - Understand how crossover point varies with batch size
# - Provide data-driven guidance for default threshold value
#
# Methodology (from old script):
# - Each query length tested with BOTH pipelines:
# * decode: threshold >= query_length (forces decode pipeline)
# * prefill: threshold < query_length (forces prefill pipeline)
# - Compare which is faster to find crossover point
#
# Speculative decoding benchmark configuration
# Tests reorder_batch_threshold optimization
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
batch_specs:
# Pure speculative decode (K-token verification)
- "q2s1k" # 2-token spec, 1k KV
- "q4s1k" # 4-token spec, 1k KV
- "q8s1k" # 8-token spec, 1k KV
- "q16s1k" # 16-token spec, 1k KV
# Speculative with different context lengths
- "q4s2k" # 4-token spec, 2k KV
- "q4s4k" # 4-token spec, 4k KV
- "q8s2k" # 8-token spec, 2k KV
- "q8s4k" # 8-token spec, 4k KV
# Mixed: speculative + regular decode
- "32q4s1k" # 32 spec requests
- "16q4s1k_16q1s1k" # 16 spec + 16 regular
- "8q8s2k_24q1s2k" # 8 spec (8-tok) + 24 regular
# Mixed: speculative + prefill + decode
- "2q1k_16q4s1k_16q1s1k" # 2 prefill + 16 spec + 16 decode
- "4q2k_32q4s2k_32q1s2k" # 4 prefill + 32 spec + 32 decode
# Large batches with speculation
- "64q4s1k" # 64 spec requests
- "32q8s2k" # 32 spec (8-token)
- "16q16s4k" # 16 spec (16-token)
# Backends that support query length > 1
backends:
- FLASH_ATTN_MLA # reorder_batch_threshold = 512
- FLASHMLA # reorder_batch_threshold = 1 (tunable)
# FlashInfer-MLA also supports uniform spec-as-decode, but with a different mechanism
# - FLASHINFER_MLA
# Benchmark settings
device: "cuda:0"
repeats: 10 # More repeats for statistical significance
warmup_iters: 5
profile_memory: false
# Test these threshold values for optimization
parameter_sweep:
param_name: "reorder_batch_threshold"
values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
include_auto: false
label_format: "{backend}_threshold_{value}"
# Standard attention backend benchmark configuration
model:
num_layers: 32
num_q_heads: 32
num_kv_heads: 8 # GQA with 4:1 ratio
head_dim: 128
block_size: 16
batch_specs:
# Pure prefill
- "q512" # Small prefill (512 tokens)
- "q2k" # Medium prefill (2048 tokens)
- "q4k" # Large prefill (4096 tokens)
- "q8k" # Very large prefill (8192 tokens)
# Pure decode
- "8q1s1k" # 8 requests, 1k KV cache each
- "16q1s2k" # 16 requests, 2k KV cache each
- "32q1s1k" # 32 requests, 1k KV cache each
- "64q1s4k" # 64 requests, 4k KV cache each
# Mixed prefill/decode
- "2q2k_8q1s1k" # 2 prefill + 8 decode
- "4q1k_16q1s2k" # 4 prefill + 16 decode
- "2q4k_32q1s1k" # 2 large prefill + 32 decode
# Speculative decode (q <= 8)
- "16q2s1k" # 16 requests, 2 spec tokens, 1k KV cache
- "16q4s1k" # 16 requests, 4 spec tokens, 1k KV cache
- "16q8s1k" # 16 requests, 8 spec tokens, 1k KV cache
- "32q4s2k" # 32 requests, 4 spec tokens, 2k KV cache
- "8q8s4k" # 8 requests, 8 spec tokens, 4k KV cache
# Context extension (chunked prefill)
- "q1ks2k" # 1k query, 2k sequence
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
backends:
- FLASH_ATTN
- TRITON_ATTN
- FLASHINFER
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: false
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
MLA benchmark runner - shared utilities for MLA benchmarks.
This module provides helpers for running MLA backends without
needing full VllmConfig integration.
"""
import numpy as np
import torch
from batch_spec import parse_batch_spec
from common import (
BenchmarkResult,
MockHfConfig,
MockIndexer,
MockKVBProj,
MockLayer,
setup_mla_dims,
)
from vllm.config import (
CacheConfig,
CompilationConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
# ============================================================================
# VllmConfig Creation
# ============================================================================
def _add_mock_methods_to_model_config(model_config: ModelConfig) -> None:
"""
Add mock methods for layer-specific queries to ModelConfig.
These methods are needed by metadata builders but aren't normally
present on ModelConfig when used in benchmark contexts.
"""
import types
model_config.get_num_layers = types.MethodType(lambda self: 1, model_config)
model_config.get_sliding_window_for_layer = types.MethodType(
lambda self, _i: None, model_config
)
model_config.get_logits_soft_cap_for_layer = types.MethodType(
lambda self, _i: None, model_config
)
model_config.get_sm_scale_for_layer = types.MethodType(
lambda self, _i: 1.0 / model_config.get_head_size() ** 0.5, model_config
)
def create_minimal_vllm_config(
model_name: str = "deepseek-v3",
block_size: int = 128,
max_num_seqs: int = 256,
mla_dims: dict | None = None,
index_topk: int | None = None,
) -> VllmConfig:
"""
Create minimal VllmConfig for MLA benchmarks.
Args:
model_name: Model name (deepseek-v2, deepseek-v3, etc.) - used if mla_dims not
provided
block_size: KV cache block size
max_num_seqs: Maximum number of sequences
mla_dims: Optional custom MLA dimensions dict. If not provided, uses
setup_mla_dims(model_name)
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
Returns:
VllmConfig for benchmarking
"""
# Get MLA dimensions - use provided or load from model name
if mla_dims is None:
mla_dims = setup_mla_dims(model_name)
# Create mock HF config first (avoids downloading from HuggingFace)
mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk)
# Create a temporary minimal config.json to avoid HF downloads
# This ensures consistent ModelConfig construction without network access
import json
import os
import shutil
import tempfile
minimal_config = {
"architectures": ["DeepseekV2ForCausalLM"],
"model_type": "deepseek_v2",
"num_attention_heads": mla_dims["num_q_heads"],
"num_key_value_heads": mla_dims["num_kv_heads"],
"hidden_size": mla_dims["head_dim"] * mla_dims["num_q_heads"],
"torch_dtype": "bfloat16",
"max_position_embeddings": 163840, # DeepSeek V3 default
"rope_theta": 10000.0,
"vocab_size": 128256,
}
# Create temporary directory with config.json
temp_dir = tempfile.mkdtemp(prefix="vllm_bench_")
config_path = os.path.join(temp_dir, "config.json")
with open(config_path, "w") as f:
json.dump(minimal_config, f)
try:
# Create model config using local path - no HF downloads
model_config = ModelConfig(
model=temp_dir, # Use local temp directory
tokenizer=None,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
seed=0,
max_model_len=32768,
quantization=None,
enforce_eager=False,
max_logprobs=20,
disable_sliding_window=False,
skip_tokenizer_init=True,
served_model_name=None,
limit_mm_per_prompt=None,
config_format="auto",
)
finally:
# Clean up temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
# Override with our mock config
model_config.hf_config = mock_hf_config
model_config.hf_text_config = mock_hf_config
# Add mock methods for layer-specific queries
_add_mock_methods_to_model_config(model_config)
# Create sub-configs
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=8192,
max_model_len=32768,
is_encoder_decoder=False,
enable_chunked_prefill=True,
)
parallel_config = ParallelConfig(
tensor_parallel_size=1,
)
compilation_config = CompilationConfig()
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
compilation_config=compilation_config,
)
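# Example usage (illustrative; the dimension values below are hypothetical and
# only show the keys expected in an `mla_dims` dict):
#
#     custom_dims = {
#         "kv_lora_rank": 512,
#         "qk_nope_head_dim": 128,
#         "qk_rope_head_dim": 64,
#         "v_head_dim": 128,
#         "num_q_heads": 16,
#         "num_kv_heads": 1,
#         "head_dim": 576,
#     }
#     cfg = create_minimal_vllm_config(block_size=64, mla_dims=custom_dims)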
# ============================================================================
# Backend Configuration
# ============================================================================
# Backend-specific properties that can't be inferred from the backend class
# Keys are AttentionBackendEnum names (uppercase)
_BACKEND_PROPERTIES = {
"FLASHMLA": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
},
"FLASHMLA_SPARSE": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
},
}
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration from AttentionBackendEnum.
Uses the registry to get the backend class and extract configuration
from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.).
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASHMLA_SPARSE")
Returns:
Dict with backend configuration
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [b.name for b in AttentionBackendEnum if b.name != "CUSTOM"]
raise ValueError(
f"Unknown backend: {backend}. "
f"Valid MLA backends: {[b for b in valid_backends if 'MLA' in b]}"
) from e
# Get block size from backend class
block_sizes = backend_class.get_supported_kernel_block_sizes()
# Use first supported block size (backends typically support one for MLA)
block_size = block_sizes[0] if block_sizes else None
if hasattr(block_size, "value"):
# Handle MultipleOf enum
block_size = None
# Check if sparse via class method if available
is_sparse = getattr(backend_class, "is_sparse", lambda: False)()
# Get properties that can't be inferred
props = _BACKEND_PROPERTIES.get(backend, {})
return {
"backend_class": backend_class,
"impl_class": backend_class.get_impl_cls(),
"builder_class": backend_class.get_builder_cls(),
"query_format": props.get("query_format", "tuple"),
"block_size": block_size,
"is_sparse": is_sparse,
}
# ============================================================================
# Metadata Building Helpers
# ============================================================================
def _build_attention_metadata(
requests: list,
block_size: int,
device: torch.device,
builder_instance,
) -> tuple:
"""
Build attention metadata from batch requests.
Args:
requests: List of BatchRequest objects
block_size: KV cache block size
device: Target device
builder_instance: Metadata builder instance
Returns:
Tuple of (metadata, kv_cache_num_blocks)
"""
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
# Build query start locations
q_start_cpu = torch.tensor(
[0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))],
dtype=torch.int32,
)
q_start_gpu = q_start_cpu.to(device)
# Build sequence lengths
seq_lens_cpu = torch.tensor(kv_lens, dtype=torch.int32)
seq_lens_gpu = seq_lens_cpu.to(device)
# Build num_computed_tokens (context length for each request)
context_lens = [kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens)]
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
# Build block table
num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens]
max_num_blocks = max(num_blocks_per_req)
block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32)
current_block = 0
for i, num_blocks in enumerate(num_blocks_per_req):
for j in range(num_blocks):
block_table_cpu[i, j] = current_block
current_block += 1
block_table_gpu = torch.from_numpy(block_table_cpu).to(device)
# Build slot mapping
slot_mapping_list = []
for i, (q_len, kv_len, num_blocks) in enumerate(
zip(q_lens, kv_lens, num_blocks_per_req)
):
context_len = kv_len - q_len
for j in range(q_len):
token_kv_idx = context_len + j
block_idx = token_kv_idx // block_size
offset_in_block = token_kv_idx % block_size
global_block_id = block_table_cpu[i, block_idx]
slot_id = global_block_id * block_size + offset_in_block
slot_mapping_list.append(slot_id)
slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device)
# Create CommonAttentionMetadata
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
common_attn_metadata = CommonAttentionMetadata(
num_reqs=len(requests),
max_query_len=max(q_lens),
max_seq_len=max_kv,
num_actual_tokens=total_q,
query_start_loc=q_start_gpu,
query_start_loc_cpu=q_start_cpu,
seq_lens=seq_lens_gpu,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
slot_mapping=slot_mapping,
block_table_tensor=block_table_gpu,
dcp_local_seq_lens=None,
)
# Use the production build() method
metadata = builder_instance.build(
common_prefix_len=0,
common_attn_metadata=common_attn_metadata,
fast_build=False,
)
return metadata, current_block
def _create_input_tensors(
total_q: int,
mla_dims: dict,
query_format: str,
device: torch.device,
dtype: torch.dtype,
):
"""
Create input tensors for both decode and prefill modes.
MLA requires different tensor formats for decode vs prefill:
- Decode: Uses kv_lora_rank (512) dimension
- Prefill: Uses qk_nope_head_dim (128) to stay under FlashAttention's 256 limit
Args:
total_q: Total number of query tokens
mla_dims: MLA dimension configuration
query_format: Either "tuple" or "concat"
device: Target device
dtype: Tensor dtype
Returns:
Tuple of (decode_inputs, prefill_inputs)
- decode_inputs: Query tensor(s) for decode mode
- prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale' for prefill
"""
if query_format == "tuple":
# Decode mode format: (q_nope, q_pe) where q_nope has kv_lora_rank dim
q_nope_decode = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["kv_lora_rank"],
device=device,
dtype=dtype,
)
q_pe = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
decode_inputs = (q_nope_decode, q_pe)
# For prefill, we need q with qk_nope_head_dim instead of kv_lora_rank
q_nope_prefill = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_nope_head_dim"],
device=device,
dtype=dtype,
)
prefill_q = torch.cat([q_nope_prefill, q_pe], dim=-1)
else: # concat
decode_inputs = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
# For prefill with concat format
prefill_q = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
# Create additional inputs needed for prefill forward
k_c_normed = torch.randn(
total_q,
mla_dims["kv_lora_rank"],
device=device,
dtype=dtype,
)
k_pe = torch.randn(
total_q,
1, # Single head for MLA
mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
k_scale = torch.ones(1, device=device, dtype=torch.float32)
output = torch.zeros(
total_q,
mla_dims["num_q_heads"] * mla_dims["v_head_dim"],
device=device,
dtype=dtype,
)
prefill_inputs = {
"q": prefill_q,
"k_c_normed": k_c_normed,
"k_pe": k_pe,
"k_scale": k_scale,
"output": output,
}
return decode_inputs, prefill_inputs
# ============================================================================
# Backend Initialization
# ============================================================================
def _create_backend_impl(
backend_cfg: dict,
mla_dims: dict,
vllm_config: VllmConfig,
device: torch.device,
max_num_tokens: int = 8192,
index_topk: int | None = None,
):
"""
Create backend implementation instance.
Args:
backend_cfg: Backend configuration dict from _get_backend_config()
mla_dims: MLA dimension configuration
vllm_config: VllmConfig instance
device: Target device
max_num_tokens: Maximum number of tokens for sparse indexer buffer
index_topk: Topk value for sparse MLA backends
Returns:
Tuple of (impl, layer, builder_instance, indexer)
"""
# Get classes from backend config (already resolved by _get_backend_config)
impl_class = backend_cfg["impl_class"]
builder_class = backend_cfg["builder_class"]
# Calculate scale
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
# Create mock kv_b_proj layer for prefill mode
mock_kv_b_proj = MockKVBProj(
num_heads=mla_dims["num_q_heads"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
)
# Create indexer for sparse backends
indexer = None
if backend_cfg.get("is_sparse", False):
if index_topk is None:
index_topk = 2048 # Default topk for sparse MLA
indexer = MockIndexer(
max_num_tokens=max_num_tokens,
topk_tokens=index_topk,
device=device,
)
# Build impl kwargs
impl_kwargs = {
"num_heads": mla_dims["num_q_heads"],
"head_size": mla_dims["head_dim"],
"scale": scale,
"num_kv_heads": mla_dims["num_kv_heads"],
"alibi_slopes": None,
"sliding_window": None,
"kv_cache_dtype": "auto",
"logits_soft_cap": None,
"attn_type": "decoder",
"kv_sharing_target_layer_name": None,
"q_lora_rank": None,
"kv_lora_rank": mla_dims["kv_lora_rank"],
"qk_nope_head_dim": mla_dims["qk_nope_head_dim"],
"qk_rope_head_dim": mla_dims["qk_rope_head_dim"],
"qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
"v_head_dim": mla_dims["v_head_dim"],
"kv_b_proj": mock_kv_b_proj,
}
# Add indexer for sparse backends
if indexer is not None:
impl_kwargs["indexer"] = indexer
# Create impl
impl = impl_class(**impl_kwargs)
# Initialize DCP attributes
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
impl.dcp_world_size = 1
impl.dcp_rank = 0
# Create KV cache spec for MockLayer
from vllm.v1.kv_cache_interface import FullAttentionSpec
kv_cache_spec = FullAttentionSpec(
block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size,
num_kv_heads=1,  # MLA uses a single latent KV head
head_size=mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],  # 576 for DeepSeek dims
dtype=torch.bfloat16,
)
# Create mock layer
layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec)
# Create builder instance if needed
builder_instance = None
if builder_class:
# Populate static_forward_context so builder can find the layer
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass
vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
builder_instance = builder_class(
kv_cache_spec=kv_cache_spec,
layer_names=["placeholder"],
vllm_config=vllm_config,
device=device,
)
return impl, layer, builder_instance, indexer
# ============================================================================
# Config Helpers
# ============================================================================
def _extract_mla_dims_from_config(config) -> dict | None:
"""
Extract MLA dimensions from BenchmarkConfig if all required fields are present.
Args:
config: BenchmarkConfig instance
Returns:
Dict with MLA dimensions if all fields are provided, None otherwise
"""
# Check if all MLA-specific fields are provided
if all(
[
config.kv_lora_rank is not None,
config.qk_nope_head_dim is not None,
config.qk_rope_head_dim is not None,
config.v_head_dim is not None,
]
):
return {
"kv_lora_rank": config.kv_lora_rank,
"qk_nope_head_dim": config.qk_nope_head_dim,
"qk_rope_head_dim": config.qk_rope_head_dim,
"v_head_dim": config.v_head_dim,
"num_q_heads": config.num_q_heads,
"num_kv_heads": config.num_kv_heads,
"head_dim": config.head_dim,
}
# Fallback: if MLA fields not fully specified, try to construct from basic fields
elif config.head_dim == 576:
# This looks like a DeepSeek MLA config, use standard dimensions with custom
# head count
return {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": config.num_q_heads,
"num_kv_heads": config.num_kv_heads,
"head_dim": config.head_dim,
}
return None
# ============================================================================
# Benchmark Execution
# ============================================================================
def _run_single_benchmark(
config,
impl,
layer,
builder_instance,
backend_cfg: dict,
mla_dims: dict,
device: torch.device,
indexer=None,
) -> BenchmarkResult:
"""
Run a single benchmark iteration.
Args:
config: BenchmarkConfig instance
impl: Backend implementation instance
layer: MockLayer instance
builder_instance: Metadata builder instance
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
device: Target device
indexer: Optional MockIndexer for sparse backends
Returns:
BenchmarkResult with timing statistics
"""
# Parse batch spec
requests = parse_batch_spec(config.batch_spec)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv_len = max(kv_lens)
# Determine block size
block_size = backend_cfg["block_size"] or config.block_size
# Build metadata
metadata, num_blocks = _build_attention_metadata(
requests, block_size, device, builder_instance
)
# Create KV cache
kv_cache = torch.zeros(
num_blocks,
block_size,
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=torch.bfloat16,
)
# Create input tensors for both decode and prefill modes
decode_inputs, prefill_inputs = _create_input_tensors(
total_q,
mla_dims,
backend_cfg["query_format"],
device,
torch.bfloat16,
)
# Fill indexer with random indices for sparse backends
is_sparse = backend_cfg.get("is_sparse", False)
if is_sparse and indexer is not None:
indexer.fill_random_indices(total_q, max_kv_len)
# Determine which forward method to use
if is_sparse:
# Sparse backends use forward_mqa
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
elif metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
elif metadata.prefill is not None:
forward_fn = lambda: impl._forward_prefill(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
kv_cache,
metadata,
prefill_inputs["k_scale"],
prefill_inputs["output"],
)
else:
raise RuntimeError("Metadata has neither decode nor prefill metadata")
# Warmup
for _ in range(config.warmup_iters):
forward_fn()
torch.cuda.synchronize()
# Benchmark
times = []
for _ in range(config.repeats):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(config.num_layers):
forward_fn()
end.record()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers)  # seconds per layer
mean_time = float(np.mean(times))
return BenchmarkResult(
config=config,
mean_time=mean_time,
std_time=float(np.std(times)),
min_time=float(np.min(times)),
max_time=float(np.max(times)),
throughput_tokens_per_sec=total_q / mean_time if mean_time > 0 else 0,
)
def _run_mla_benchmark_batched(
backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
index_topk: int = 2048,
) -> list[BenchmarkResult]:
"""
Unified batched MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse
This function reuses backend initialization across multiple benchmarks
to avoid setup/teardown overhead.
Args:
backend: Backend name
configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only)
index_topk: Topk value for sparse MLA backends (default 2048)
Returns:
List of BenchmarkResult objects
"""
if not configs_with_params:
return []
backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device)
torch.cuda.set_device(device)
# Determine block size
config_block_size = configs_with_params[0][0].block_size
block_size = backend_cfg["block_size"] or config_block_size
# Extract MLA dimensions from the first config
first_config = configs_with_params[0][0]
mla_dims = _extract_mla_dims_from_config(first_config)
# If config didn't provide MLA dims, fall back to default model
if mla_dims is None:
mla_dims = setup_mla_dims("deepseek-v3")
# Determine if this is a sparse backend
is_sparse = backend_cfg.get("is_sparse", False)
# Create and set vLLM config for MLA (reused across all benchmarks)
vllm_config = create_minimal_vllm_config(
model_name="deepseek-v3", # Used only for model path
block_size=block_size,
mla_dims=mla_dims, # Use custom dims from config or default
index_topk=index_topk if is_sparse else None,
)
results = []
with set_current_vllm_config(vllm_config):
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg,
mla_dims,
vllm_config,
device,
index_topk=index_topk if is_sparse else None,
)
# Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
override_threshold = threshold is not None and builder_instance is not None
original_threshold = (
builder_instance.reorder_batch_threshold if override_threshold else None
)
if override_threshold:
builder_instance.reorder_batch_threshold = threshold
# Set num_splits for CUTLASS
override_num_splits = num_splits is not None and hasattr(impl, "_num_kv_splits")
original_num_splits = impl._num_kv_splits if override_num_splits else None
if override_num_splits:
impl._num_kv_splits = num_splits
try:
result = _run_single_benchmark(
config,
impl,
layer,
builder_instance,
backend_cfg,
mla_dims,
device,
indexer=indexer,
)
results.append(result)
finally:
# Restore original threshold (even if it was None)
if override_threshold:
builder_instance.reorder_batch_threshold = original_threshold
# Restore original num_splits (even if it was None)
if override_num_splits:
impl._num_kv_splits = original_num_splits
return results
# ============================================================================
# Public API
# ============================================================================
def run_mla_benchmark(
backend: str,
config,
reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None,
index_topk: int = 2048,
) -> BenchmarkResult | list[BenchmarkResult]:
"""
Unified MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse
Always uses batched execution internally for optimal performance.
Args:
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse)
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
(single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
index_topk: Topk value for sparse MLA backends (default 2048)
Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
"""
# Normalize to batched mode: (config, threshold, num_splits)
if isinstance(config, list):
# Already in batched format
if len(config) > 0 and isinstance(config[0], tuple):
# Format: [(cfg, param), ...] where param is threshold or num_splits
if backend in ("flashattn_mla", "flashmla", "flashmla_sparse"):
configs_with_params = [(cfg, param, None) for cfg, param in config]
else: # cutlass_mla, flashinfer_mla, or sparse backends
configs_with_params = [(cfg, None, param) for cfg, param in config]
else:
# Format: [cfg, ...] - just configs
configs_with_params = [(cfg, None, None) for cfg in config]
return_single = False
else:
# Single config: convert to batched format
configs_with_params = [(config, reorder_batch_threshold, num_kv_splits)]
return_single = True
# Use unified batched execution
results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
# Return single result or list based on input
return results[0] if return_single else results
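# Example usage (illustrative). "FLASHMLA" is one of the registry names used in
# _BACKEND_PROPERTIES above; the BenchmarkConfig constructor arguments are an
# assumption based on the fields this module reads (batch_spec, block_size,
# num_layers, device, repeats, warmup_iters).
#
#     from common import BenchmarkConfig
#
#     cfg = BenchmarkConfig(batch_spec="32q1s1k", block_size=64, num_layers=1,
#                           device="cuda:0", repeats=5, warmup_iters=3)
#     single = run_mla_benchmark("FLASHMLA", cfg)
#     swept = run_mla_benchmark("FLASHMLA", [cfg, cfg])  # batched mode: list of configs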
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Standard attention benchmark runner - shared utilities for non-MLA benchmarks.
This module provides helpers for running standard attention backends
(FlashAttention, Triton, FlashInfer) with real vLLM integration.
"""
import logging
import types
from contextlib import contextmanager
import numpy as np
import torch
from batch_spec import parse_batch_spec, reorder_for_flashinfer
from common import BenchmarkConfig, BenchmarkResult, MockLayer, get_attention_scale
from vllm.config import (
CacheConfig,
CompilationConfig,
DeviceConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
get_kv_cache_layout,
set_kv_cache_layout,
)
from vllm.v1.kv_cache_interface import FullAttentionSpec
# ============================================================================
# Backend Configuration
# ============================================================================
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration from AttentionBackendEnum.
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASH_ATTN", "TRITON_ATTN", "FLASHINFER")
Returns:
Dict with backend_class
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [b.name for b in AttentionBackendEnum if b.name != "CUSTOM"]
raise ValueError(
f"Unknown backend: {backend}. Valid backends: {valid_backends}"
) from e
return {"backend_class": backend_class}
@contextmanager
def log_warnings_and_errors_only():
"""Temporarily set vLLM logger to WARNING level."""
logger = logging.getLogger("vllm")
old_level = logger.level
logger.setLevel(logging.WARNING)
try:
yield
finally:
logger.setLevel(old_level)
# ============================================================================
# Metadata Building Helpers
# ============================================================================
def _build_common_attn_metadata(
q_lens: list[int],
kv_lens: list[int],
block_size: int,
device: torch.device,
) -> CommonAttentionMetadata:
"""Build CommonAttentionMetadata from query/kv lengths."""
batch_size = len(q_lens)
total_tokens = sum(q_lens)
query_start_loc = torch.zeros(batch_size + 1, dtype=torch.int32, device=device)
query_start_loc[1:] = torch.tensor(q_lens, dtype=torch.int32, device=device).cumsum(
0
)
query_start_loc_cpu = query_start_loc.cpu()
seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
max_seq_len = int(seq_lens.max().item())
max_blocks = (max(kv_lens) + block_size - 1) // block_size
num_blocks = batch_size * max_blocks
block_table_tensor = torch.arange(
num_blocks, dtype=torch.int32, device=device
).view(batch_size, max_blocks)
slot_mapping = torch.arange(total_tokens, dtype=torch.int64, device=device)
max_query_len = max(q_lens)
return CommonAttentionMetadata(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
num_reqs=batch_size,
num_actual_tokens=total_tokens,
max_query_len=max_query_len,
max_seq_len=max_seq_len,
block_table_tensor=block_table_tensor,
slot_mapping=slot_mapping,
causal=True,
)
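# Example (illustrative): for q_lens=[1024, 1, 1] this produces
# query_start_loc = [0, 1024, 1025, 1026]; each request gets one row of
# consecutive block IDs in the block table, and slot_mapping simply
# enumerates the new tokens as 0..total_tokens-1.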
def _create_vllm_config(
config: BenchmarkConfig,
max_num_blocks: int,
) -> VllmConfig:
"""Create a VllmConfig for benchmarking with mock model methods."""
model_config = ModelConfig(
model="meta-llama/Meta-Llama-3-8B",
tokenizer="meta-llama/Meta-Llama-3-8B",
trust_remote_code=False,
dtype="auto", # Use model's native dtype
seed=0,
max_model_len=1024,
)
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
parallel_config = ParallelConfig(tensor_parallel_size=1)
scheduler_config = SchedulerConfig(
max_num_seqs=256,
max_num_batched_tokens=8192,
max_model_len=8192,
is_encoder_decoder=False,
enable_chunked_prefill=True,
)
device_config = DeviceConfig()
load_config = LoadConfig()
compilation_config = CompilationConfig()
# Add mock methods for benchmark config values
model_config.get_num_layers = types.MethodType(
lambda self: config.num_layers, model_config
)
model_config.get_sliding_window_for_layer = types.MethodType(
lambda self, i: None, model_config
)
model_config.get_logits_soft_cap_for_layer = types.MethodType(
lambda self, i: 0.0, model_config
)
model_config.get_sm_scale_for_layer = types.MethodType(
lambda self, i: 1.0 / config.head_dim**0.5, model_config
)
model_config.get_num_attention_heads = types.MethodType(
lambda self, parallel_config=None: config.num_q_heads, model_config
)
model_config.get_num_kv_heads = types.MethodType(
lambda self, parallel_config=None: config.num_kv_heads, model_config
)
model_config.get_head_size = types.MethodType(
lambda self: config.head_dim, model_config
)
model_config.get_sliding_window = types.MethodType(lambda self: None, model_config)
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
device_config=device_config,
load_config=load_config,
compilation_config=compilation_config,
)
# ============================================================================
# Backend Initialization
# ============================================================================
def _create_backend_impl(
backend_cfg: dict,
config: BenchmarkConfig,
device: torch.device,
dtype: torch.dtype,
):
"""Create backend implementation instance."""
backend_class = backend_cfg["backend_class"]
scale = get_attention_scale(config.head_dim)
impl = backend_class.get_impl_cls()(
num_heads=config.num_q_heads,
head_size=config.head_dim,
scale=scale,
num_kv_heads=config.num_kv_heads,
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
return backend_class, impl, layer
def _create_metadata_builder(
backend_class,
kv_cache_spec: FullAttentionSpec,
vllm_config: VllmConfig,
device: torch.device,
backend_name: str = "",
):
"""Create metadata builder instance."""
layer_names = ["layer_0"]
builder_cls = backend_class.get_builder_cls()
# Flashinfer needs get_per_layer_parameters mocked since we don't have
# real model layers registered
if backend_name == "FLASHINFER":
import unittest.mock
from vllm.v1.attention.backends.utils import PerLayerParameters
def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls):
head_size = vllm_config.model_config.get_head_size()
return {
layer_name: PerLayerParameters(
window_left=-1, # No sliding window
logits_soft_cap=0.0, # No soft cap
sm_scale=1.0 / (head_size**0.5), # Standard scale
)
for layer_name in layer_names
}
with unittest.mock.patch(
"vllm.v1.attention.backends.flashinfer.get_per_layer_parameters",
mock_get_per_layer_parameters,
):
return builder_cls(
kv_cache_spec=kv_cache_spec,
layer_names=layer_names,
vllm_config=vllm_config,
device=device,
)
return builder_cls(
kv_cache_spec=kv_cache_spec,
layer_names=layer_names,
vllm_config=vllm_config,
device=device,
)
# ============================================================================
# Tensor Creation Helpers
# ============================================================================
def _create_input_tensors(
config: BenchmarkConfig,
total_q: int,
device: torch.device,
dtype: torch.dtype,
) -> tuple:
"""Create Q, K, V input tensors for all layers."""
q_list = [
torch.randn(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
k_list = [
torch.randn(
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
v_list = [
torch.randn(
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
return q_list, k_list, v_list
def _create_kv_cache(
config: BenchmarkConfig,
max_num_blocks: int,
backend_class,
device: torch.device,
dtype: torch.dtype,
) -> list:
"""Create KV cache tensors for all layers using the backend's methods.
Uses the backend's get_kv_cache_shape() and get_kv_cache_stride_order()
to create the cache with the correct shape and memory layout.
"""
# Get the logical shape from the backend
cache_shape = backend_class.get_kv_cache_shape(
num_blocks=max_num_blocks,
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
)
# Get the stride order for custom memory layout
try:
stride_order = backend_class.get_kv_cache_stride_order()
assert len(stride_order) == len(cache_shape)
except (AttributeError, NotImplementedError):
stride_order = tuple(range(len(cache_shape)))
# Permute shape to physical layout order
physical_shape = tuple(cache_shape[i] for i in stride_order)
# Compute inverse permutation to get back to logical view
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
cache_list = []
for _ in range(config.num_layers):
# Allocate in physical layout order (contiguous in memory)
cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
# Permute to logical view
cache = cache.permute(*inv_order)
cache_list.append(cache)
return cache_list
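# Example (illustrative, hypothetical backend): if get_kv_cache_shape() returns
# (num_blocks, 2, block_size, num_kv_heads, head_size) and
# get_kv_cache_stride_order() returns (0, 1, 3, 2, 4), the cache is allocated
# contiguously with num_kv_heads placed before block_size (an HND-style layout)
# and then permuted back so callers still index it with the logical shape.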
# ============================================================================
# Benchmark Execution
# ============================================================================
def _run_single_benchmark(
config: BenchmarkConfig,
impl,
layer,
q_list: list,
k_list: list,
v_list: list,
cache_list: list,
attn_metadata,
device: torch.device,
dtype: torch.dtype,
) -> tuple:
"""Run single benchmark iteration with warmup and timing loop."""
total_q = q_list[0].shape[0]
out = torch.empty(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
)
# Warmup
for _ in range(config.warmup_iters):
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
torch.cuda.synchronize()
# Benchmark
times = []
for _ in range(config.repeats):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
end.record()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
mem_stats = {}
if config.profile_memory:
mem_stats = {
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
}
return times, mem_stats
# ============================================================================
# Public API
# ============================================================================
def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""
Run standard attention benchmark with real kernels.
Supports: FLASH_ATTN, TRITON_ATTN, FLASHINFER
Args:
config: Benchmark configuration
Returns:
BenchmarkResult with timing and memory statistics
"""
device = torch.device(config.device)
torch.cuda.set_device(device)
backend_cfg = _get_backend_config(config.backend)
requests = parse_batch_spec(config.batch_spec)
if config.backend == "FLASHINFER":
requests = reorder_for_flashinfer(requests)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
batch_size = len(q_lens)
# Calculate total blocks needed: batch_size * max_blocks_per_request
max_blocks_per_request = (max_kv + config.block_size - 1) // config.block_size
max_num_blocks = batch_size * max_blocks_per_request
# Suppress vLLM logs during setup to reduce spam
with log_warnings_and_errors_only():
# Create vllm_config first - uses model's native dtype via "auto"
vllm_config = _create_vllm_config(config, max_num_blocks)
dtype = vllm_config.model_config.dtype
# Wrap everything in set_current_vllm_config context
# This is required for backends like flashinfer that need global config
with set_current_vllm_config(vllm_config):
backend_class, impl, layer = _create_backend_impl(
backend_cfg, config, device, dtype
)
# Set KV cache layout if the backend requires a specific one
# (e.g., FlashInfer requires HND on SM100/Blackwell for TRTLLM attention)
required_layout = backend_class.get_required_kv_cache_layout()
if required_layout is not None:
set_kv_cache_layout(required_layout)
get_kv_cache_layout.cache_clear()
common_metadata = _build_common_attn_metadata(
q_lens, kv_lens, config.block_size, device
)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
builder = _create_metadata_builder(
backend_class, kv_cache_spec, vllm_config, device, config.backend
)
attn_metadata = builder.build(
common_prefix_len=0,
common_attn_metadata=common_metadata,
)
q_list, k_list, v_list = _create_input_tensors(
config, total_q, device, dtype
)
cache_list = _create_kv_cache(
config, max_num_blocks, backend_class, device, dtype
)
times, mem_stats = _run_single_benchmark(
config,
impl,
layer,
q_list,
k_list,
v_list,
cache_list,
attn_metadata,
device,
dtype,
)
mean_time = np.mean(times)
throughput = total_q / mean_time if mean_time > 0 else 0
return BenchmarkResult(
config=config,
mean_time=mean_time,
std_time=np.std(times),
min_time=np.min(times),
max_time=np.max(times),
throughput_tokens_per_sec=throughput,
memory_allocated_mb=mem_stats.get("allocated_mb"),
memory_reserved_mb=mem_stats.get("reserved_mb"),
)
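# Example usage (illustrative). The BenchmarkConfig constructor arguments are an
# assumption based on the fields run_attention_benchmark reads above (backend,
# batch_spec, block_size, head/layer counts, device, repeats, warmup_iters).
#
#     cfg = BenchmarkConfig(
#         backend="FLASH_ATTN",
#         batch_spec="2q1k_32q1s1k",  # 2 prefill + 32 decode requests
#         block_size=16,
#         num_q_heads=32, num_kv_heads=8, head_dim=128,
#         num_layers=4, device="cuda:0", repeats=5, warmup_iters=3,
#     )
#     result = run_attention_benchmark(cfg)
#     print(result.mean_time, result.throughput_tokens_per_sec)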
# Automated vLLM Server Parameter Tuning
This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate.
## Table of Contents
- [Prerequisites](#prerequisites)
- [Configuration](#configuration)
- [How to Run](#how-to-run)
- [Example Use Cases](#example-use-cases)
- [Output](#output)
- [How It Works](#how-it-works)
## Prerequisites
Before running the script, please ensure the following steps are completed:
1. **Clone vLLM & Set Up Branch**: Clone the vLLM repository and check out to your desired branch.
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
# git checkout <your-branch>
```
2. **Install Environment**: Install or update the correct running environment. For TPU usage, activate your `conda` environment and install the corresponding `torch` and `torch_xla` versions.
3. **Model Configuration**: If you are using a customized model, ensure its configuration files are correctly placed and accessible.
## Configuration
You must set the following variables at the top of the script before execution.
Note: You can also override the default values below via environment variables when running the script.
```bash
MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
```
| Variable | Description | Example Value |
| --- | --- | --- |
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vLLM. | `"meta-llama/Llama-3.1-8B-Instruct"` |
| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (Other systems may not support saving profiler output.) | `"TPU"` |
| `TP` | **Required.** The tensor-parallelism size. | `1` |
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
| `MAX_MODEL_LEN` | **Required.** Max model length. | `4096` |
| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
| `NUM_BATCHED_TOKENS_LIST` | A space-separated string of `max-num-batched-tokens` values to test. | `"1024 2048 4096"` |
**Note**: The default `NUM_SEQS_LIST` and `NUM_BATCHED_TOKENS_LIST` are set for medium-sized inputs/outputs. For very short contexts (e.g., 20 input, 20 output tokens), you may need to test larger values for `max-num-seqs`.
## How to Run
1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section.
2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost.
```bash
cd <FOLDER_OF_THIS_SCRIPT>
bash auto_tune.sh
```
Please note that the path used to invoke `bash auto_tune.sh` must not contain the keyword `vllm` (in full or in part); otherwise the script's `pkill -f vllm` cleanup command will also kill the script itself.
## Example Use Cases
Here are a few examples of how to configure the script for different goals:
### 1. Maximize Throughput (No Latency Constraint)
- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens.
- **Configuration**:
```bash
INPUT_LEN=1800
OUTPUT_LEN=20
MAX_MODEL_LEN=2048
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
```
### 2. Maximize Throughput with a Latency Requirement
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**:
```bash
INPUT_LEN=1800
OUTPUT_LEN=20
MAX_MODEL_LEN=2048
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=500
```
### 3. Maximize Throughput with Prefix Caching and Latency Requirements
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**:
```bash
INPUT_LEN=1800
OUTPUT_LEN=20
MAX_MODEL_LEN=2048
MIN_CACHE_HIT_PCT=60
MAX_LATENCY_ALLOWED_MS=500
```
## Output
After the script finishes, you will find the results in a new, timestamped directory created inside `$BASE/auto-benchmark/`.
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
- `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
- `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
```text
# Example result.txt content
hash:a1b2c3d4...
max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8
max_num_seqs: 128, max_num_batched_tokens: 4096 does not meet latency requirement 500
...
best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, profile saved in: /home/user/vllm/auto-benchmark/2024_08_01_10_30/profile
```
If it cannot find the best parameters, the final row will be `best_max_num_seqs: 0, best_num_batched_tokens: 0, best_throughput: 0`. This can be due to either the server not starting properly, or the latency requirement being too strict.
- **Profiler Trace**: A directory named `profile` is created inside the log directory. It contains the profiler trace file (e.g., `.xplane.pb` for TPU or a `.json` trace for GPU) from the single best-performing run.
## How It Works
The script follows a systematic process to find the optimal parameters:
1. **Find Max GPU Memory Utilization**: The script first determines the highest safe `gpu-memory-utilization` (starting from 0.98 and decreasing) that does not cause an Out-Of-Memory (OOM) error when launching the server. This ensures the benchmark runs use the maximum available memory without crashing.
2. **Iterate and Benchmark**: It then enters a nested loop, iterating through every combination of `max-num-seqs` and `max-num-batched-tokens` provided in the configuration lists.
3. **Latency-Aware Throughput Search**: For each parameter combination:
- The vLLM server is started.
- A benchmark is first run with an infinite request rate (`--request-rate inf`).
- If the resulting P99 E2E latency is within the `MAX_LATENCY_ALLOWED_MS` limit, this throughput is considered the maximum for this configuration.
    - If the latency is too high, the script performs a search by iteratively decreasing the request rate until the latency constraint is met. This finds the highest sustainable throughput for the given parameters and latency requirement; a simplified sketch of this search loop appears after this list.
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
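The sketch below is an illustrative rendering of the latency-aware search in step 3. Here `run_bench` is a stand-in for the real `vllm bench serve` invocation and just emits fixed dummy numbers so the control flow can be followed; the full logic, including logging, goodput tracking, and server restarts, lives in `auto_tune.sh`.

```bash
#!/bin/bash
# Illustrative sketch only; see auto_tune.sh for the real implementation.
MAX_LATENCY_ALLOWED_MS=500

run_bench() {        # $1 = request rate (stand-in for `vllm bench serve`)
    e2el=450.0       # P99 end-to-end latency the benchmark would report
    throughput=9.8   # request throughput the benchmark would report
}

run_bench inf
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
    request_rate=inf                        # latency already fits at max load
else
    request_rate=$((${throughput%.*} + 1))  # start just above observed throughput
    while (( request_rate > 0 )); do
        run_bench "$request_rate"
        (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )) && break
        request_rate=$((request_rate - 1))
    done
fi
echo "highest sustainable request rate: $request_rate"
```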
## Batched `auto_tune`
The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
### Prerequisites
- **jq**: This script requires `jq` to parse the JSON configuration file.
- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
### How to Run
1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
2. **Execute the script**:
```bash
bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
```
- `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
- `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
### Configuration File
The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
Here is an example `runs_config.json` with two benchmark configurations (set `system` to `TPU` or `GPU` as appropriate; since the file is parsed with `jq`, it must be valid JSON and cannot contain comments):
```json
[
{
"base": "/home/user",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"system": "TPU", # OR GPU
"tp": 8,
"input_len": 128,
"output_len": 2048,
"max_model_len": 2300,
"num_seqs_list": "128 256",
"num_batched_tokens_list": "8192 16384"
},
{
"base": "/home/user",
"model": "meta-llama/Llama-3.1-70B-Instruct",
"system": "TPU", # OR GPU
"tp": 8,
"input_len": 4000,
"output_len": 16,
"max_model_len": 4096,
"num_seqs_list": "64 128",
"num_batched_tokens_list": "4096 8192",
"max_latency_allowed_ms": 500
}
]
```
### Output
The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
- `run_id`: A unique identifier for the run, derived from the timestamp.
- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
A summary of successful and failed runs is also printed to the console upon completion.
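Because the script writes results back into the same file, a quick summary can be pulled from the updated JSON with `jq` once the batch completes. This is only an example query; `runs_config.json` is the example filename used above, and `status`, `run_id`, and `model` are the fields documented in this section.

```bash
# One line per run: status, run ID, and model (placeholders for runs not yet executed).
jq -r '.[] | "\(.status // "PENDING")\t\(.run_id // "-")\t\(.model)"' runs_config.json
```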
#!/bin/bash
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
# See details in README (benchmarks/auto_tune/README.md).
TAG=$(date +"%Y_%m_%d_%H_%M")
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
BASE=${BASE:-"$SCRIPT_DIR/../../.."}
MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
SYSTEM=${SYSTEM:-"TPU"}
TP=${TP:-1}
DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
INPUT_LEN=${INPUT_LEN:-4000}
OUTPUT_LEN=${OUTPUT_LEN:-16}
MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
HOSTNAME=$(hostname)
if [[ -z "$HOSTNAME" ]]; then
echo "Error: Failed to determine hostname." >&2
exit 1
fi
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"
echo "====================== AUTO TUNE PARAMETERS ===================="
echo "SCRIPT_DIR=$SCRIPT_DIR"
echo "BASE=$BASE"
echo "MODEL=$MODEL"
echo "SYSTEM=$SYSTEM"
echo "TP=$TP"
echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
echo "INPUT_LEN=$INPUT_LEN"
echo "OUTPUT_LEN=$OUTPUT_LEN"
echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
echo "RESULT_FILE=$RESULT"
echo "====================== AUTO TUNEPARAMETERS ===================="
rm -rf "$LOG_FOLDER"
rm -rf "$PROFILE_PATH"
mkdir -p "$LOG_FOLDER"
mkdir -p "$PROFILE_PATH"
cd "$BASE/vllm"
pip install -q datasets
current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"
TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
RED='\033[0;31m'
if (( TOTAL_LEN > MAX_MODEL_LEN )); then
echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
exit 1
fi
best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
best_request_rate=0
start_server() {
local gpu_memory_utilization=$1
local max_num_seqs=$2
local max_num_batched_tokens=$3
local vllm_log=$4
local profile_dir=$5
pkill -if "vllm serve" || true
# Define the common arguments as a bash array.
# Each argument and its value are separate elements.
local common_args_array=(
"$MODEL"
"--port" "8004"
"--host" "$HOSTNAME"
"--gpu-memory-utilization" "$gpu_memory_utilization"
"--max-num-seqs" "$max_num_seqs"
"--max-num-batched-tokens" "$max_num_batched_tokens"
"--tensor-parallel-size" "$TP"
"--enable-prefix-caching"
"--load-format" "dummy"
"--download-dir" "$DOWNLOAD_DIR"
"--max-model-len" "$MAX_MODEL_LEN"
)
# Use the array expansion "${common_args_array[@]}"
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
VLLM_SERVER_DEV_MODE=1 \
vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
local server_pid=$!
# wait for 10 minutes...
server_started=0
for _ in {1..60}; do
# Check whether the server process is still alive; we launched it ourselves,
# so we always have permission to send it a signal.
kill -0 $server_pid 2> /dev/null || break
RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
else
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
return 1
else
return 0
fi
}
run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo
rm -f "$vllm_log"
pkill -if "vllm serve" || true
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
start_server "$gpu_memory_utilization" "$max_num_seqs" "$max_num_batched_tokens" "$vllm_log" ""
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
else
echo "server started."
fi
echo
echo "run benchmark test..."
meet_latency_requirement=0
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
# --profile flag is removed from this call
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--ignore-eos \
--disable-tqdm \
--request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1
request_rate=inf
fi
if (( ! meet_latency_requirement )); then
# start the search from a request rate of int(throughput) + 1
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
curl -X POST http://"${HOSTNAME}":8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--ignore-eos \
--disable-tqdm \
--request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1
break
fi
request_rate=$((request_rate-1))
done
fi
# write the results and update the best result.
if ((meet_latency_requirement)); then
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
best_throughput=$throughput
best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput
best_request_rate=$request_rate
fi
else
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
fi
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
pkill -if "vllm serve" || true
sleep 10
echo "===================="
return 0
}
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
# first find out the max gpu-memory-utilization without HBM OOM.
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
# Pass empty string for profile_dir argument
start_server "$gpu_memory_utilization" "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
break
else
gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
fi
done
if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
exit 1
fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark "$num_seqs" "$num_batched_tokens" "$gpu_memory_utilization"
done
done
echo "finish permutations"
# =================================================================================
# FINAL PROFILING RUN FOR THE BEST CONFIGURATION
# =================================================================================
if (( $(echo "$best_throughput > 0" | bc -l) )); then
echo
echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput, goodput: $best_goodput"
echo
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt"
# Start server with the best params and profiling ENABLED
echo "Starting server for profiling..."
start_server "$gpu_memory_utilization" "$best_max_num_seqs" "$best_num_batched_tokens" "$vllm_log" "$PROFILE_PATH"
# Run benchmark with the best params and the --profile flag
echo "Running benchmark with profiling..."
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--ignore-eos \
--disable-tqdm \
--request-rate "$best_request_rate" \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
--port 8004 \
--profile &> "$bm_log"
else
echo "No configuration met the latency requirements. Skipping final profiling run."
fi
pkill -if "vllm serve" || true
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
#!/bin/bash
INPUT_JSON="$1"
GCS_PATH="$2" # Optional GCS path for uploading results for each run
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
if [[ -z "$INPUT_JSON" ]]; then
echo "Error: Input JSON file not provided."
echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
exit 1
fi
if [[ ! -f "$INPUT_JSON" ]]; then
echo "Error: File not found at '$INPUT_JSON'"
exit 1
fi
if ! command -v jq &> /dev/null; then
echo "Error: 'jq' command not found. Please install jq to process the JSON input."
exit 1
fi
if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
exit 1
fi
SUCCESS_COUNT=0
FAILURE_COUNT=0
FAILED_RUNS=()
SCRIPT_START_TIME=$(date +%s)
json_content=$(cat "$INPUT_JSON")
if ! num_runs=$(echo "$json_content" | jq 'length'); then
echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
exit 1
fi
echo "Found $num_runs benchmark configurations in $INPUT_JSON."
echo "Starting benchmark runs..."
echo "--------------------------------------------------"
for i in $(seq 0 $(($num_runs - 1))); do
run_object=$(echo "$json_content" | jq ".[$i]")
RUN_START_TIME=$(date +%s)
ENV_VARS_ARRAY=()
# Dynamically create env vars from the JSON object's keys
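# Hedged example (hypothetical entry): a JSON object such as
#   {"model": "meta-llama/Llama-3.1-8B-Instruct", "input_len": 1800, "max_latency_allowed_ms": 500}
# is exported to auto_tune.sh as:
#   MODEL=meta-llama/Llama-3.1-8B-Instruct INPUT_LEN=1800 MAX_LATENCY_ALLOWED_MS=500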
for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
value=$(echo "$run_object" | jq -r ".$key")
var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
ENV_VARS_ARRAY+=("${var_name}=${value}")
done
echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
# Execute auto_tune.sh and capture output
RUN_OUTPUT_FILE=$(mktemp)
if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
STATUS="SUCCESS"
((SUCCESS_COUNT++))
else
STATUS="FAILURE"
((FAILURE_COUNT++))
FAILED_RUNS+=("Run #$((i+1)): $(echo "$run_object" | jq -c .)")
fi
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
rm "$RUN_OUTPUT_FILE"
# Parse results and optionally upload them to GCS
RUN_ID=""
RESULTS=""
GCS_RESULTS_URL=""
if [[ "$STATUS" == "SUCCESS" ]]; then
RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)
if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
RESULTS=$(cat "$RESULT_FILE_PATH")
if [[ -n "$GCS_PATH" ]]; then
GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
echo "Uploading results to GCS..."
if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
echo "GCS upload successful."
else
echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
fi
fi
else
echo "Warning: Could not find result file for a successful run."
STATUS="WARNING_NO_RESULT_FILE"
fi
fi
# Add the results back into the JSON object for this run
json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
'.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
RUN_END_TIME=$(date +%s)
echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
echo "--------------------------------------------------"
# Save intermediate progress back to the file
echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
done
SCRIPT_END_TIME=$(date +%s)
echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
echo
echo "====================== SUMMARY ======================"
echo "Successful runs: $SUCCESS_COUNT"
echo "Failed runs: $FAILURE_COUNT"
echo "==================================================="
if [[ $FAILURE_COUNT -gt 0 ]]; then
echo "Details of failed runs (see JSON file for full parameters):"
for failed in "${FAILED_RUNS[@]}"; do
echo " - $failed"
done
fi
echo "Updated results have been saved to '$INPUT_JSON'."
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
# NOTE(simon): do not import vLLM here so the benchmark script
# can run without vLLM installed.
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@dataclass
class RequestFuncInput:
prompt: str
api_url: str
prompt_len: int
output_len: int
model: str
model_name: str | None = None
logprobs: int | None = None
extra_body: dict | None = None
multi_modal_content: dict | list[dict] | None = None
ignore_eos: bool = False
language: str | None = None
request_id: str | None = None
@dataclass
class RequestFuncOutput:
generated_text: str = ""
success: bool = False
latency: float = 0.0
output_tokens: int | None = 0
ttft: float = 0.0 # Time to first token
itl: list[float] = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
error: str = ""
async def async_request_tgi(
request_func_input: RequestFuncInput,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session:
params = {
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
"truncate": request_func_input.prompt_len,
"ignore_eos_token": request_func_input.ignore_eos,
}
payload = {
"inputs": request_func_input.prompt,
"parameters": params,
}
headers = None
if request_func_input.request_id:
headers = {"x-request-id": request_func_input.request_id}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
if request_func_input.ignore_eos:
output.output_tokens = request_func_input.output_len
else:
output.output_tokens = None
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(
url=api_url, json=payload, headers=headers
) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: TGI sometimes returns a ping response
# without any data; skip it.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_trt_llm(
request_func_input: RequestFuncInput,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session:
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
"temperature": 0.0,
"top_p": 1.0,
"max_tokens": request_func_input.output_len,
"stream": True,
}
if request_func_input.ignore_eos:
payload["min_length"] = request_func_input.output_len
headers = None
if request_func_input.request_id:
headers = {"x-request-id": request_func_input.request_id}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(
url=api_url, json=payload, headers=headers
) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
"OpenAI Completions API URL must end with 'completions' or 'profile'."
)
async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session:
payload = {
"model": request_func_input.model,
"prompt": request_func_input.prompt,
"max_tokens": request_func_input.output_len,
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
"top_p": 1.0,
}
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
if request_func_input.request_id:
headers["x-request-id"] = request_func_input.request_id
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
# NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
# will use 0 as placeholder.
# See https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0
st = time.perf_counter()
try:
async with session.post(
url=api_url, json=payload, headers=headers
) as response:
if response.status == 200:
parsed_resp = await response.json()
output.latency = time.perf_counter() - st
if "choices" in parsed_resp:
output.generated_text = parsed_resp["choices"][0]["text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = (
"Unexpected response format: "
"neither 'choices' nor 'text' found"
)
output.success = False
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
"OpenAI Completions API URL must end with 'completions' or 'profile'."
)
async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session:
payload = {
"model": request_func_input.model_name
if request_func_input.model_name
else request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"repetition_penalty": 1.0,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
"stream_options": {
"include_usage": True,
},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
if request_func_input.request_id:
headers["x-request-id"] = request_func_input.request_id
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(
url=api_url, json=payload, headers=headers
) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
data = json.loads(chunk)
# NOTE: Some Completions API servers send a final
# usage-summary chunk that contains no token, so we
# check that at least one token was generated.
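# Illustrative shape of such a usage-only chunk (assumed; exact fields may vary by server):
#   {"choices": [], "usage": {"prompt_tokens": 32, "completion_tokens": 128}}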
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
if usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!"
)
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), (
"OpenAI Chat Completions API URL must end with 'chat/completions'."
)
async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
mm_content = request_func_input.multi_modal_content
if isinstance(mm_content, list):
content.extend(mm_content)
elif isinstance(mm_content, dict):
content.append(mm_content)
else:
raise TypeError(
"multi_modal_content must be a dict or list[dict] for openai-chat"
)
payload = {
"model": request_func_input.model_name
if request_func_input.model_name
else request_func_input.model,
"messages": [
{"role": "user", "content": content},
],
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"stream_options": {
"include_usage": True,
},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
if request_func_input.request_id:
headers["x-request-id"] = request_func_input.request_id
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(
url=api_url, json=payload, headers=headers
) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: SSE comments (often used as pings) start with a colon.
# These are not JSON data payloads and should be skipped.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens")
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_audio(
request_func_input: RequestFuncInput,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
api_url = request_func_input.api_url
assert api_url.endswith(("transcriptions", "translations")), (
"OpenAI Chat Completions API URL must end with 'transcriptions' "
)
"or `translations`."
async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
"model": request_func_input.model_name
if request_func_input.model_name
else request_func_input.model,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage": True,
"stream_continuous_usage_stats": True,
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
if request_func_input.request_id:
headers["x-request-id"] = request_func_input.request_id
# Send audio file
def to_bytes(y, sr):
buffer = io.BytesIO()
soundfile.write(buffer, y, sr, format="WAV")
buffer.seek(0)
return buffer
mm_audio = request_func_input.multi_modal_content
if not isinstance(mm_audio, dict) or "audio" not in mm_audio:
raise TypeError("multi_modal_content must be a dict containing 'audio'")
with to_bytes(*mm_audio["audio"]) as f:
form = aiohttp.FormData()
form.add_field("file", f, content_type="audio/wav")
for key, value in payload.items():
form.add_field(key, str(value))
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(
url=api_url, data=form, headers=headers
) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp
)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens"
)
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true":
from modelscope import snapshot_download
from vllm.model_executor.model_loader.weight_utils import get_lock
# Use file lock to prevent multiple processes from
# downloading the same model weights at the same time.
with get_lock(pretrained_model_name_or_path):
model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
)
return model_path
return pretrained_model_name_or_path
def get_tokenizer(
pretrained_model_name_or_path: str,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
**kwargs,
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path
):
pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
from vllm.tokenizers.mistral import MistralTokenizer
except ImportError as e:
raise ImportError(
"MistralTokenizer requires vllm package.\n"
"Please install it with `pip install vllm` "
"to use mistral tokenizer mode."
) from e
return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
else:
return AutoTokenizer.from_pretrained(
pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
**kwargs,
)
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_openai_completions,
"lmdeploy": async_request_openai_completions,
"deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions,
"openai-audio": async_request_openai_audio,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
"llama.cpp": async_request_openai_completions,
}
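# Hedged usage sketch (illustrative only, not part of the benchmark suite): shows how a
# caller might drive one of the request functions above. The endpoint URL, port, model
# name, prompt, and token counts below are assumptions, not values used elsewhere.
if __name__ == "__main__":
    import asyncio

    async def _example() -> None:
        request = RequestFuncInput(
            prompt="Hello, world!",  # assumed prompt
            api_url="http://localhost:8000/v1/completions",  # assumed local server
            prompt_len=3,
            output_len=16,
            model="my-model",  # assumed model name
        )
        # "vllm" maps to async_request_openai_completions defined above.
        output = await ASYNC_REQUEST_FUNCS["vllm"](request)
        print(
            f"success={output.success} ttft={output.ttft:.3f}s "
            f"latency={output.latency:.3f}s error={output.error!r}"
        )

    asyncio.run(_example())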
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
This benchmark runs the same workload twice:
1. With VLLM_BATCH_INVARIANT=0 (baseline)
2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
and then reports the timing and throughput metrics for comparison.
Environment variables:
VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
Example usage:
# Benchmark qwen3 (default)
python benchmarks/benchmark_batch_invariance.py
# Benchmark deepseek with 8 GPUs
VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
python benchmarks/benchmark_batch_invariance.py
# Quick test with fewer trials
VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
python benchmarks/benchmark_batch_invariance.py
"""
import contextlib
import os
import random
import time
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
"""Generate a random prompt for benchmarking."""
prompt_templates = [
"Question: What is the capital of France?\nAnswer: The capital of France is",
"Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
"User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
"Once upon a time in a distant galaxy, there lived",
"The old man walked slowly down the street, remembering",
"In the year 2157, humanity finally discovered",
"To implement a binary search tree in Python, first we need to",
"The algorithm works by iterating through the array and",
"Here's how to optimize database queries using indexing:",
"The Renaissance was a period in European history that",
"Climate change is caused by several factors including",
"The human brain contains approximately 86 billion neurons which",
"I've been thinking about getting a new laptop because",
"Yesterday I went to the store and bought",
"My favorite thing about summer is definitely",
]
base_prompt = random.choice(prompt_templates)
if max_words < min_words:
max_words = min_words
target_words = random.randint(min_words, max_words)
if target_words > 50:
padding_text = (
" This is an interesting topic that deserves more explanation. "
* (target_words // 50)
)
base_prompt = base_prompt + padding_text
return base_prompt
def run_benchmark_with_batch_invariant(
model: str,
tp_size: int,
max_batch_size: int,
num_trials: int,
min_prompt: int,
max_prompt: int,
max_tokens: int,
temperature: float,
gpu_mem_util: float,
max_model_len: int,
backend: str,
batch_invariant: bool,
seed: int = 12345,
) -> dict:
"""
Run the benchmark with the specified configuration.
Returns a dict with timing and throughput metrics.
"""
random.seed(seed)
# Set environment variables
if batch_invariant:
os.environ["VLLM_BATCH_INVARIANT"] = "1"
else:
os.environ["VLLM_BATCH_INVARIANT"] = "0"
print(f"\n{'=' * 80}")
print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
print(f" Model: {model}")
print(f" TP Size: {tp_size}")
print(f" Backend: {backend}")
print(f" Max Batch Size: {max_batch_size}")
print(f" Trials: {num_trials}")
print(f" Max Tokens: {max_tokens}")
print(f"{'=' * 80}\n")
sampling = SamplingParams(
temperature=temperature,
top_p=0.95,
max_tokens=max_tokens,
seed=20240919,
)
needle_prompt = "There once was a "
llm = None
try:
# Create LLM engine
start_init = time.perf_counter()
llm = LLM(
model=model,
max_num_seqs=max_batch_size,
gpu_memory_utilization=gpu_mem_util,
max_model_len=max_model_len,
dtype="bfloat16",
tensor_parallel_size=tp_size,
attention_config={"backend": backend},
enable_prefix_caching=False,
)
init_time = time.perf_counter() - start_init
print(f"Engine initialization time: {init_time:.2f}s\n")
# Generate baseline
print("Generating baseline (warmup)...")
baseline_out = llm.generate([needle_prompt], sampling)
assert len(baseline_out) == 1
baseline_text = baseline_out[0].outputs[0].text
print(f"Baseline output: '{baseline_text[:50]}...'\n")
# Run trials and measure timing
trial_times: list[float] = []
total_tokens = 0
total_prompts = 0
for trial in range(num_trials):
# Create a batch
prompts: list[str] = []
batch_size = random.randint(max_batch_size // 2, max_batch_size)
needle_pos = random.randint(0, batch_size - 1)
for i in range(batch_size):
if i == needle_pos:
prompts.append(needle_prompt)
else:
prompts.append(_random_prompt(min_prompt, max_prompt))
# Measure time for this trial
start_time = time.perf_counter()
outputs = llm.generate(prompts, sampling)
trial_time = time.perf_counter() - start_time
trial_times.append(trial_time)
total_prompts += len(prompts)
# Count tokens
for output in outputs:
if output.outputs:
total_tokens += len(output.outputs[0].token_ids)
print(
f"Trial {trial + 1}/{num_trials}: "
f"batch_size={batch_size}, "
f"time={trial_time:.2f}s"
)
# Verify needle output still matches
needle_output = outputs[needle_pos]
assert needle_output.prompt == needle_prompt
# Compute statistics
avg_time = sum(trial_times) / len(trial_times)
min_time = min(trial_times)
max_time = max(trial_times)
throughput = total_tokens / sum(trial_times)
prompts_per_sec = total_prompts / sum(trial_times)
print(f"\n{'=' * 80}")
print("RESULTS:")
print(f" Average time per trial: {avg_time:.2f}s")
print(f" Min time: {min_time:.2f}s")
print(f" Max time: {max_time:.2f}s")
print(f" Total tokens generated: {total_tokens}")
print(f" Total prompts processed: {total_prompts}")
print(f" Throughput: {throughput:.2f} tokens/s")
print(f" Prompts/s: {prompts_per_sec:.2f}")
print(f"{'=' * 80}\n")
return {
"init_time": init_time,
"avg_time": avg_time,
"min_time": min_time,
"max_time": max_time,
"total_tokens": total_tokens,
"total_prompts": total_prompts,
"throughput": throughput,
"prompts_per_sec": prompts_per_sec,
"trial_times": trial_times,
}
finally:
# Cleanup
if llm is not None:
with contextlib.suppress(Exception):
llm.shutdown()
def main():
# Check platform support
if not (current_platform.is_cuda() and current_platform.has_device_capability(90)):
print("ERROR: Requires CUDA and >= Hopper (SM90)")
print(f"Current platform: {current_platform.device_type}")
if current_platform.is_cuda():
print(f"Device capability: {current_platform.get_device_capability()}")
return 1
# Read configuration from environment
model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1"))
max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128"))
num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5"))
min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024"))
max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048"))
max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128"))
temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0"))
gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN")
print("\n" + "=" * 80)
print("VLLM BATCH INVARIANCE BENCHMARK")
print("=" * 80)
print("\nConfiguration:")
print(f" Model: {model}")
print(f" Tensor Parallel Size: {tp_size}")
print(f" Attention Backend: {backend}")
print(f" Max Batch Size: {max_batch_size}")
print(f" Number of Trials: {num_trials}")
print(f" Prompt Length Range: {min_prompt}-{max_prompt} words")
print(f" Max Tokens to Generate: {max_tokens}")
print(f" Temperature: {temperature}")
print(f" GPU Memory Utilization: {gpu_mem_util}")
print(f" Max Model Length: {max_model_len}")
print("=" * 80)
# Run benchmark WITHOUT batch invariance (baseline)
print("\n" + "=" * 80)
print("PHASE 1: Running WITHOUT batch invariance (baseline)")
print("=" * 80)
baseline_results = run_benchmark_with_batch_invariant(
model=model,
tp_size=tp_size,
max_batch_size=max_batch_size,
num_trials=num_trials,
min_prompt=min_prompt,
max_prompt=max_prompt,
max_tokens=max_tokens,
temperature=temperature,
gpu_mem_util=gpu_mem_util,
max_model_len=max_model_len,
backend=backend,
batch_invariant=False,
)
# Run benchmark WITH batch invariance
print("\n" + "=" * 80)
print("PHASE 2: Running WITH batch invariance")
print("=" * 80)
batch_inv_results = run_benchmark_with_batch_invariant(
model=model,
tp_size=tp_size,
max_batch_size=max_batch_size,
num_trials=num_trials,
min_prompt=min_prompt,
max_prompt=max_prompt,
max_tokens=max_tokens,
temperature=temperature,
gpu_mem_util=gpu_mem_util,
max_model_len=max_model_len,
backend=backend,
batch_invariant=True,
)
# Compare results
print("\n" + "=" * 80)
print("COMPARISON: Batch Invariance vs Baseline")
print("=" * 80)
init_overhead_pct = (
(batch_inv_results["init_time"] - baseline_results["init_time"])
/ baseline_results["init_time"]
* 100
)
time_overhead_pct = (
(batch_inv_results["avg_time"] - baseline_results["avg_time"])
/ baseline_results["avg_time"]
* 100
)
throughput_change_pct = (
(batch_inv_results["throughput"] - baseline_results["throughput"])
/ baseline_results["throughput"]
* 100
)
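# Worked example (hypothetical numbers): if the baseline avg_time is 10.0 s and the
# batch-invariant avg_time is 11.5 s, then
# time_overhead_pct = (11.5 - 10.0) / 10.0 * 100 = +15.0%.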
print("\nInitialization Time:")
print(f" Baseline: {baseline_results['init_time']:.2f}s")
print(f" Batch Invariant: {batch_inv_results['init_time']:.2f}s")
print(f" Overhead: {init_overhead_pct:+.2f}%")
print("\nAverage Trial Time:")
print(f" Baseline: {baseline_results['avg_time']:.2f}s")
print(f" Batch Invariant: {batch_inv_results['avg_time']:.2f}s")
print(f" Overhead: {time_overhead_pct:+.2f}%")
print("\nThroughput (tokens/s):")
print(f" Baseline: {baseline_results['throughput']:.2f}")
print(f" Batch Invariant: {batch_inv_results['throughput']:.2f}")
print(f" Change: {throughput_change_pct:+.2f}%")
print("\nPrompts/s:")
print(f" Baseline: {baseline_results['prompts_per_sec']:.2f}")
print(f" Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}")
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
if time_overhead_pct > 0:
print(
f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
"overhead"
)
else:
print(
f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
"faster (unexpected!)"
)
if abs(throughput_change_pct) < 1.0:
print("Throughput difference is negligible (< 1%)")
elif throughput_change_pct < 0:
print(
f"Throughput decreased by {-throughput_change_pct:.1f}% "
"with batch invariance"
)
else:
print(
f"Throughput increased by {throughput_change_pct:.1f}% "
"with batch invariance (unexpected!)"
)
print("=" * 80 + "\n")
return 0
if __name__ == "__main__":
exit(main())
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
from benchmark_utils import TimeCollector
from tabulate import tabulate
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool
def main(args):
rows = []
for allocate_block in args.allocate_blocks:
# Force a garbage collection up front to minimize interference between runs
gc.collect()
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
get_blocks_times = TimeCollector(TimeCollector.US)
free_blocks_times = TimeCollector(TimeCollector.US)
for _ in range(args.num_iteration):
with get_blocks_times:
blocks = block_pool.get_new_blocks(allocate_block)
with free_blocks_times:
block_pool.free_blocks(blocks)
rows.append(
[get_blocks_times.cnt, args.num_gpu_blocks, allocate_block]
+ get_blocks_times.dump_avg_max()
+ free_blocks_times.dump_avg_max()
)
print(
tabulate(
rows,
headers=[
"Iterations",
"Total\nBlocks",
"Allocated\nBlocks",
"Get Blocks\nAvg (us)",
"Get Blocks\nMax (us)",
"Free Blocks\nAvg (us)",
"Free Blocks\nMax (us)",
],
tablefmt="grid",
floatfmt=".3f",
)
)
def invoke_main() -> None:
parser = FlexibleArgumentParser(
description="Benchmark the performance of BlockPool for KV Cache."
)
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
parser.add_argument(
"--num-iteration",
type=int,
default=1000,
help="Number of iterations to run to stabilize final data readings",
)
parser.add_argument(
"--allocate-blocks",
type=int,
nargs="*",
default=[10, 50, 100, 500, 1000],
help="Number of blocks to allocate",
)
args = parser.parse_args()
main(args)
if __name__ == "__main__":
invoke_main() # pragma: no cover
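# Hedged usage example (illustrative; substitute this script's actual path):
#   python <path/to/this_benchmark>.py --num-gpu-blocks 100000 --allocate-blocks 10 100 1000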
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
This focuses on a single test payload shaped like the prefix-cache hash input:
(32-byte bytes object, 32-int tuple)
Usage:
python benchmarks/hash_micro_benchmark.py --iterations 20000
"""
from __future__ import annotations
import argparse
import random
import statistics
import time
from collections.abc import Callable, Iterable
from vllm.utils.hashing import sha256, xxhash
def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
"""Generate a deterministic test payload."""
random.seed(seed)
bytes_data = bytes(random.getrandbits(8) for _ in range(32))
int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
return (bytes_data, int_tuple)
def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
"""Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
times: list[float] = []
# Warm-up to avoid first-run noise.
for _ in range(200):
func(data)
for _ in range(iterations):
start = time.perf_counter()
func(data)
end = time.perf_counter()
times.append(end - start)
avg = statistics.mean(times)
std = statistics.stdev(times) if len(times) > 1 else 0.0
return avg, std
def _run_benchmarks(
benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
data: tuple,
iterations: int,
):
"""Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
for name, func in benchmarks:
try:
avg, std = _benchmark_func(func, data, iterations)
except ModuleNotFoundError as exc:
print(f"Skipping {name}: {exc}")
continue
yield name, avg, std
def builtin_hash(data: tuple) -> int:
"""Wrapper for Python's built-in hash()."""
return hash(data)
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--iterations",
type=int,
default=10_000,
help="Number of measured iterations per hash function.",
)
parser.add_argument(
"--seed", type=int, default=42, help="Random seed for test payload."
)
args = parser.parse_args()
data = _generate_test_data(args.seed)
benchmarks = (
("SHA256 (pickle)", sha256),
("xxHash (pickle)", xxhash),
("built-in hash()", builtin_hash),
)
print("=" * 60)
print("HASH FUNCTION MICRO BENCHMARK")
print("=" * 60)
print("Test data: (32-byte bytes object, 32-int tuple)")
print(f"Iterations: {args.iterations:,}")
print("=" * 60)
results = list(_run_benchmarks(benchmarks, data, args.iterations))
builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
print("\nResults:")
for name, avg, std in results:
print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")
if builtin_entry:
_, builtin_avg, _ = builtin_entry
print("\n" + "=" * 60)
print("SUMMARY (relative to built-in hash())")
print("=" * 60)
for name, avg, _ in results:
if name == "built-in hash()":
continue
speed_ratio = avg / builtin_avg
print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
else:
print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import sys
if __name__ == "__main__":
print("""DEPRECATED: This script has been moved to the vLLM CLI.
Please use the following command instead:
vllm bench latency
For help with the new command, run:
vllm bench latency --help
Alternatively, you can run the new command directly with:
python -m vllm.entrypoints.cli.main bench latency --help
""")
sys.exit(1)