Commit fbeb8a6f authored by raojy

raw_vllm

parent 2ca8867f
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""vLLM Attention Benchmarking Suite."""
from .batch_spec import (
BatchRequest,
format_batch_spec,
get_batch_stats,
parse_batch_spec,
reorder_for_flashinfer,
split_by_type,
)
from .common import (
BenchmarkConfig,
BenchmarkResult,
MockLayer,
ResultsFormatter,
get_attention_scale,
is_mla_backend,
setup_mla_dims,
)
__all__ = [
# Batch specification
"BatchRequest",
"parse_batch_spec",
"format_batch_spec",
"reorder_for_flashinfer",
"split_by_type",
"get_batch_stats",
# Benchmarking infrastructure
"BenchmarkConfig",
"BenchmarkResult",
"ResultsFormatter",
# Mock objects
"MockLayer",
# Utilities
"setup_mla_dims",
"get_attention_scale",
"is_mla_backend",
]
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Simplified batch specification grammar for attention benchmarks.
Grammar (underscore-separated segments):
Format: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
- count: Number of identical requests (optional, default=1)
- q_len: Query length (number of new tokens)
- seq_len: Total sequence length (optional, defaults to q_len for prefill)
- 'k' suffix: Multiplies value by 1024
Common patterns:
- Prefill: q_len == seq_len (e.g., "q2k" → 2048 new tokens, 2048 seq)
- Decode: q_len == 1 (e.g., "q1s1k" → 1 token, 1024 seq length)
- Extend: q_len < seq_len (e.g., "q4s1k" → 4 tokens, 1024 seq length)
Examples:
q2k -> [(2048, 2048)] # Prefill: 2048 tokens
q1s1k -> [(1, 1024)] # Decode: 1 token, 1K sequence
8q1s1k -> [(1, 1024)] * 8 # 8 decode requests
q4s1k -> [(4, 1024)] # 4-token extend (spec decode)
2q1k_32q1s1k -> [(1024, 1024)] * 2 + [(1, 1024)] * 32 # Mixed batch
16q4s1k -> [(4, 1024)] * 16 # 16 spec decode requests
"""
from collections import Counter
from dataclasses import dataclass
import regex as re
@dataclass
class BatchRequest:
"""Represents a single request in a batch."""
q_len: int # Query length (number of new tokens)
kv_len: int # Total KV cache length
@property
def is_decode(self) -> bool:
"""True if this is a decode request (q_len == 1)."""
return self.q_len == 1
@property
def is_prefill(self) -> bool:
"""True if this is a pure prefill (q_len == kv_len)."""
return self.q_len == self.kv_len
@property
def is_extend(self) -> bool:
"""True if this is context extension (q_len > 1, kv_len > q_len)."""
return self.q_len > 1 and self.kv_len > self.q_len
@property
def context_len(self) -> int:
"""Context length (KV cache - query)."""
return self.kv_len - self.q_len
def as_tuple(self) -> tuple[int, int]:
"""Return as (q_len, kv_len) tuple for compatibility."""
return (self.q_len, self.kv_len)
def _parse_size(size_str: str, k_suffix: str) -> int:
"""Parse size string with optional 'k' suffix."""
size = int(size_str)
return size * 1024 if k_suffix == "k" else size
def parse_batch_spec(spec: str) -> list[BatchRequest]:
"""
Parse batch specification string into list of BatchRequest objects.
Grammar: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
Args:
spec: Batch specification string (see module docstring for grammar)
Returns:
List of BatchRequest objects
Raises:
ValueError: If spec format is invalid
"""
requests = []
for seg in spec.split("_"):
# Unified pattern: (<count>?) q<q_len>(k?) (s<seq_len>(k?))?
m = re.match(r"^(?:(\d+))?q(\d+)(k?)(?:s(\d+)(k?))?$", seg)
if m:
cnt = int(m.group(1)) if m.group(1) else 1
q_len = _parse_size(m.group(2), m.group(3))
kv_len = _parse_size(m.group(4), m.group(5)) if m.group(4) else q_len
requests.extend([BatchRequest(q_len=q_len, kv_len=kv_len)] * cnt)
continue
raise ValueError(f"Invalid batch spec segment: '{seg}'")
return requests
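# Illustrative usage (sketch, not executed anywhere in this module): each
# underscore-separated segment expands into `count` identical requests, e.g.
#   [r.as_tuple() for r in parse_batch_spec("q2k_2q1s1k")]
#   -> [(2048, 2048), (1, 1024), (1, 1024)]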
def format_batch_spec(requests: list[BatchRequest]) -> str:
"""
Format list of BatchRequest into human-readable string.
Groups requests by type and provides counts and sizes.
Args:
requests: List of BatchRequest objects
Returns:
Formatted string describing the batch
"""
kinds = {
"prefill": [],
"extend": [],
"decode": [],
}
for req in requests:
tup = (req.q_len, req.kv_len)
if req.is_prefill:
kinds["prefill"].append(tup)
elif req.is_extend:
kinds["extend"].append(tup)
elif req.is_decode:
kinds["decode"].append(tup)
parts = []
for kind in ["prefill", "extend", "decode"]:
lst = kinds[kind]
if not lst:
continue
cnt_total = len(lst)
ctr = Counter(lst)
inner = []
for (q, kv), cnt in ctr.items():
if kind == "prefill":
size = f"{q // 1024}k" if q % 1024 == 0 else str(q)
inner.append(f"{cnt}x{size}")
elif kind == "decode":
size = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
inner.append(f"{cnt}x{size}")
else: # extend
qstr = f"{q // 1024}k" if q % 1024 == 0 else str(q)
kstr = f"{kv // 1024}k" if kv % 1024 == 0 else str(kv)
inner.append(f"{cnt}xq{qstr}kv{kstr}")
parts.append(f"{cnt_total} {kind} ({', '.join(inner)})")
return ", ".join(parts)
def reorder_for_flashinfer(requests: list[BatchRequest]) -> list[BatchRequest]:
"""
Reorder requests for FlashInfer: decode first, then prefill.
FlashInfer expects decode requests before prefill requests for
optimal performance.
Args:
requests: Original list of BatchRequest
Returns:
Reordered list with decode requests first
"""
decodes = [r for r in requests if r.is_decode]
non_decodes = [r for r in requests if not r.is_decode]
return decodes + non_decodes
def split_by_type(
requests: list[BatchRequest],
) -> dict[str, list[BatchRequest]]:
"""
Split requests by type for analysis.
Args:
requests: List of BatchRequest
Returns:
Dict with keys: 'decode', 'prefill', 'extend'
"""
result = {
"decode": [],
"prefill": [],
"extend": [],
}
for req in requests:
if req.is_decode:
result["decode"].append(req)
elif req.is_prefill:
result["prefill"].append(req)
elif req.is_extend:
result["extend"].append(req)
return result
def get_batch_stats(requests: list[BatchRequest]) -> dict:
"""
Compute statistics about a batch.
Args:
requests: List of BatchRequest
Returns:
Dict with batch statistics
"""
by_type = split_by_type(requests)
return {
"total_requests": len(requests),
"num_decode": len(by_type["decode"]),
"num_prefill": len(by_type["prefill"]),
"num_extend": len(by_type["extend"]),
"total_tokens": sum(r.q_len for r in requests),
"total_kv_cache": sum(r.kv_len for r in requests),
"max_q_len": max((r.q_len for r in requests), default=0),
"max_kv_len": max((r.kv_len for r in requests), default=0),
"avg_q_len": sum(r.q_len for r in requests) / len(requests) if requests else 0,
"avg_kv_len": (
sum(r.kv_len for r in requests) / len(requests) if requests else 0
),
}
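# Illustrative stats (sketch) for a small mixed batch:
#   get_batch_stats(parse_batch_spec("q2k_2q1s1k"))
#   -> {"total_requests": 3, "num_prefill": 1, "num_decode": 2, "num_extend": 0,
#       "total_tokens": 2050, "total_kv_cache": 4096,
#       "max_q_len": 2048, "max_kv_len": 2048, ...}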
def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str:
"""
Classify a batch spec into a type string.
Args:
batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k")
spec_decode_threshold: Max q_len to be considered spec-decode vs extend
Returns:
Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)"
"""
requests = parse_batch_spec(batch_spec)
# Classify each request
types_present = set()
for req in requests:
if req.is_decode:
types_present.add("decode")
elif req.is_prefill:
types_present.add("prefill")
elif req.is_extend:
# Distinguish spec-decode (small q_len) from extend (chunked prefill)
if req.q_len <= spec_decode_threshold:
types_present.add("spec-decode")
else:
types_present.add("extend")
if len(types_present) == 1:
return types_present.pop()
elif len(types_present) > 1:
# Sort for consistent output
sorted_types = sorted(types_present)
return f"mixed ({'+'.join(sorted_types)})"
else:
return "unknown"
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Universal vLLM Attention Benchmark
Benchmark any attention backend with the extended grammar.
Supports standard attention (Flash/Triton/FlashInfer) and MLA backends.
Examples:
# Standard attention
python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k"
# MLA backends
python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k"
# Parameter sweep (CLI)
python benchmark.py --backend cutlass_mla \
--batch-specs "64q1s1k" \
--sweep-param num_kv_splits \
--sweep-values 1 4 8 16
# Parameter sweep (YAML config - recommended)
python benchmark.py --config configs/cutlass_numsplits.yaml
"""
import argparse
import sys
from dataclasses import replace
from pathlib import Path
import yaml
from rich.console import Console
from tqdm import tqdm
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from batch_spec import parse_batch_spec
from common import (
BenchmarkConfig,
BenchmarkResult,
ModelParameterSweep,
ParameterSweep,
ResultsFormatter,
batch_spec_sort_key,
is_mla_backend,
)
def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""Run standard attention benchmark (Flash/Triton/FlashInfer)."""
from runner import run_attention_benchmark
return run_attention_benchmark(config)
def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""Run MLA benchmark with appropriate backend."""
from mla_runner import run_mla_benchmark as run_mla
return run_mla(config.backend, config, **kwargs)
def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
"""
Run a single benchmark with proper backend selection.
Args:
config: BenchmarkConfig with backend, batch_spec, and model params
**kwargs: Additional arguments passed to MLA benchmarks
Returns:
BenchmarkResult (may have error field set on failure)
"""
try:
if is_mla_backend(config.backend):
return run_mla_benchmark(config, **kwargs)
else:
return run_standard_attention_benchmark(config)
except Exception as e:
return BenchmarkResult(
config=config,
mean_time=float("inf"),
std_time=0,
min_time=float("inf"),
max_time=float("inf"),
error=str(e),
)
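# Usage sketch (illustrative; assumes a CUDA device and a working vLLM install,
# field values are arbitrary examples):
#   config = BenchmarkConfig(
#       backend="FLASH_ATTN", batch_spec="q2k", num_layers=10, head_dim=128,
#       num_q_heads=32, num_kv_heads=8, block_size=16, device="cuda:0",
#   )
#   result = run_benchmark(config)
#   if not result.success:
#       print(result.error)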
def run_model_parameter_sweep(
backends: list[str],
batch_specs: list[str],
base_config_args: dict,
sweep: ModelParameterSweep,
console: Console,
) -> list[BenchmarkResult]:
"""
Run model parameter sweep for given backends and batch specs.
Args:
backends: List of backend names
batch_specs: List of batch specifications
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
sweep: ModelParameterSweep configuration
console: Rich console for output
Returns:
List of BenchmarkResult objects
"""
all_results = []
console.print(
f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]"
)
total = len(backends) * len(batch_specs) * len(sweep.values)
with tqdm(total=total, desc="Benchmarking") as pbar:
for backend in backends:
for spec in batch_specs:
for value in sweep.values:
# Create config with modified model parameter
config_args = base_config_args.copy()
config_args[sweep.param_name] = value
# Create config with original backend for running
clean_config = BenchmarkConfig(
backend=backend, batch_spec=spec, **config_args
)
# Run benchmark
result = run_benchmark(clean_config)
# Replace backend with labeled version for display
backend_label = sweep.get_label(backend, value)
labeled_config = replace(result.config, backend=backend_label)
result = replace(result, config=labeled_config)
all_results.append(result)
if not result.success:
console.print(
f"[red]Error {backend} {spec} {sweep.param_name}="
f"{value}: {result.error}[/]"
)
pbar.update(1)
# Display sweep results - create separate table for each parameter value
console.print("\n[bold green]Model Parameter Sweep Results:[/]")
formatter = ResultsFormatter(console)
# Group results by parameter value and extract backend mapping
by_param_value = {}
backend_mapping = {} # Maps labeled backend -> original backend
for r in all_results:
# Extract original backend and param value from labeled backend
# The label format is: {backend}_{param_name}_{value}
# We need to reverse engineer this
labeled_backend = r.config.backend
# Try each backend to find which one this result belongs to
for backend in backends:
for value in sweep.values:
expected_label = sweep.get_label(backend, value)
if labeled_backend == expected_label:
backend_mapping[labeled_backend] = backend
param_value = str(value)
if param_value not in by_param_value:
by_param_value[param_value] = []
by_param_value[param_value].append(r)
break
# Create a table for each parameter value
sorted_param_values = sorted(
by_param_value.keys(), key=lambda x: int(x) if x.isdigit() else x
)
for param_value in sorted_param_values:
console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]")
param_results = by_param_value[param_value]
# Create modified results with original backend names
modified_results = []
for r in param_results:
# Get the original backend name from our mapping
original_backend = backend_mapping[r.config.backend]
modified_config = replace(r.config, backend=original_backend)
modified_result = replace(r, config=modified_config)
modified_results.append(modified_result)
# Print table with original backend names
formatter.print_table(modified_results, backends, compare_to_fastest=True)
# Show optimal backend for each (param_value, batch_spec) combination
console.print(
f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]"
)
# Group by (param_value, batch_spec)
by_param_and_spec = {}
for r in all_results:
if r.success:
# Find which (backend, value) this result corresponds to
labeled_backend = r.config.backend
for backend in backends:
for value in sweep.values:
expected_label = sweep.get_label(backend, value)
if labeled_backend == expected_label:
param_value = str(value)
spec = r.config.batch_spec
key = (param_value, spec)
if key not in by_param_and_spec:
by_param_and_spec[key] = []
by_param_and_spec[key].append(r)
break
# Sort by param value then spec (batch_size, q_len, kv_len)
sorted_keys = sorted(
by_param_and_spec.keys(),
key=lambda x: (
int(x[0]) if x[0].isdigit() else x[0],
batch_spec_sort_key(x[1]),
),
)
current_param_value = None
for param_value, spec in sorted_keys:
# Print header when param value changes
if param_value != current_param_value:
console.print(f"\n [bold]{sweep.param_name}={param_value}:[/]")
current_param_value = param_value
results = by_param_and_spec[(param_value, spec)]
best = min(results, key=lambda r: r.mean_time)
# Extract original backend name using the mapping
backend_name = backend_mapping[best.config.backend]
# Show all backends' times for comparison
times_str = " | ".join(
[
f"{backend_mapping[r.config.backend]}: {r.mean_time:.6f}s"
for r in sorted(results, key=lambda r: r.mean_time)
]
)
console.print(
f" {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})"
)
return all_results
def run_parameter_sweep(
backends: list[str],
batch_specs: list[str],
base_config_args: dict,
sweep: ParameterSweep,
console: Console,
) -> list[BenchmarkResult]:
"""
Run parameter sweep for given backends and batch specs.
Args:
backends: List of backend names
batch_specs: List of batch specifications
base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
sweep: ParameterSweep configuration
console: Rich console for output
Returns:
List of BenchmarkResult objects
"""
all_results = []
# Build list of values to sweep (including auto if requested)
sweep_values = list(sweep.values)
if sweep.include_auto:
sweep_values.append("auto")
console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]")
total = len(backends) * len(batch_specs) * len(sweep_values)
with tqdm(total=total, desc="Benchmarking") as pbar:
for backend in backends:
for spec in batch_specs:
for value in sweep_values:
# Create config with original backend for running
config = BenchmarkConfig(
backend=backend, batch_spec=spec, **base_config_args
)
# Prepare kwargs for benchmark runner
kwargs = {}
if value != "auto":
kwargs[sweep.param_name] = value
# Run benchmark
result = run_benchmark(config, **kwargs)
# Replace backend with labeled version for display
backend_label = sweep.get_label(backend, value)
labeled_config = replace(result.config, backend=backend_label)
result = replace(result, config=labeled_config)
all_results.append(result)
if not result.success:
console.print(
f"[red]Error {backend} {spec} {sweep.param_name}="
f"{value}: {result.error}[/]"
)
pbar.update(1)
# Display sweep results
console.print("\n[bold green]Sweep Results:[/]")
backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values]
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backend_labels)
# Show optimal values
console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]")
by_spec = {}
for r in all_results:
if r.success:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = []
by_spec[spec].append(r)
for spec in sorted(by_spec.keys(), key=batch_spec_sort_key):
results = by_spec[spec]
best = min(results, key=lambda r: r.mean_time)
console.print(
f" {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)"
)
return all_results
def load_config_from_yaml(config_path: str) -> dict:
"""Load configuration from YAML file."""
with open(config_path) as f:
return yaml.safe_load(f)
def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
"""
Generate batch specs from range specifications.
Args:
ranges: List of range specifications, each containing:
- template: Batch spec template (e.g., "q{q_len}kv1k")
- q_len: Dict with start, stop, step, end_inclusive (optional)
- Other parameters can also be ranges
Returns:
List of generated batch spec strings
Example:
ranges = [
{
"template": "q{q_len}kv1k",
"q_len": {
"start": 1,
"stop": 16,
"step": 1,
"end_inclusive": true # Optional, defaults to true
}
}
]
Returns: ["q1kv1k", "q2kv1k", ..., "q16kv1k"]
"""
all_specs = []
for range_spec in ranges:
template = range_spec.get("template")
if not template:
raise ValueError("Range specification must include 'template'")
# Extract all range parameters from the spec
range_params = {}
for key, value in range_spec.items():
if key == "template":
continue
if isinstance(value, dict) and "start" in value:
# This is a range specification
start = value["start"]
stop = value["stop"]
step = value.get("step", 1)
# Check if end should be inclusive (default: True)
end_inclusive = value.get("end_inclusive", True)
# Adjust stop based on end_inclusive
if end_inclusive:
range_params[key] = list(range(start, stop + 1, step))
else:
range_params[key] = list(range(start, stop, step))
else:
# This is a fixed value
range_params[key] = [value]
# Generate all combinations (Cartesian product)
if range_params:
import itertools
param_names = list(range_params.keys())
param_values = [range_params[name] for name in param_names]
for values in itertools.product(*param_values):
params = dict(zip(param_names, values))
spec = template.format(**params)
all_specs.append(spec)
else:
# No parameters, just use template as-is
all_specs.append(template)
return all_specs
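# Illustrative expansion (sketch): a single inclusive q_len range
#   generate_batch_specs_from_ranges(
#       [{"template": "q{q_len}s1k", "q_len": {"start": 1, "stop": 4}}]
#   )
#   -> ["q1s1k", "q2s1k", "q3s1k", "q4s1k"]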
def main():
parser = argparse.ArgumentParser(
description="Universal vLLM attention benchmark",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
# Config file
parser.add_argument(
"--config",
help="Path to YAML config file (overrides other args)",
)
# Backend selection
parser.add_argument(
"--backends",
nargs="+",
help="Backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
"flashinfer_mla, flashattn_mla, flashmla)",
)
parser.add_argument(
"--backend",
help="Single backend (alternative to --backends)",
)
# Batch specifications
parser.add_argument(
"--batch-specs",
nargs="+",
default=["q2k", "8q1s1k"],
help="Batch specifications using extended grammar",
)
# Model config
parser.add_argument("--num-layers", type=int, default=10, help="Number of layers")
parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads")
parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads")
parser.add_argument("--block-size", type=int, default=16, help="Block size")
# Benchmark settings
parser.add_argument("--device", default="cuda:0", help="Device")
parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
# Parameter sweep (use YAML config for advanced sweeps)
parser.add_argument(
"--sweep-param",
help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)",
)
parser.add_argument(
"--sweep-values",
type=int,
nargs="+",
help="Values to sweep for the parameter",
)
# Output
parser.add_argument("--output-csv", help="Save to CSV")
parser.add_argument("--output-json", help="Save to JSON")
args = parser.parse_args()
console = Console()
console.print("[bold cyan]vLLM Attention Benchmark[/]")
# Load config from YAML if provided
if args.config:
console.print(f"[yellow]Loading config from: {args.config}[/]")
yaml_config = load_config_from_yaml(args.config)
# Show description if available
if "description" in yaml_config:
console.print(f"[dim]{yaml_config['description']}[/]")
# Override args with YAML values, but CLI args take precedence
# Check if CLI provided backends (they would be non-None and not default)
cli_backends_provided = args.backends is not None or args.backend is not None
# Backend(s) - only use YAML if CLI didn't specify
if not cli_backends_provided:
if "backend" in yaml_config:
args.backend = yaml_config["backend"]
args.backends = None
elif "backends" in yaml_config:
args.backends = yaml_config["backends"]
args.backend = None
# Check for special modes
if "mode" in yaml_config:
args.mode = yaml_config["mode"]
else:
args.mode = None
# Batch specs and sizes
# Support both explicit batch_specs and generated batch_spec_ranges
if "batch_spec_ranges" in yaml_config:
# Generate batch specs from ranges
generated_specs = generate_batch_specs_from_ranges(
yaml_config["batch_spec_ranges"]
)
# Combine with any explicit batch_specs
if "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"] + generated_specs
else:
args.batch_specs = generated_specs
console.print(
f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
)
elif "batch_specs" in yaml_config:
args.batch_specs = yaml_config["batch_specs"]
if "batch_sizes" in yaml_config:
args.batch_sizes = yaml_config["batch_sizes"]
else:
args.batch_sizes = None
# Model config
if "model" in yaml_config:
model = yaml_config["model"]
args.num_layers = model.get("num_layers", args.num_layers)
args.head_dim = model.get("head_dim", args.head_dim)
args.num_q_heads = model.get("num_q_heads", args.num_q_heads)
args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
args.block_size = model.get("block_size", args.block_size)
# Benchmark settings (top-level keys)
if "device" in yaml_config:
args.device = yaml_config["device"]
if "repeats" in yaml_config:
args.repeats = yaml_config["repeats"]
if "warmup_iters" in yaml_config:
args.warmup_iters = yaml_config["warmup_iters"]
if "profile_memory" in yaml_config:
args.profile_memory = yaml_config["profile_memory"]
# Parameter sweep configuration
if "parameter_sweep" in yaml_config:
sweep_config = yaml_config["parameter_sweep"]
args.parameter_sweep = ParameterSweep(
param_name=sweep_config["param_name"],
values=sweep_config["values"],
include_auto=sweep_config.get("include_auto", False),
label_format=sweep_config.get(
"label_format", "{backend}_{param_name}_{value}"
),
)
else:
args.parameter_sweep = None
# Model parameter sweep configuration
if "model_parameter_sweep" in yaml_config:
sweep_config = yaml_config["model_parameter_sweep"]
args.model_parameter_sweep = ModelParameterSweep(
param_name=sweep_config["param_name"],
values=sweep_config["values"],
label_format=sweep_config.get(
"label_format", "{backend}_{param_name}_{value}"
),
)
else:
args.model_parameter_sweep = None
# Output
if "output" in yaml_config:
output = yaml_config["output"]
if "csv" in output and not args.output_csv:
args.output_csv = output["csv"]
if "json" in output and not args.output_json:
args.output_json = output["json"]
console.print()
# Handle CLI-based parameter sweep (if not from YAML)
if (
(not hasattr(args, "parameter_sweep") or args.parameter_sweep is None)
and args.sweep_param
and args.sweep_values
):
args.parameter_sweep = ParameterSweep(
param_name=args.sweep_param,
values=args.sweep_values,
include_auto=False,
label_format="{backend}_{param_name}_{value}",
)
# Determine backends
backends = args.backends or ([args.backend] if args.backend else ["flash"])
console.print(f"Backends: {', '.join(backends)}")
console.print(f"Batch specs: {', '.join(args.batch_specs)}")
console.print()
# Run benchmarks
all_results = []
# Handle special mode: decode_vs_prefill comparison
if hasattr(args, "mode") and args.mode == "decode_vs_prefill":
console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]")
console.print(
"[dim]For each query length, testing both decode and prefill pipelines[/]"
)
console.print("[dim]Using batched execution for optimal performance[/]")
# Extract batch sizes from config
batch_sizes = getattr(args, "batch_sizes", None) or [1]
backend = backends[0] # Use first backend (should only be one)
# Calculate total benchmarks
total = len(batch_sizes)
with tqdm(total=total, desc="Benchmarking") as pbar:
for batch_size in batch_sizes:
# Prepare all configs for this batch size
configs_with_thresholds = []
for spec in args.batch_specs:
# Parse the batch spec to get query length
requests = parse_batch_spec(spec)
if not requests:
console.print(
f"[red]Error: Could not parse batch spec '{spec}'[/]"
)
continue
# Get query length from first request
query_length = requests[0].q_len
# Create batch spec for this batch size
# For batch_size > 1, we need to prepend the count
batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec
# Create base config (without backend name)
base_config = BenchmarkConfig(
backend=backend, # Will be overridden later
batch_spec=batch_spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
# Add decode pipeline config
decode_threshold = query_length
config_decode = replace(
base_config,
backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}",
)
configs_with_thresholds.append((config_decode, decode_threshold))
# Add prefill pipeline config if query_length > 1
if query_length > 1:
prefill_threshold = query_length - 1
config_prefill = replace(
base_config,
backend=f"{backend}_prefill_qlen{query_length}"
f"_bs{batch_size}",
)
configs_with_thresholds.append(
(config_prefill, prefill_threshold)
)
# Run all benchmarks for this batch size in one go (batched mode)
try:
from mla_runner import run_mla_benchmark as run_mla
# Use batched API: pass list of (config, threshold) tuples
timing_results = run_mla(backend, configs_with_thresholds)
# Create BenchmarkResult objects from timing results
for (config, _), timing in zip(
configs_with_thresholds, timing_results
):
result = BenchmarkResult(
config=config,
mean_time=timing["mean"],
std_time=timing["std"],
min_time=timing["min"],
max_time=timing["max"],
throughput_tokens_per_sec=timing.get("throughput", None),
)
all_results.append(result)
except Exception as e:
import traceback
console.print(
f"[red]Error running batched benchmarks for "
f"batch_size={batch_size}: {e}[/]"
)
console.print("[red]Traceback:[/]")
traceback.print_exc()
# Add error results for all configs
for config, _ in configs_with_thresholds:
result = BenchmarkResult(
config=config,
mean_time=float("inf"),
std_time=0,
min_time=float("inf"),
max_time=float("inf"),
error=str(e),
)
all_results.append(result)
pbar.update(1)
# Display decode vs prefill results
console.print("\n[bold green]Decode vs Prefill Results:[/]")
# Group by batch size
by_batch_size = {}
for r in all_results:
if r.success:
# Extract batch size from backend name
parts = r.config.backend.split("_")
bs_part = [p for p in parts if p.startswith("bs")]
if bs_part:
bs = int(bs_part[0][2:])
if bs not in by_batch_size:
by_batch_size[bs] = []
by_batch_size[bs].append(r)
# For each batch size, analyze crossover point
for bs in sorted(by_batch_size.keys()):
console.print(f"\n[bold cyan]Batch size: {bs}[/]")
results = by_batch_size[bs]
# Group by query length
by_qlen = {}
for r in results:
parts = r.config.backend.split("_")
qlen_part = [p for p in parts if p.startswith("qlen")]
if qlen_part:
qlen = int(qlen_part[0][4:])
if qlen not in by_qlen:
by_qlen[qlen] = {}
pipeline = "decode" if "decode" in r.config.backend else "prefill"
by_qlen[qlen][pipeline] = r
# Find crossover point
last_decode_faster = None
for qlen in sorted(by_qlen.keys()):
pipelines = by_qlen[qlen]
if "decode" in pipelines and "prefill" in pipelines:
decode_time = pipelines["decode"].mean_time
prefill_time = pipelines["prefill"].mean_time
faster = "decode" if decode_time < prefill_time else "prefill"
speedup = (
prefill_time / decode_time
if decode_time < prefill_time
else decode_time / prefill_time
)
console.print(
f" qlen={qlen:3d}: decode={decode_time:.6f}s, "
f"prefill={prefill_time:.6f}s -> "
f"[bold]{faster}[/] ({speedup:.2f}x)"
)
if faster == "decode":
last_decode_faster = qlen
if last_decode_faster is not None:
optimal_threshold = last_decode_faster
console.print(
f"\n [bold green]Optimal threshold for batch_size={bs}: "
f"{optimal_threshold}[/]"
)
console.print(
f" [dim](Use decode pipeline for query_length <= "
f"{optimal_threshold})[/]"
)
else:
console.print(
f"\n [yellow]Prefill always faster for batch_size={bs}[/]"
)
# Handle model parameter sweep mode
elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep:
# Model parameter sweep
base_config_args = {
"num_layers": args.num_layers,
"head_dim": args.head_dim,
"num_q_heads": args.num_q_heads,
"num_kv_heads": args.num_kv_heads,
"block_size": args.block_size,
"device": args.device,
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
}
all_results = run_model_parameter_sweep(
backends,
args.batch_specs,
base_config_args,
args.model_parameter_sweep,
console,
)
# Handle parameter sweep mode (unified)
elif hasattr(args, "parameter_sweep") and args.parameter_sweep:
# Unified parameter sweep
base_config_args = {
"num_layers": args.num_layers,
"head_dim": args.head_dim,
"num_q_heads": args.num_q_heads,
"num_kv_heads": args.num_kv_heads,
"block_size": args.block_size,
"device": args.device,
"repeats": args.repeats,
"warmup_iters": args.warmup_iters,
"profile_memory": args.profile_memory,
}
all_results = run_parameter_sweep(
backends, args.batch_specs, base_config_args, args.parameter_sweep, console
)
else:
# Normal mode: compare backends
total = len(backends) * len(args.batch_specs)
with tqdm(total=total, desc="Benchmarking") as pbar:
for spec in args.batch_specs:
for backend in backends:
config = BenchmarkConfig(
backend=backend,
batch_spec=spec,
num_layers=args.num_layers,
head_dim=args.head_dim,
num_q_heads=args.num_q_heads,
num_kv_heads=args.num_kv_heads,
block_size=args.block_size,
device=args.device,
repeats=args.repeats,
warmup_iters=args.warmup_iters,
profile_memory=args.profile_memory,
)
result = run_benchmark(config)
all_results.append(result)
if not result.success:
console.print(f"[red]Error {backend} {spec}: {result.error}[/]")
pbar.update(1)
# Display results
console.print("\n[bold green]Results:[/]")
formatter = ResultsFormatter(console)
formatter.print_table(all_results, backends)
# Save results
if all_results:
formatter = ResultsFormatter(console)
if args.output_csv:
formatter.save_csv(all_results, args.output_csv)
if args.output_json:
formatter.save_json(all_results, args.output_json)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Common utilities for attention benchmarking."""
import csv
import json
import math
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
import torch
from batch_spec import get_batch_type, parse_batch_spec
from rich.console import Console
from rich.table import Table
def batch_spec_sort_key(spec: str) -> tuple[int, int, int]:
"""
Extract sorting key from batch spec: (batch_size, max_q_len, max_kv_len).
This ensures results are sorted by batch size first, then query length,
then sequence length, rather than alphabetically.
"""
try:
requests = parse_batch_spec(spec)
batch_size = len(requests)
max_q_len = max(r.q_len for r in requests) if requests else 0
max_kv_len = max(r.kv_len for r in requests) if requests else 0
return (batch_size, max_q_len, max_kv_len)
except Exception:
# Fallback for unparseable specs
return (0, 0, 0)
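# Illustrative keys (sketch): sorting specs with this key orders them by batch
# size first, then query length, then KV length:
#   batch_spec_sort_key("q2k")     -> (1, 2048, 2048)
#   batch_spec_sort_key("8q1s1k")  -> (8, 1, 1024)
#   batch_spec_sort_key("64q1s4k") -> (64, 1, 4096)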
# Mock classes for vLLM attention infrastructure
class MockHfConfig:
"""Mock HuggingFace config that satisfies vLLM's requirements."""
def __init__(self, mla_dims: dict, index_topk: int | None = None):
self.num_attention_heads = mla_dims["num_q_heads"]
self.num_key_value_heads = mla_dims["num_kv_heads"]
self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"]
self.model_type = "deepseek_v2"
self.is_encoder_decoder = False
self.kv_lora_rank = mla_dims["kv_lora_rank"]
self.qk_nope_head_dim = mla_dims["qk_nope_head_dim"]
self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"]
self.v_head_dim = mla_dims["v_head_dim"]
self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
if index_topk is not None:
self.index_topk = index_topk
def get_text_config(self):
return self
# Import AttentionLayerBase at module level to avoid circular dependencies
try:
from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
except ImportError:
AttentionLayerBase = object # Fallback
class MockKVBProj:
"""Mock KV projection layer for MLA prefill mode.
Mimics ColumnParallelLinear behavior for kv_b_proj in MLA backends.
Projects kv_c_normed to [qk_nope_head_dim + v_head_dim] per head.
"""
def __init__(self, num_heads: int, qk_nope_head_dim: int, v_head_dim: int):
self.num_heads = num_heads
self.qk_nope_head_dim = qk_nope_head_dim
self.v_head_dim = v_head_dim
self.out_dim = qk_nope_head_dim + v_head_dim
def __call__(self, x: torch.Tensor) -> tuple[torch.Tensor]:
"""
Project kv_c_normed to output space.
Args:
x: Input tensor [num_tokens, kv_lora_rank]
Returns:
Tuple containing output tensor
[num_tokens, num_heads, qk_nope_head_dim + v_head_dim]
"""
num_tokens = x.shape[0]
result = torch.randn(
num_tokens,
self.num_heads,
self.out_dim,
device=x.device,
dtype=x.dtype,
)
return (result,) # Return as tuple to match ColumnParallelLinear API
class MockIndexer:
"""Mock Indexer for sparse MLA backends.
Provides topk_indices_buffer that sparse MLA backends use to determine
which KV cache slots to attend to for each token.
"""
def __init__(
self,
max_num_tokens: int,
topk_tokens: int,
device: torch.device,
):
self.topk_tokens = topk_tokens
self.topk_indices_buffer = torch.zeros(
(max_num_tokens, topk_tokens),
dtype=torch.int32,
device=device,
)
def fill_random_indices(self, num_tokens: int, max_kv_len: int):
"""Fill topk_indices_buffer with random valid indices for benchmarking."""
indices = torch.randint(
0,
max_kv_len,
(num_tokens, self.topk_tokens),
dtype=torch.int32,
device=self.topk_indices_buffer.device,
)
self.topk_indices_buffer[:num_tokens] = indices
class MockLayer(AttentionLayerBase):
"""Mock attention layer with scale parameters and impl.
Inherits from AttentionLayerBase so it passes isinstance checks
in get_layers_from_vllm_config when FlashInfer prefill is enabled.
"""
def __init__(self, device: torch.device, impl=None, kv_cache_spec=None):
# Don't call super().__init__() as AttentionLayerBase doesn't have __init__
self._k_scale = torch.tensor(1.0, device=device)
self._v_scale = torch.tensor(1.0, device=device)
self._q_scale = torch.tensor(1.0, device=device)
# Scalar floats for kernels that need them
self._k_scale_float = float(self._k_scale.item())
self._v_scale_float = float(self._v_scale.item())
self._q_scale_float = float(self._q_scale.item())
# AttentionImpl for metadata builders to query
self.impl = impl
# KV cache spec for get_kv_cache_spec
self._kv_cache_spec = kv_cache_spec
def get_attn_backend(self):
"""Get the attention backend class (required by AttentionLayerBase)."""
# Return None as this is just a mock layer for benchmarking
return None
def get_kv_cache_spec(self):
"""Get the KV cache spec (required by AttentionLayerBase)."""
return self._kv_cache_spec
@dataclass
class ParameterSweep:
"""Configuration for sweeping a backend parameter."""
param_name: str # Name of the backend parameter to sweep
values: list[Any] # List of values to test
include_auto: bool = False # Also test with param unset (auto mode)
label_format: str = "{backend}_{param_name}_{value}" # Result label template
def get_label(self, backend: str, value: Any) -> str:
"""Generate a label for a specific parameter value."""
return self.label_format.format(
backend=backend, param_name=self.param_name, value=value
)
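# Illustrative label (sketch) with the default label_format:
#   ParameterSweep(param_name="num_kv_splits", values=[1, 4, 8]).get_label(
#       "cutlass_mla", 4
#   )
#   -> "cutlass_mla_num_kv_splits_4"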
@dataclass
class ModelParameterSweep:
"""Configuration for sweeping a model configuration parameter."""
param_name: str # Name of the model config parameter to sweep (e.g., "num_q_heads")
values: list[Any] # List of values to test
label_format: str = "{backend}_{param_name}_{value}" # Result label template
def get_label(self, backend: str, value: Any) -> str:
"""Generate a label for a specific parameter value."""
return self.label_format.format(
backend=backend, param_name=self.param_name, value=value
)
@dataclass
class BenchmarkConfig:
"""Configuration for a single benchmark run."""
backend: str
batch_spec: str
num_layers: int
head_dim: int
num_q_heads: int
num_kv_heads: int
block_size: int
device: str
dtype: torch.dtype = torch.float16
repeats: int = 1
warmup_iters: int = 3
profile_memory: bool = False
use_cuda_graphs: bool = False
# MLA-specific
kv_lora_rank: int | None = None
qk_nope_head_dim: int | None = None
qk_rope_head_dim: int | None = None
v_head_dim: int | None = None
# Backend-specific tuning
num_kv_splits: int | None = None # CUTLASS MLA
reorder_batch_threshold: int | None = None # FlashAttn MLA, FlashMLA
@dataclass
class BenchmarkResult:
"""Results from a single benchmark run."""
config: BenchmarkConfig
mean_time: float # seconds
std_time: float # seconds
min_time: float # seconds
max_time: float # seconds
throughput_tokens_per_sec: float | None = None
memory_allocated_mb: float | None = None
memory_reserved_mb: float | None = None
error: str | None = None
@property
def success(self) -> bool:
"""Whether benchmark completed successfully."""
return self.error is None
def to_dict(self) -> dict[str, Any]:
"""Convert to dictionary for serialization."""
return {
"config": asdict(self.config),
"mean_time": self.mean_time,
"std_time": self.std_time,
"min_time": self.min_time,
"max_time": self.max_time,
"throughput_tokens_per_sec": self.throughput_tokens_per_sec,
"memory_allocated_mb": self.memory_allocated_mb,
"memory_reserved_mb": self.memory_reserved_mb,
"error": self.error,
}
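# Illustrative result handling (sketch; timings are made-up numbers, `cfg` is
# any BenchmarkConfig):
#   result = BenchmarkResult(config=cfg, mean_time=1.2e-3, std_time=1e-5,
#                            min_time=1.1e-3, max_time=1.4e-3)
#   result.success                 -> True (error is None)
#   result.to_dict()["mean_time"]  -> 0.0012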
class ResultsFormatter:
"""Format and display benchmark results."""
def __init__(self, console: Console | None = None):
self.console = console or Console()
def print_table(
self,
results: list[BenchmarkResult],
backends: list[str],
compare_to_fastest: bool = True,
):
"""
Print results as a rich table.
Args:
results: List of BenchmarkResult
backends: List of backend names being compared
compare_to_fastest: Show percentage comparison to fastest
"""
# Group results by batch spec
by_spec = {}
specs_order = []
for r in results:
spec = r.config.batch_spec
if spec not in by_spec:
by_spec[spec] = {}
specs_order.append(spec)
by_spec[spec][r.config.backend] = r
# Sort specs by (batch_size, q_len, kv_len) instead of alphabetically
specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key)
# Create shortened backend names for display
def shorten_backend_name(name: str) -> str:
"""Shorten long backend names for table display."""
# Remove common prefixes
name = name.replace("flashattn_mla", "famla")
name = name.replace("flashinfer_mla", "fimla")
name = name.replace("flashmla", "fmla")
name = name.replace("cutlass_mla", "cmla")
name = name.replace("numsplits", "ns")
return name
table = Table(title="Attention Benchmark Results")
table.add_column("Batch\nSpec", no_wrap=True)
table.add_column("Type", no_wrap=True)
table.add_column("Batch\nSize", justify="right", no_wrap=True)
multi = len(backends) > 1
for backend in backends:
short_name = shorten_backend_name(backend)
# Time column
col_time = f"{short_name}\nTime (s)"
table.add_column(col_time, justify="right", no_wrap=False)
if multi and compare_to_fastest:
# Relative performance column
col_rel = f"{short_name}\nvs Best"
table.add_column(col_rel, justify="right", no_wrap=False)
# Add rows
for spec in specs_order:
spec_results = by_spec[spec]
times = {b: r.mean_time for b, r in spec_results.items() if r.success}
best_time = min(times.values()) if times else 0.0
batch_type = get_batch_type(spec)
batch_size = len(parse_batch_spec(spec))
row = [spec, batch_type, str(batch_size)]
for backend in backends:
if backend in spec_results:
r = spec_results[backend]
if r.success:
row.append(f"{r.mean_time:.6f}")
if multi and compare_to_fastest:
pct = (
(r.mean_time / best_time * 100) if best_time > 0 else 0
)
pct_str = f"{pct:.1f}%"
if r.mean_time == best_time:
pct_str = f"[bold green]{pct_str}[/]"
row.append(pct_str)
else:
row.append("[red]ERROR[/]")
if multi and compare_to_fastest:
row.append("-")
else:
row.append("-")
if multi and compare_to_fastest:
row.append("-")
table.add_row(*row)
self.console.print(table)
def save_csv(self, results: list[BenchmarkResult], path: str):
"""Save results to CSV file."""
if not results:
return
path_obj = Path(path)
path_obj.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w", newline="") as f:
writer = csv.DictWriter(
f,
fieldnames=[
"backend",
"batch_spec",
"num_layers",
"mean_time",
"std_time",
"throughput",
"memory_mb",
],
)
writer.writeheader()
for r in results:
writer.writerow(
{
"backend": r.config.backend,
"batch_spec": r.config.batch_spec,
"num_layers": r.config.num_layers,
"mean_time": r.mean_time,
"std_time": r.std_time,
"throughput": r.throughput_tokens_per_sec or 0,
"memory_mb": r.memory_allocated_mb or 0,
}
)
self.console.print(f"[green]Saved CSV results to {path}[/]")
def save_json(self, results: list[BenchmarkResult], path: str):
"""Save results to JSON file."""
path_obj = Path(path)
path_obj.parent.mkdir(parents=True, exist_ok=True)
data = [r.to_dict() for r in results]
with open(path, "w") as f:
json.dump(data, f, indent=2, default=str)
self.console.print(f"[green]Saved JSON results to {path}[/]")
def setup_mla_dims(model_name: str = "deepseek-v3") -> dict:
"""
Get MLA dimensions for known models.
Args:
model_name: Model identifier
Returns:
Dict with MLA dimension configuration
"""
configs = {
"deepseek-v2": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 128,
"num_kv_heads": 1,
"head_dim": 576,
},
"deepseek-v3": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 128,
"num_kv_heads": 1,
"head_dim": 576,
},
"deepseek-v2-lite": {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": 16,
"num_kv_heads": 1,
"head_dim": 576,
},
}
if model_name not in configs:
raise ValueError(
f"Unknown model '{model_name}'. Known models: {list(configs.keys())}"
)
return configs[model_name]
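# Illustrative lookup (sketch): DeepSeek-V3 MLA dimensions
#   dims = setup_mla_dims("deepseek-v3")
#   (dims["kv_lora_rank"], dims["qk_rope_head_dim"], dims["num_q_heads"])
#   -> (512, 64, 128)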
def get_attention_scale(head_dim: int) -> float:
"""Compute attention scale factor (1/sqrt(d))."""
return 1.0 / math.sqrt(head_dim)
def is_mla_backend(backend: str) -> bool:
"""
Check if backend is an MLA backend using the AttentionBackendEnum.
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASHMLA_SPARSE")
Returns:
True if the backend is an MLA backend, False otherwise
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
return backend_class.is_mla()
except (KeyError, ValueError, ImportError, AttributeError):
return False
# MLA decode-only benchmark configuration
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128 # Base value, can be swept for TP simulation
num_kv_heads: 1 # MLA uses single latent KV
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Small batches, varying sequence lengths
- "16q1s512" # 16 requests, 512 KV cache
- "16q1s1k" # 16 requests, 1k KV cache
- "16q1s2k" # 16 requests, 2k KV cache
- "16q1s4k" # 16 requests, 4k KV cache
# Medium batches
- "32q1s1k" # 32 requests, 1k KV cache
- "32q1s2k" # 32 requests, 2k KV cache
- "32q1s4k" # 32 requests, 4k KV cache
- "32q1s8k" # 32 requests, 8k KV cache
# Large batches
- "64q1s1k" # 64 requests, 1k KV cache
- "64q1s2k" # 64 requests, 2k KV cache
- "64q1s4k" # 64 requests, 4k KV cache
- "64q1s8k" # 64 requests, 8k KV cache
# Very large batches
- "128q1s1k" # 128 requests, 1k KV cache
- "128q1s2k" # 128 requests, 2k KV cache
- "128q1s4k" # 128 requests, 4k KV cache
- "128q1s8k" # 128 requests, 8k KV cache
# Long context
- "32q1s16k" # 32 requests, 16k KV cache
- "32q1s32k" # 32 requests, 32k KV cache
backends:
- CUTLASS_MLA
- FLASHINFER_MLA
- FLASH_ATTN_MLA # Hopper only
- FLASHMLA # Hopper only
device: "cuda:0"
repeats: 100
warmup_iters: 10
profile_memory: true
# Backend-specific tuning
CUTLASS_MLA:
num_kv_splits: auto # or specific value like 4, 8, 16
FLASH_ATTN_MLA:
reorder_batch_threshold: 512
FLASHMLA:
reorder_batch_threshold: 1
# MLA mixed batch benchmark (prefill + decode)
# Tests chunked prefill performance
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
batch_specs:
# Small prefill + decode
- "1q1k_8q1s1k" # 1 prefill + 8 decode
- "2q2k_16q1s1k" # 2 prefill + 16 decode
- "4q1k_32q1s2k" # 4 prefill + 32 decode
# Medium prefill + decode
- "2q4k_32q1s2k" # 2 medium prefill + 32 decode
- "4q4k_64q1s2k" # 4 medium prefill + 64 decode
- "8q2k_64q1s4k" # 8 prefill + 64 decode
# Large prefill + decode (chunked prefill stress test)
- "2q8k_32q1s1k" # 2 large prefill + 32 decode
- "1q16k_16q1s2k" # 1 very large prefill + 16 decode
- "2q16k_32q1s4k" # 2 very large prefill + 32 decode
# Context extension + decode
- "2q1kkv2k_16q1s1k" # 2 extend + 16 decode
- "4q2kkv4k_32q1s2k" # 4 extend + 32 decode
- "2q1kkv8k_32q1s2k" # 2 large extend + 32 decode
# Explicitly chunked prefill
- "q8k" # 8k prefill with chunking hint
- "q16k" # 16k prefill with chunking hint
- "2q8k_32q1s2k" # 2 chunked prefill + 32 decode
# High decode ratio (realistic serving)
- "1q2k_63q1s1k" # 1 prefill + 63 decode
- "2q2k_62q1s2k" # 2 prefill + 62 decode
- "4q4k_60q1s4k" # 4 prefill + 60 decode
backends:
- CUTLASS_MLA
- FLASHINFER_MLA
- FLASH_ATTN_MLA # Hopper only
- FLASHMLA # Hopper only
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: true
# Analyze chunked prefill workspace size impact
chunked_prefill:
test_workspace_sizes: [4096, 8192, 16384, 32768, 65536]
# MLA prefill-only benchmark configuration for sparse backends
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
block_size: 128
# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
model_parameter_sweep:
param_name: "num_q_heads"
values: [128, 64, 32, 16]
label_format: "{backend}_{value}h"
batch_specs:
# Pure prefill
- "1q512"
- "1q1k"
- "1q2k"
- "1q4k"
- "1q8k"
# Batched pure prefill
- "2q512"
- "2q1k"
- "2q2k"
- "2q4k"
- "2q8k"
- "4q512"
- "4q1k"
- "4q2k"
- "4q4k"
- "4q8k"
- "8q512"
- "8q1k"
- "8q2k"
- "8q4k"
- "8q8k"
# Extend
- "1q512s4k"
- "1q512s8k"
- "1q1ks8k"
- "1q2ks8k"
- "1q2ks16k"
- "1q4ks16k"
backends:
- FLASHMLA_SPARSE
- FLASHINFER_MLA_SPARSE
device: "cuda:0"
repeats: 10
warmup_iters: 3
profile_memory: true
# Study 4: What is the optimal reorder_batch_threshold for MLA backends that support query length > 1?
# Question: At what query length does prefill pipeline become faster than decode pipeline?
# Methodology: For each query length, compare decode vs prefill performance to find crossover point
# Applies to: FlashAttn MLA, FlashMLA
description: "Decode vs Prefill pipeline crossover analysis"
# Test FlashAttn MLA
backend: FLASH_ATTN_MLA
# Mode: decode_vs_prefill comparison (special sweep mode)
# For each batch spec, we'll test both decode and prefill pipelines
mode: "decode_vs_prefill"
# Query lengths to test (from old benchmark_mla_threshold.py methodology)
# Each query length will be tested with BOTH decode and prefill pipelines:
# - decode: threshold >= query_length (forces decode pipeline)
# - prefill: threshold < query_length (forces prefill pipeline)
#
# We use q<N>s1k format which creates q_len=N, seq_len=1024 requests
# This tests different query lengths with fixed sequence length context
#
# Using batch_spec_ranges for automatic generation:
batch_spec_ranges:
- template: "q{q_len}s1k"
q_len:
start: 1
stop: 16
step: 1
end_inclusive: false
- template: "q{q_len}s1k"
q_len:
start: 16
stop: 64
step: 2
end_inclusive: false
- template: "q{q_len}s1k"
q_len:
start: 64
stop: 1024
step: 4
end_inclusive: true
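# The ranges above expand (end-exclusive except the last range) to:
#   q1s1k..q15s1k (step 1), q16s1k..q62s1k (step 2), q64s1k..q1024s1k (step 4)
# for 280 generated specs in total.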
# Batch sizes to test (from old script)
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
# Model configuration (DeepSeek V2/V3 defaults)
model:
num_layers: 10
head_dim: 576
num_q_heads: 128
num_kv_heads: 1
block_size: 128
# Benchmark settings
device: "cuda:0"
repeats: 15 # More repeats for spec decode variance
warmup_iters: 5
profile_memory: false
# Output
output:
csv: "reorder_threshold_results.csv"
json: "reorder_threshold_results.json"
# Expected outcome (reproduces old benchmark_mla_threshold.py study):
# - For each batch size, find the crossover point where prefill becomes faster than decode
# - Show decode vs prefill performance across all query lengths
# - Determine optimal reorder_batch_threshold based on last query length where decode is faster
# - Understand how crossover point varies with batch size
# - Provide data-driven guidance for default threshold value
#
# Methodology (from old script):
# - Each query length tested with BOTH pipelines:
# * decode: threshold >= query_length (forces decode pipeline)
# * prefill: threshold < query_length (forces prefill pipeline)
# - Compare which is faster to find crossover point
#
# Speculative decoding benchmark configuration
# Tests reorder_batch_threshold optimization
model:
name: "deepseek-v3"
num_layers: 60
num_q_heads: 128
num_kv_heads: 1
head_dim: 576
kv_lora_rank: 512
qk_nope_head_dim: 128
qk_rope_head_dim: 64
v_head_dim: 128
batch_specs:
# Pure speculative decode (K-token verification)
- "q2s1k" # 2-token spec, 1k KV
- "q4s1k" # 4-token spec, 1k KV
- "q8s1k" # 8-token spec, 1k KV
- "q16s1k" # 16-token spec, 1k KV
# Speculative with different context lengths
- "q4s2k" # 4-token spec, 2k KV
- "q4s4k" # 4-token spec, 4k KV
- "q8s2k" # 8-token spec, 2k KV
- "q8s4k" # 8-token spec, 4k KV
# Mixed: speculative + regular decode
- "32q4s1k" # 32 spec requests
- "16q4s1k_16q1s1k" # 16 spec + 16 regular
- "8q8s2k_24q1s2k" # 8 spec (8-tok) + 24 regular
# Mixed: speculative + prefill + decode
- "2q1k_16q4s1k_16q1s1k" # 2 prefill + 16 spec + 16 decode
- "4q2k_32q4s2k_32q1s2k" # 4 prefill + 32 spec + 32 decode
# Large batches with speculation
- "64q4s1k" # 64 spec requests
- "32q8s2k" # 32 spec (8-token)
- "16q16s4k" # 16 spec (16-token)
# Backends that support query length > 1
backends:
- FLASH_ATTN_MLA # reorder_batch_threshold = 512
- FLASHMLA # reorder_batch_threshold = 1 (tunable)
# FlashInfer-MLA also supports uniform spec-as-decode, but with a different mechanism
# - FLASHINFER_MLA
# Benchmark settings
device: "cuda:0"
repeats: 10 # More repeats for statistical significance
warmup_iters: 5
profile_memory: false
# Test these threshold values for optimization
parameter_sweep:
param_name: "reorder_batch_threshold"
values: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
include_auto: false
label_format: "{backend}_threshold_{value}"
# Standard attention backend benchmark configuration
model:
num_layers: 32
num_q_heads: 32
num_kv_heads: 8 # GQA with 4:1 ratio
head_dim: 128
block_size: 16
batch_specs:
# Pure prefill
- "q512" # Small prefill (512 tokens)
- "q2k" # Medium prefill (2048 tokens)
- "q4k" # Large prefill (4096 tokens)
- "q8k" # Very large prefill (8192 tokens)
# Pure decode
- "8q1s1k" # 8 requests, 1k KV cache each
- "16q1s2k" # 16 requests, 2k KV cache each
- "32q1s1k" # 32 requests, 1k KV cache each
- "64q1s4k" # 64 requests, 4k KV cache each
# Mixed prefill/decode
- "2q2k_8q1s1k" # 2 prefill + 8 decode
- "4q1k_16q1s2k" # 4 prefill + 16 decode
- "2q4k_32q1s1k" # 2 large prefill + 32 decode
# Speculative decode (q <= 8)
- "16q2s1k" # 16 requests, 2 spec tokens, 1k KV cache
- "16q4s1k" # 16 requests, 4 spec tokens, 1k KV cache
- "16q8s1k" # 16 requests, 8 spec tokens, 1k KV cache
- "32q4s2k" # 32 requests, 4 spec tokens, 2k KV cache
- "8q8s4k" # 8 requests, 8 spec tokens, 4k KV cache
# Context extension (chunked prefill)
- "q1ks2k" # 1k query, 2k sequence
- "2q1ks4k" # 2 requests: 1k query, 4k sequence
# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER
backends:
- FLASH_ATTN
- TRITON_ATTN
- FLASHINFER
device: "cuda:0"
repeats: 5
warmup_iters: 3
profile_memory: false
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
MLA benchmark runner - shared utilities for MLA benchmarks.
This module provides helpers for running MLA backends without
needing full VllmConfig integration.
"""
import numpy as np
import torch
from batch_spec import parse_batch_spec
from common import (
BenchmarkResult,
MockHfConfig,
MockIndexer,
MockKVBProj,
MockLayer,
setup_mla_dims,
)
from vllm.config import (
CacheConfig,
CompilationConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
# ============================================================================
# VllmConfig Creation
# ============================================================================
def _add_mock_methods_to_model_config(model_config: ModelConfig) -> None:
"""
Add mock methods for layer-specific queries to ModelConfig.
These methods are needed by metadata builders but aren't normally
present on ModelConfig when used in benchmark contexts.
"""
import types
model_config.get_num_layers = types.MethodType(lambda self: 1, model_config)
model_config.get_sliding_window_for_layer = types.MethodType(
lambda self, _i: None, model_config
)
model_config.get_logits_soft_cap_for_layer = types.MethodType(
lambda self, _i: None, model_config
)
model_config.get_sm_scale_for_layer = types.MethodType(
lambda self, _i: 1.0 / model_config.get_head_size() ** 0.5, model_config
)
def create_minimal_vllm_config(
model_name: str = "deepseek-v3",
block_size: int = 128,
max_num_seqs: int = 256,
mla_dims: dict | None = None,
index_topk: int | None = None,
) -> VllmConfig:
"""
Create minimal VllmConfig for MLA benchmarks.
Args:
model_name: Model name (deepseek-v2, deepseek-v3, etc.) - used if mla_dims not
provided
block_size: KV cache block size
max_num_seqs: Maximum number of sequences
mla_dims: Optional custom MLA dimensions dict. If not provided, uses
setup_mla_dims(model_name)
index_topk: Optional topk value for sparse MLA backends. If provided,
the config will include index_topk for sparse attention.
Returns:
VllmConfig for benchmarking
"""
# Get MLA dimensions - use provided or load from model name
if mla_dims is None:
mla_dims = setup_mla_dims(model_name)
# Create mock HF config first (avoids downloading from HuggingFace)
mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk)
# Create a temporary minimal config.json to avoid HF downloads
# This ensures consistent ModelConfig construction without network access
import json
import os
import shutil
import tempfile
minimal_config = {
"architectures": ["DeepseekV2ForCausalLM"],
"model_type": "deepseek_v2",
"num_attention_heads": mla_dims["num_q_heads"],
"num_key_value_heads": mla_dims["num_kv_heads"],
"hidden_size": mla_dims["head_dim"] * mla_dims["num_q_heads"],
"torch_dtype": "bfloat16",
"max_position_embeddings": 163840, # DeepSeek V3 default
"rope_theta": 10000.0,
"vocab_size": 128256,
}
# Create temporary directory with config.json
temp_dir = tempfile.mkdtemp(prefix="vllm_bench_")
config_path = os.path.join(temp_dir, "config.json")
with open(config_path, "w") as f:
json.dump(minimal_config, f)
try:
# Create model config using local path - no HF downloads
model_config = ModelConfig(
model=temp_dir, # Use local temp directory
tokenizer=None,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
seed=0,
max_model_len=32768,
quantization=None,
enforce_eager=False,
max_logprobs=20,
disable_sliding_window=False,
skip_tokenizer_init=True,
served_model_name=None,
limit_mm_per_prompt=None,
config_format="auto",
)
finally:
# Clean up temporary directory
shutil.rmtree(temp_dir, ignore_errors=True)
# Override with our mock config
model_config.hf_config = mock_hf_config
model_config.hf_text_config = mock_hf_config
# Add mock methods for layer-specific queries
_add_mock_methods_to_model_config(model_config)
# Create sub-configs
cache_config = CacheConfig(
block_size=block_size,
gpu_memory_utilization=0.9,
swap_space=0,
cache_dtype="auto",
enable_prefix_caching=False,
)
scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=8192,
max_model_len=32768,
is_encoder_decoder=False,
enable_chunked_prefill=True,
)
parallel_config = ParallelConfig(
tensor_parallel_size=1,
)
compilation_config = CompilationConfig()
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
compilation_config=compilation_config,
)
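# Example usage (illustrative; the dimension values below are hypothetical and
# only show the keys expected in an `mla_dims` dict):
#
#     custom_dims = {
#         "kv_lora_rank": 512,
#         "qk_nope_head_dim": 128,
#         "qk_rope_head_dim": 64,
#         "v_head_dim": 128,
#         "num_q_heads": 16,
#         "num_kv_heads": 1,
#         "head_dim": 576,
#     }
#     cfg = create_minimal_vllm_config(block_size=64, mla_dims=custom_dims)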
# ============================================================================
# Backend Configuration
# ============================================================================
# Backend-specific properties that can't be inferred from the backend class
# Keys are AttentionBackendEnum names (uppercase)
_BACKEND_PROPERTIES = {
"FLASHMLA": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
},
"FLASHMLA_SPARSE": {
"query_format": "concat", # Single concatenated tensor (vs tuple)
},
}
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration from AttentionBackendEnum.
Uses the registry to get the backend class and extract configuration
from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.).
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASHMLA_SPARSE")
Returns:
Dict with backend configuration
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [b.name for b in AttentionBackendEnum if b.name != "CUSTOM"]
raise ValueError(
f"Unknown backend: {backend}. "
f"Valid MLA backends: {[b for b in valid_backends if 'MLA' in b]}"
) from e
# Get block size from backend class
block_sizes = backend_class.get_supported_kernel_block_sizes()
# Use first supported block size (backends typically support one for MLA)
block_size = block_sizes[0] if block_sizes else None
if hasattr(block_size, "value"):
# Handle MultipleOf enum
block_size = None
# Check if sparse via class method if available
is_sparse = getattr(backend_class, "is_sparse", lambda: False)()
# Get properties that can't be inferred
props = _BACKEND_PROPERTIES.get(backend, {})
return {
"backend_class": backend_class,
"impl_class": backend_class.get_impl_cls(),
"builder_class": backend_class.get_builder_cls(),
"query_format": props.get("query_format", "tuple"),
"block_size": block_size,
"is_sparse": is_sparse,
}
# ============================================================================
# Metadata Building Helpers
# ============================================================================
def _build_attention_metadata(
requests: list,
block_size: int,
device: torch.device,
builder_instance,
) -> tuple:
"""
Build attention metadata from batch requests.
Args:
requests: List of BatchRequest objects
block_size: KV cache block size
device: Target device
builder_instance: Metadata builder instance
Returns:
Tuple of (metadata, kv_cache_num_blocks)
"""
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
# Build query start locations
q_start_cpu = torch.tensor(
[0] + [sum(q_lens[: i + 1]) for i in range(len(q_lens))],
dtype=torch.int32,
)
q_start_gpu = q_start_cpu.to(device)
# Build sequence lengths
seq_lens_cpu = torch.tensor(kv_lens, dtype=torch.int32)
seq_lens_gpu = seq_lens_cpu.to(device)
# Build num_computed_tokens (context length for each request)
context_lens = [kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens)]
num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)
# Build block table
num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens]
max_num_blocks = max(num_blocks_per_req)
block_table_cpu = np.zeros((len(requests), max_num_blocks), dtype=np.int32)
current_block = 0
for i, num_blocks in enumerate(num_blocks_per_req):
for j in range(num_blocks):
block_table_cpu[i, j] = current_block
current_block += 1
block_table_gpu = torch.from_numpy(block_table_cpu).to(device)
# Build slot mapping
slot_mapping_list = []
for i, (q_len, kv_len, num_blocks) in enumerate(
zip(q_lens, kv_lens, num_blocks_per_req)
):
context_len = kv_len - q_len
for j in range(q_len):
token_kv_idx = context_len + j
block_idx = token_kv_idx // block_size
offset_in_block = token_kv_idx % block_size
global_block_id = block_table_cpu[i, block_idx]
slot_id = global_block_id * block_size + offset_in_block
slot_mapping_list.append(slot_id)
slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64, device=device)
# Create CommonAttentionMetadata
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
common_attn_metadata = CommonAttentionMetadata(
num_reqs=len(requests),
max_query_len=max(q_lens),
max_seq_len=max_kv,
num_actual_tokens=total_q,
query_start_loc=q_start_gpu,
query_start_loc_cpu=q_start_cpu,
seq_lens=seq_lens_gpu,
_seq_lens_cpu=seq_lens_cpu,
_num_computed_tokens_cpu=num_computed_tokens_cpu,
slot_mapping=slot_mapping,
block_table_tensor=block_table_gpu,
dcp_local_seq_lens=None,
)
# Use the production build() method
metadata = builder_instance.build(
common_prefix_len=0,
common_attn_metadata=common_attn_metadata,
fast_build=False,
)
return metadata, current_block
def _create_input_tensors(
total_q: int,
mla_dims: dict,
query_format: str,
device: torch.device,
dtype: torch.dtype,
):
"""
Create input tensors for both decode and prefill modes.
MLA requires different tensor formats for decode vs prefill:
- Decode: Uses kv_lora_rank (512) dimension
- Prefill: Uses qk_nope_head_dim (128) to stay under FlashAttention's 256 limit
Args:
total_q: Total number of query tokens
mla_dims: MLA dimension configuration
query_format: Either "tuple" or "concat"
device: Target device
dtype: Tensor dtype
Returns:
Tuple of (decode_inputs, prefill_inputs)
- decode_inputs: Query tensor(s) for decode mode
- prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale' for prefill
"""
if query_format == "tuple":
# Decode mode format: (q_nope, q_pe) where q_nope has kv_lora_rank dim
q_nope_decode = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["kv_lora_rank"],
device=device,
dtype=dtype,
)
q_pe = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
decode_inputs = (q_nope_decode, q_pe)
# For prefill, we need q with qk_nope_head_dim instead of kv_lora_rank
q_nope_prefill = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_nope_head_dim"],
device=device,
dtype=dtype,
)
prefill_q = torch.cat([q_nope_prefill, q_pe], dim=-1)
else: # concat
decode_inputs = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
# For prefill with concat format
prefill_q = torch.randn(
total_q,
mla_dims["num_q_heads"],
mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
# Create additional inputs needed for prefill forward
k_c_normed = torch.randn(
total_q,
mla_dims["kv_lora_rank"],
device=device,
dtype=dtype,
)
k_pe = torch.randn(
total_q,
1, # Single head for MLA
mla_dims["qk_rope_head_dim"],
device=device,
dtype=dtype,
)
k_scale = torch.ones(1, device=device, dtype=torch.float32)
output = torch.zeros(
total_q,
mla_dims["num_q_heads"] * mla_dims["v_head_dim"],
device=device,
dtype=dtype,
)
prefill_inputs = {
"q": prefill_q,
"k_c_normed": k_c_normed,
"k_pe": k_pe,
"k_scale": k_scale,
"output": output,
}
return decode_inputs, prefill_inputs
# ============================================================================
# Backend Initialization
# ============================================================================
def _create_backend_impl(
backend_cfg: dict,
mla_dims: dict,
vllm_config: VllmConfig,
device: torch.device,
max_num_tokens: int = 8192,
index_topk: int | None = None,
):
"""
Create backend implementation instance.
Args:
backend_cfg: Backend configuration dict from _get_backend_config()
mla_dims: MLA dimension configuration
vllm_config: VllmConfig instance
device: Target device
max_num_tokens: Maximum number of tokens for sparse indexer buffer
index_topk: Topk value for sparse MLA backends
Returns:
Tuple of (impl, layer, builder_instance, indexer)
"""
# Get classes from backend config (already resolved by _get_backend_config)
impl_class = backend_cfg["impl_class"]
builder_class = backend_cfg["builder_class"]
# Calculate scale
scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"])
# Create mock kv_b_proj layer for prefill mode
mock_kv_b_proj = MockKVBProj(
num_heads=mla_dims["num_q_heads"],
qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
v_head_dim=mla_dims["v_head_dim"],
)
# Create indexer for sparse backends
indexer = None
if backend_cfg.get("is_sparse", False):
if index_topk is None:
index_topk = 2048 # Default topk for sparse MLA
indexer = MockIndexer(
max_num_tokens=max_num_tokens,
topk_tokens=index_topk,
device=device,
)
# Build impl kwargs
impl_kwargs = {
"num_heads": mla_dims["num_q_heads"],
"head_size": mla_dims["head_dim"],
"scale": scale,
"num_kv_heads": mla_dims["num_kv_heads"],
"alibi_slopes": None,
"sliding_window": None,
"kv_cache_dtype": "auto",
"logits_soft_cap": None,
"attn_type": "decoder",
"kv_sharing_target_layer_name": None,
"q_lora_rank": None,
"kv_lora_rank": mla_dims["kv_lora_rank"],
"qk_nope_head_dim": mla_dims["qk_nope_head_dim"],
"qk_rope_head_dim": mla_dims["qk_rope_head_dim"],
"qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"],
"v_head_dim": mla_dims["v_head_dim"],
"kv_b_proj": mock_kv_b_proj,
}
# Add indexer for sparse backends
if indexer is not None:
impl_kwargs["indexer"] = indexer
# Create impl
impl = impl_class(**impl_kwargs)
# Initialize DCP attributes
if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
impl.dcp_world_size = 1
impl.dcp_rank = 0
# Create KV cache spec for MockLayer
from vllm.v1.kv_cache_interface import FullAttentionSpec
kv_cache_spec = FullAttentionSpec(
block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size,
num_kv_heads=1,  # MLA uses a single latent KV head
head_size=mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],  # 576 for DeepSeek dims
dtype=torch.bfloat16,
)
# Create mock layer
layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec)
# Create builder instance if needed
builder_instance = None
if builder_class:
# Populate static_forward_context so builder can find the layer
# MockLayer inherits from AttentionLayerBase, so isinstance checks pass
vllm_config.compilation_config.static_forward_context = {"placeholder": layer}
builder_instance = builder_class(
kv_cache_spec=kv_cache_spec,
layer_names=["placeholder"],
vllm_config=vllm_config,
device=device,
)
return impl, layer, builder_instance, indexer
# ============================================================================
# Config Helpers
# ============================================================================
def _extract_mla_dims_from_config(config) -> dict | None:
"""
Extract MLA dimensions from BenchmarkConfig if all required fields are present.
Args:
config: BenchmarkConfig instance
Returns:
Dict with MLA dimensions if all fields are provided, None otherwise
"""
# Check if all MLA-specific fields are provided
if all(
[
config.kv_lora_rank is not None,
config.qk_nope_head_dim is not None,
config.qk_rope_head_dim is not None,
config.v_head_dim is not None,
]
):
return {
"kv_lora_rank": config.kv_lora_rank,
"qk_nope_head_dim": config.qk_nope_head_dim,
"qk_rope_head_dim": config.qk_rope_head_dim,
"v_head_dim": config.v_head_dim,
"num_q_heads": config.num_q_heads,
"num_kv_heads": config.num_kv_heads,
"head_dim": config.head_dim,
}
# Fallback: if MLA fields not fully specified, try to construct from basic fields
elif config.head_dim == 576:
# This looks like a DeepSeek MLA config, use standard dimensions with custom
# head count
return {
"kv_lora_rank": 512,
"qk_nope_head_dim": 128,
"qk_rope_head_dim": 64,
"v_head_dim": 128,
"num_q_heads": config.num_q_heads,
"num_kv_heads": config.num_kv_heads,
"head_dim": config.head_dim,
}
return None
# ============================================================================
# Benchmark Execution
# ============================================================================
def _run_single_benchmark(
config,
impl,
layer,
builder_instance,
backend_cfg: dict,
mla_dims: dict,
device: torch.device,
indexer=None,
) -> BenchmarkResult:
"""
Run a single benchmark iteration.
Args:
config: BenchmarkConfig instance
impl: Backend implementation instance
layer: MockLayer instance
builder_instance: Metadata builder instance
backend_cfg: Backend configuration dict
mla_dims: MLA dimension configuration
device: Target device
indexer: Optional MockIndexer for sparse backends
Returns:
BenchmarkResult with timing statistics
"""
# Parse batch spec
requests = parse_batch_spec(config.batch_spec)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv_len = max(kv_lens)
# Determine block size
block_size = backend_cfg["block_size"] or config.block_size
# Build metadata
metadata, num_blocks = _build_attention_metadata(
requests, block_size, device, builder_instance
)
# Create KV cache
kv_cache = torch.zeros(
num_blocks,
block_size,
mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
device=device,
dtype=torch.bfloat16,
)
# Create input tensors for both decode and prefill modes
decode_inputs, prefill_inputs = _create_input_tensors(
total_q,
mla_dims,
backend_cfg["query_format"],
device,
torch.bfloat16,
)
# Fill indexer with random indices for sparse backends
is_sparse = backend_cfg.get("is_sparse", False)
if is_sparse and indexer is not None:
indexer.fill_random_indices(total_q, max_kv_len)
# Determine which forward method to use
if is_sparse:
# Sparse backends use forward_mqa
forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
elif metadata.decode is not None:
forward_fn = lambda: impl._forward_decode(
decode_inputs, kv_cache, metadata, layer
)
elif metadata.prefill is not None:
forward_fn = lambda: impl._forward_prefill(
prefill_inputs["q"],
prefill_inputs["k_c_normed"],
prefill_inputs["k_pe"],
kv_cache,
metadata,
prefill_inputs["k_scale"],
prefill_inputs["output"],
)
else:
raise RuntimeError("Metadata has neither decode nor prefill metadata")
# Warmup
for _ in range(config.warmup_iters):
forward_fn()
torch.cuda.synchronize()
# Benchmark
times = []
for _ in range(config.repeats):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(config.num_layers):
forward_fn()
end.record()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers)  # seconds per layer
mean_time = float(np.mean(times))
return BenchmarkResult(
config=config,
mean_time=mean_time,
std_time=float(np.std(times)),
min_time=float(np.min(times)),
max_time=float(np.max(times)),
throughput_tokens_per_sec=total_q / mean_time if mean_time > 0 else 0,
)
def _run_mla_benchmark_batched(
backend: str,
configs_with_params: list[tuple], # [(config, threshold, num_splits), ...]
index_topk: int = 2048,
) -> list[BenchmarkResult]:
"""
Unified batched MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse
This function reuses backend initialization across multiple benchmarks
to avoid setup/teardown overhead.
Args:
backend: Backend name
configs_with_params: List of (config, threshold, num_splits) tuples
- threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
- num_splits: num_kv_splits (CUTLASS only)
index_topk: Topk value for sparse MLA backends (default 2048)
Returns:
List of BenchmarkResult objects
"""
if not configs_with_params:
return []
backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device)
torch.cuda.set_device(device)
# Determine block size
config_block_size = configs_with_params[0][0].block_size
block_size = backend_cfg["block_size"] or config_block_size
# Extract MLA dimensions from the first config
first_config = configs_with_params[0][0]
mla_dims = _extract_mla_dims_from_config(first_config)
# If config didn't provide MLA dims, fall back to default model
if mla_dims is None:
mla_dims = setup_mla_dims("deepseek-v3")
# Determine if this is a sparse backend
is_sparse = backend_cfg.get("is_sparse", False)
# Create and set vLLM config for MLA (reused across all benchmarks)
vllm_config = create_minimal_vllm_config(
model_name="deepseek-v3", # Used only for model path
block_size=block_size,
mla_dims=mla_dims, # Use custom dims from config or default
index_topk=index_topk if is_sparse else None,
)
results = []
with set_current_vllm_config(vllm_config):
# Create backend impl, layer, builder, and indexer (reused across benchmarks)
impl, layer, builder_instance, indexer = _create_backend_impl(
backend_cfg,
mla_dims,
vllm_config,
device,
index_topk=index_topk if is_sparse else None,
)
# Run each benchmark with the shared impl
for config, threshold, num_splits in configs_with_params:
# Set threshold for this benchmark (FlashAttn/FlashMLA only)
override_threshold = threshold is not None and builder_instance is not None
original_threshold = (
builder_instance.reorder_batch_threshold if override_threshold else None
)
if override_threshold:
builder_instance.reorder_batch_threshold = threshold
# Set num_splits for CUTLASS
override_num_splits = num_splits is not None and hasattr(impl, "_num_kv_splits")
original_num_splits = impl._num_kv_splits if override_num_splits else None
if override_num_splits:
impl._num_kv_splits = num_splits
try:
result = _run_single_benchmark(
config,
impl,
layer,
builder_instance,
backend_cfg,
mla_dims,
device,
indexer=indexer,
)
results.append(result)
finally:
# Restore original threshold (even if it was None)
if override_threshold:
builder_instance.reorder_batch_threshold = original_threshold
# Restore original num_splits (even if it was None)
if override_num_splits:
impl._num_kv_splits = original_num_splits
return results
# ============================================================================
# Public API
# ============================================================================
def run_mla_benchmark(
backend: str,
config,
reorder_batch_threshold: int | None = None,
num_kv_splits: int | None = None,
index_topk: int = 2048,
) -> BenchmarkResult | list[BenchmarkResult]:
"""
Unified MLA benchmark runner for all backends.
Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse
Always uses batched execution internally for optimal performance.
Args:
backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
flashinfer_mla_sparse, flashmla_sparse)
config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples
reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
(single config mode only)
num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
index_topk: Topk value for sparse MLA backends (default 2048)
Returns:
BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
"""
# Normalize to batched mode: (config, threshold, num_splits)
if isinstance(config, list):
# Already in batched format
if len(config) > 0 and isinstance(config[0], tuple):
# Format: [(cfg, param), ...] where param is threshold or num_splits
if backend in ("flashattn_mla", "flashmla", "flashmla_sparse"):
configs_with_params = [(cfg, param, None) for cfg, param in config]
else: # cutlass_mla, flashinfer_mla, or sparse backends
configs_with_params = [(cfg, None, param) for cfg, param in config]
else:
# Format: [cfg, ...] - just configs
configs_with_params = [(cfg, None, None) for cfg in config]
return_single = False
else:
# Single config: convert to batched format
configs_with_params = [(config, reorder_batch_threshold, num_kv_splits)]
return_single = True
# Use unified batched execution
results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk)
# Return single result or list based on input
return results[0] if return_single else results
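# Example usage (illustrative). "FLASHMLA" is one of the registry names used in
# _BACKEND_PROPERTIES above; the BenchmarkConfig constructor arguments are an
# assumption based on the fields this module reads (batch_spec, block_size,
# num_layers, device, repeats, warmup_iters).
#
#     from common import BenchmarkConfig
#
#     cfg = BenchmarkConfig(batch_spec="32q1s1k", block_size=64, num_layers=1,
#                           device="cuda:0", repeats=5, warmup_iters=3)
#     single = run_mla_benchmark("FLASHMLA", cfg)
#     swept = run_mla_benchmark("FLASHMLA", [cfg, cfg])  # batched mode: list of configs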
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Standard attention benchmark runner - shared utilities for non-MLA benchmarks.
This module provides helpers for running standard attention backends
(FlashAttention, Triton, FlashInfer) with real vLLM integration.
"""
import logging
import types
from contextlib import contextmanager
import numpy as np
import torch
from batch_spec import parse_batch_spec, reorder_for_flashinfer
from common import BenchmarkConfig, BenchmarkResult, MockLayer, get_attention_scale
from vllm.config import (
CacheConfig,
CompilationConfig,
DeviceConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
)
from vllm.v1.attention.backends.utils import (
CommonAttentionMetadata,
get_kv_cache_layout,
set_kv_cache_layout,
)
from vllm.v1.kv_cache_interface import FullAttentionSpec
# ============================================================================
# Backend Configuration
# ============================================================================
def _get_backend_config(backend: str) -> dict:
"""
Get backend configuration from AttentionBackendEnum.
Args:
backend: Backend name matching AttentionBackendEnum exactly
(e.g., "FLASH_ATTN", "TRITON_ATTN", "FLASHINFER")
Returns:
Dict with backend_class
"""
from vllm.v1.attention.backends.registry import AttentionBackendEnum
try:
backend_enum = AttentionBackendEnum[backend]
backend_class = backend_enum.get_class()
except (KeyError, ValueError) as e:
valid_backends = [b.name for b in AttentionBackendEnum if b.name != "CUSTOM"]
raise ValueError(
f"Unknown backend: {backend}. Valid backends: {valid_backends}"
) from e
return {"backend_class": backend_class}
@contextmanager
def log_warnings_and_errors_only():
"""Temporarily set vLLM logger to WARNING level."""
logger = logging.getLogger("vllm")
old_level = logger.level
logger.setLevel(logging.WARNING)
try:
yield
finally:
logger.setLevel(old_level)
# ============================================================================
# Metadata Building Helpers
# ============================================================================
def _build_common_attn_metadata(
q_lens: list[int],
kv_lens: list[int],
block_size: int,
device: torch.device,
) -> CommonAttentionMetadata:
"""Build CommonAttentionMetadata from query/kv lengths."""
batch_size = len(q_lens)
total_tokens = sum(q_lens)
query_start_loc = torch.zeros(batch_size + 1, dtype=torch.int32, device=device)
query_start_loc[1:] = torch.tensor(q_lens, dtype=torch.int32, device=device).cumsum(
0
)
query_start_loc_cpu = query_start_loc.cpu()
seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device)
max_seq_len = int(seq_lens.max().item())
max_blocks = (max(kv_lens) + block_size - 1) // block_size
num_blocks = batch_size * max_blocks
block_table_tensor = torch.arange(
num_blocks, dtype=torch.int32, device=device
).view(batch_size, max_blocks)
slot_mapping = torch.arange(total_tokens, dtype=torch.int64, device=device)
max_query_len = max(q_lens)
return CommonAttentionMetadata(
query_start_loc=query_start_loc,
query_start_loc_cpu=query_start_loc_cpu,
seq_lens=seq_lens,
num_reqs=batch_size,
num_actual_tokens=total_tokens,
max_query_len=max_query_len,
max_seq_len=max_seq_len,
block_table_tensor=block_table_tensor,
slot_mapping=slot_mapping,
causal=True,
)
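# Example (illustrative): for q_lens=[1024, 1, 1] this produces
# query_start_loc = [0, 1024, 1025, 1026]; each request gets one row of
# consecutive block IDs in the block table, and slot_mapping simply
# enumerates the new tokens as 0..total_tokens-1.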
def _create_vllm_config(
config: BenchmarkConfig,
max_num_blocks: int,
) -> VllmConfig:
"""Create a VllmConfig for benchmarking with mock model methods."""
model_config = ModelConfig(
model="meta-llama/Meta-Llama-3-8B",
tokenizer="meta-llama/Meta-Llama-3-8B",
trust_remote_code=False,
dtype="auto", # Use model's native dtype
seed=0,
max_model_len=1024,
)
cache_config = CacheConfig(
block_size=config.block_size,
cache_dtype="auto",
swap_space=0,
)
cache_config.num_gpu_blocks = max_num_blocks
cache_config.num_cpu_blocks = 0
parallel_config = ParallelConfig(tensor_parallel_size=1)
scheduler_config = SchedulerConfig(
max_num_seqs=256,
max_num_batched_tokens=8192,
max_model_len=8192,
is_encoder_decoder=False,
enable_chunked_prefill=True,
)
device_config = DeviceConfig()
load_config = LoadConfig()
compilation_config = CompilationConfig()
# Add mock methods for benchmark config values
model_config.get_num_layers = types.MethodType(
lambda self: config.num_layers, model_config
)
model_config.get_sliding_window_for_layer = types.MethodType(
lambda self, i: None, model_config
)
model_config.get_logits_soft_cap_for_layer = types.MethodType(
lambda self, i: 0.0, model_config
)
model_config.get_sm_scale_for_layer = types.MethodType(
lambda self, i: 1.0 / config.head_dim**0.5, model_config
)
model_config.get_num_attention_heads = types.MethodType(
lambda self, parallel_config=None: config.num_q_heads, model_config
)
model_config.get_num_kv_heads = types.MethodType(
lambda self, parallel_config=None: config.num_kv_heads, model_config
)
model_config.get_head_size = types.MethodType(
lambda self: config.head_dim, model_config
)
model_config.get_sliding_window = types.MethodType(lambda self: None, model_config)
return VllmConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
device_config=device_config,
load_config=load_config,
compilation_config=compilation_config,
)
# ============================================================================
# Backend Initialization
# ============================================================================
def _create_backend_impl(
backend_cfg: dict,
config: BenchmarkConfig,
device: torch.device,
dtype: torch.dtype,
):
"""Create backend implementation instance."""
backend_class = backend_cfg["backend_class"]
scale = get_attention_scale(config.head_dim)
impl = backend_class.get_impl_cls()(
num_heads=config.num_q_heads,
head_size=config.head_dim,
scale=scale,
num_kv_heads=config.num_kv_heads,
alibi_slopes=None,
sliding_window=None,
kv_cache_dtype="auto",
)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
layer = MockLayer(device, kv_cache_spec=kv_cache_spec)
return backend_class, impl, layer
def _create_metadata_builder(
backend_class,
kv_cache_spec: FullAttentionSpec,
vllm_config: VllmConfig,
device: torch.device,
backend_name: str = "",
):
"""Create metadata builder instance."""
layer_names = ["layer_0"]
builder_cls = backend_class.get_builder_cls()
# Flashinfer needs get_per_layer_parameters mocked since we don't have
# real model layers registered
if backend_name == "FLASHINFER":
import unittest.mock
from vllm.v1.attention.backends.utils import PerLayerParameters
def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls):
head_size = vllm_config.model_config.get_head_size()
return {
layer_name: PerLayerParameters(
window_left=-1, # No sliding window
logits_soft_cap=0.0, # No soft cap
sm_scale=1.0 / (head_size**0.5), # Standard scale
)
for layer_name in layer_names
}
with unittest.mock.patch(
"vllm.v1.attention.backends.flashinfer.get_per_layer_parameters",
mock_get_per_layer_parameters,
):
return builder_cls(
kv_cache_spec=kv_cache_spec,
layer_names=layer_names,
vllm_config=vllm_config,
device=device,
)
return builder_cls(
kv_cache_spec=kv_cache_spec,
layer_names=layer_names,
vllm_config=vllm_config,
device=device,
)
# ============================================================================
# Tensor Creation Helpers
# ============================================================================
def _create_input_tensors(
config: BenchmarkConfig,
total_q: int,
device: torch.device,
dtype: torch.dtype,
) -> tuple:
"""Create Q, K, V input tensors for all layers."""
q_list = [
torch.randn(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
k_list = [
torch.randn(
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
v_list = [
torch.randn(
total_q, config.num_kv_heads, config.head_dim, device=device, dtype=dtype
)
for _ in range(config.num_layers)
]
return q_list, k_list, v_list
def _create_kv_cache(
config: BenchmarkConfig,
max_num_blocks: int,
backend_class,
device: torch.device,
dtype: torch.dtype,
) -> list:
"""Create KV cache tensors for all layers using the backend's methods.
Uses the backend's get_kv_cache_shape() and get_kv_cache_stride_order()
to create the cache with the correct shape and memory layout.
"""
# Get the logical shape from the backend
cache_shape = backend_class.get_kv_cache_shape(
num_blocks=max_num_blocks,
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
)
# Get the stride order for custom memory layout
try:
stride_order = backend_class.get_kv_cache_stride_order()
assert len(stride_order) == len(cache_shape)
except (AttributeError, NotImplementedError):
stride_order = tuple(range(len(cache_shape)))
# Permute shape to physical layout order
physical_shape = tuple(cache_shape[i] for i in stride_order)
# Compute inverse permutation to get back to logical view
inv_order = [stride_order.index(i) for i in range(len(stride_order))]
cache_list = []
for _ in range(config.num_layers):
# Allocate in physical layout order (contiguous in memory)
cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
# Permute to logical view
cache = cache.permute(*inv_order)
cache_list.append(cache)
return cache_list
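# Example (illustrative, hypothetical backend): if get_kv_cache_shape() returns
# (num_blocks, 2, block_size, num_kv_heads, head_size) and
# get_kv_cache_stride_order() returns (0, 1, 3, 2, 4), the cache is allocated
# contiguously with num_kv_heads placed before block_size (an HND-style layout)
# and then permuted back so callers still index it with the logical shape.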
# ============================================================================
# Benchmark Execution
# ============================================================================
def _run_single_benchmark(
config: BenchmarkConfig,
impl,
layer,
q_list: list,
k_list: list,
v_list: list,
cache_list: list,
attn_metadata,
device: torch.device,
dtype: torch.dtype,
) -> tuple:
"""Run single benchmark iteration with warmup and timing loop."""
total_q = q_list[0].shape[0]
out = torch.empty(
total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
)
# Warmup
for _ in range(config.warmup_iters):
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
torch.cuda.synchronize()
# Benchmark
times = []
for _ in range(config.repeats):
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for i in range(config.num_layers):
impl.forward(
layer,
q_list[i],
k_list[i],
v_list[i],
cache_list[i],
attn_metadata,
output=out,
)
end.record()
torch.cuda.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
mem_stats = {}
if config.profile_memory:
mem_stats = {
"allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
"reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
}
return times, mem_stats
# ============================================================================
# Public API
# ============================================================================
def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
"""
Run standard attention benchmark with real kernels.
Supports: FLASH_ATTN, TRITON_ATTN, FLASHINFER
Args:
config: Benchmark configuration
Returns:
BenchmarkResult with timing and memory statistics
"""
device = torch.device(config.device)
torch.cuda.set_device(device)
backend_cfg = _get_backend_config(config.backend)
requests = parse_batch_spec(config.batch_spec)
if config.backend == "FLASHINFER":
requests = reorder_for_flashinfer(requests)
q_lens = [r.q_len for r in requests]
kv_lens = [r.kv_len for r in requests]
total_q = sum(q_lens)
max_kv = max(kv_lens)
batch_size = len(q_lens)
# Calculate total blocks needed: batch_size * max_blocks_per_request
max_blocks_per_request = (max_kv + config.block_size - 1) // config.block_size
max_num_blocks = batch_size * max_blocks_per_request
# Suppress vLLM logs during setup to reduce spam
with log_warnings_and_errors_only():
# Create vllm_config first - uses model's native dtype via "auto"
vllm_config = _create_vllm_config(config, max_num_blocks)
dtype = vllm_config.model_config.dtype
# Wrap everything in set_current_vllm_config context
# This is required for backends like flashinfer that need global config
with set_current_vllm_config(vllm_config):
backend_class, impl, layer = _create_backend_impl(
backend_cfg, config, device, dtype
)
# Set KV cache layout if the backend requires a specific one
# (e.g., FlashInfer requires HND on SM100/Blackwell for TRTLLM attention)
required_layout = backend_class.get_required_kv_cache_layout()
if required_layout is not None:
set_kv_cache_layout(required_layout)
get_kv_cache_layout.cache_clear()
common_metadata = _build_common_attn_metadata(
q_lens, kv_lens, config.block_size, device
)
kv_cache_spec = FullAttentionSpec(
block_size=config.block_size,
num_kv_heads=config.num_kv_heads,
head_size=config.head_dim,
dtype=dtype,
)
builder = _create_metadata_builder(
backend_class, kv_cache_spec, vllm_config, device, config.backend
)
attn_metadata = builder.build(
common_prefix_len=0,
common_attn_metadata=common_metadata,
)
q_list, k_list, v_list = _create_input_tensors(
config, total_q, device, dtype
)
cache_list = _create_kv_cache(
config, max_num_blocks, backend_class, device, dtype
)
times, mem_stats = _run_single_benchmark(
config,
impl,
layer,
q_list,
k_list,
v_list,
cache_list,
attn_metadata,
device,
dtype,
)
mean_time = np.mean(times)
throughput = total_q / mean_time if mean_time > 0 else 0
return BenchmarkResult(
config=config,
mean_time=mean_time,
std_time=np.std(times),
min_time=np.min(times),
max_time=np.max(times),
throughput_tokens_per_sec=throughput,
memory_allocated_mb=mem_stats.get("allocated_mb"),
memory_reserved_mb=mem_stats.get("reserved_mb"),
)
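# Example usage (illustrative). The BenchmarkConfig constructor arguments are an
# assumption based on the fields run_attention_benchmark reads above (backend,
# batch_spec, block_size, head/layer counts, device, repeats, warmup_iters).
#
#     cfg = BenchmarkConfig(
#         backend="FLASH_ATTN",
#         batch_spec="2q1k_32q1s1k",  # 2 prefill + 32 decode requests
#         block_size=16,
#         num_q_heads=32, num_kv_heads=8, head_dim=128,
#         num_layers=4, device="cuda:0", repeats=5, warmup_iters=3,
#     )
#     result = run_attention_benchmark(cfg)
#     print(result.mean_time, result.throughput_tokens_per_sec)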
# Automated vLLM Server Parameter Tuning
This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate.
## Table of Contents
- [Prerequisites](#prerequisites)
- [Configuration](#configuration)
- [How to Run](#how-to-run)
- [Example Use Cases](#example-use-cases)
- [Output](#output)
- [How It Works](#how-it-works)
## Prerequisites
Before running the script, please ensure the following steps are completed:
1. **Clone vLLM & Set Up Branch**: Clone the vLLM repository and check out to your desired branch.
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
# git checkout <your-branch>
```
2. **Install Environment**: Install or update the correct running environment. For TPU usage, activate your `conda` environment and install the corresponding `torch` and `torch_xla` versions.
3. **Model Configuration**: If you are using a customized model, ensure its configuration files are correctly placed and accessible.
## Configuration
You must set the following variables at the top of the script before execution.
Note: You can also override the default values below via environment variables when running the script.
```bash
MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
```
| Variable | Description | Example Value |
| --- | --- | --- |
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
| `MODEL` | **Required.** The Hugging Face model identifier to be served by vLLM. | `"meta-llama/Llama-3.1-8B-Instruct"` |
| `SYSTEM` | **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (Other systems may not support saving profiler output.) | `"TPU"` |
| `TP` | **Required.** The tensor-parallelism size. | `1` |
| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) |
| `INPUT_LEN` | **Required.** Request input length. | `4000` |
| `OUTPUT_LEN` | **Required.** Request output length. | `16` |
| `MAX_MODEL_LEN` | **Required.** Max model length. | `4096` |
| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` |
| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. | `500` |
| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` |
| `NUM_BATCHED_TOKENS_LIST` | A space-separated string of `max-num-batched-tokens` values to test. | `"1024 2048 4096"` |
**Note**: The default `NUM_SEQS_LIST` and `NUM_BATCHED_TOKENS_LIST` are set for medium-sized inputs/outputs. For very short contexts (e.g., 20 input, 20 output tokens), you may need to test larger values for `max-num-seqs`.
## How to Run
1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section.
2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost.
```bash
cd <FOLDER_OF_THIS_SCRIPT>
bash auto_tune.sh
```
Please note that the path used to invoke `bash auto_tune.sh` must not contain the keyword `vllm` (in full or in part); otherwise the script's `pkill -f vllm` cleanup command will also kill the script itself.
## Example Use Cases
Here are a few examples of how to configure the script for different goals:
### 1. Maximize Throughput (No Latency Constraint)
- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens.
- **Configuration**:
```bash
INPUT_LEN=1800
OUTPUT_LEN=20
MAX_MODEL_LEN=2048
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
```
### 2. Maximize Throughput with a Latency Requirement
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**:
```bash
INPUT_LEN=1800
OUTPUT_LEN=20
MAX_MODEL_LEN=2048
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=500
```
### 3. Maximize Throughput with Prefix Caching and Latency Requirements
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**:
```bash
INPUT_LEN=1800
OUTPUT_LEN=20
MAX_MODEL_LEN=2048
MIN_CACHE_HIT_PCT=60
MAX_LATENCY_ALLOWED_MS=500
```
## Output
After the script finishes, you will find the results in a new, timestamped directory created inside `$BASE/auto-benchmark/`.
- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run:
- `vllm_log_...txt`: The log output from the vLLM server for each parameter combination.
- `bm_log_...txt`: The log output from the `vllm bench serve` command for each benchmark run.
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
```text
# Example result.txt content
hash:a1b2c3d4...
max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8
max_num_seqs: 128, max_num_batched_tokens: 4096 does not meet latency requirement 500
...
best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, profile saved in: /home/user/vllm/auto-benchmark/2024_08_01_10_30/profile
```
If it cannot find the best parameters, the final row will be `best_max_num_seqs: 0, best_num_batched_tokens: 0, best_throughput: 0`. This can be due to either the server not starting properly, or the latency requirement being too strict.
- **Profiler Trace**: A directory named `profile` is created inside the log directory. It contains the profiler trace file (e.g., `.xplane.pb` for TPU or a `.json` trace for GPU) from the single best-performing run.
## How It Works
The script follows a systematic process to find the optimal parameters:
1. **Find Max GPU Memory Utilization**: The script first determines the highest safe `gpu-memory-utilization` (starting from 0.98 and decreasing) that does not cause an Out-Of-Memory (OOM) error when launching the server. This ensures the benchmark runs use the maximum available memory without crashing.
2. **Iterate and Benchmark**: It then enters a nested loop, iterating through every combination of `max-num-seqs` and `max-num-batched-tokens` provided in the configuration lists.
3. **Latency-Aware Throughput Search**: For each parameter combination:
- The vLLM server is started.
- A benchmark is first run with an infinite request rate (`--request-rate inf`).
- If the resulting P99 E2E latency is within the `MAX_LATENCY_ALLOWED_MS` limit, this throughput is considered the maximum for this configuration.
    - If the latency is too high, the script performs a search by iteratively decreasing the request rate until the latency constraint is met. This finds the highest sustainable throughput for the given parameters and latency requirement; a simplified sketch of this search loop appears after this list.
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
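The sketch below is an illustrative rendering of the latency-aware search in step 3. Here `run_bench` is a stand-in for the real `vllm bench serve` invocation and just emits fixed dummy numbers so the control flow can be followed; the full logic, including logging, goodput tracking, and server restarts, lives in `auto_tune.sh`.

```bash
#!/bin/bash
# Illustrative sketch only; see auto_tune.sh for the real implementation.
MAX_LATENCY_ALLOWED_MS=500

run_bench() {        # $1 = request rate (stand-in for `vllm bench serve`)
    e2el=450.0       # P99 end-to-end latency the benchmark would report
    throughput=9.8   # request throughput the benchmark would report
}

run_bench inf
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
    request_rate=inf                        # latency already fits at max load
else
    request_rate=$((${throughput%.*} + 1))  # start just above observed throughput
    while (( request_rate > 0 )); do
        run_bench "$request_rate"
        (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )) && break
        request_rate=$((request_rate - 1))
    done
fi
echo "highest sustainable request rate: $request_rate"
```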
## Batched `auto_tune`
The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
### Prerequisites
- **jq**: This script requires `jq` to parse the JSON configuration file.
- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
### How to Run
1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
2. **Execute the script**:
```bash
bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
```
- `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
- `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
### Configuration File
The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
Here is an example `runs_config.json` with two benchmark configurations (set `system` to `TPU` or `GPU` as appropriate; since the file is parsed with `jq`, it must be valid JSON and cannot contain comments):
```json
[
{
"base": "/home/user",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"system": "TPU", # OR GPU
"tp": 8,
"input_len": 128,
"output_len": 2048,
"max_model_len": 2300,
"num_seqs_list": "128 256",
"num_batched_tokens_list": "8192 16384"
},
{
"base": "/home/user",
"model": "meta-llama/Llama-3.1-70B-Instruct",
"system": "TPU", # OR GPU
"tp": 8,
"input_len": 4000,
"output_len": 16,
"max_model_len": 4096,
"num_seqs_list": "64 128",
"num_batched_tokens_list": "4096 8192",
"max_latency_allowed_ms": 500
}
]
```
### Output
The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
- `run_id`: A unique identifier for the run, derived from the timestamp.
- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
A summary of successful and failed runs is also printed to the console upon completion.
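Because the script writes results back into the same file, a quick summary can be pulled from the updated JSON with `jq` once the batch completes. This is only an example query; `runs_config.json` is the example filename used above, and `status`, `run_id`, and `model` are the fields documented in this section.

```bash
# One line per run: status, run ID, and model (placeholders for runs not yet executed).
jq -r '.[] | "\(.status // "PENDING")\t\(.run_id // "-")\t\(.model)"' runs_config.json
```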
#!/bin/bash
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
# See details in README (benchmarks/auto_tune/README.md).
TAG=$(date +"%Y_%m_%d_%H_%M")
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
BASE=${BASE:-"$SCRIPT_DIR/../../.."}
MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
SYSTEM=${SYSTEM:-"TPU"}
TP=${TP:-1}
DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
INPUT_LEN=${INPUT_LEN:-4000}
OUTPUT_LEN=${OUTPUT_LEN:-16}
MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
HOSTNAME=$(hostname)
if [[ -z "$HOSTNAME" ]]; then
echo "Error: Failed to determine hostname." >&2
exit 1
fi
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"
echo "====================== AUTO TUNE PARAMETERS ===================="
echo "SCRIPT_DIR=$SCRIPT_DIR"
echo "BASE=$BASE"
echo "MODEL=$MODEL"
echo "SYSTEM=$SYSTEM"
echo "TP=$TP"
echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
echo "INPUT_LEN=$INPUT_LEN"
echo "OUTPUT_LEN=$OUTPUT_LEN"
echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
echo "RESULT_FILE=$RESULT"
echo "====================== AUTO TUNEPARAMETERS ===================="
rm -rf "$LOG_FOLDER"
rm -rf "$PROFILE_PATH"
mkdir -p "$LOG_FOLDER"
mkdir -p "$PROFILE_PATH"
cd "$BASE/vllm"
pip install -q datasets
current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"
TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
RED='\033[0;31m'
if (( TOTAL_LEN > MAX_MODEL_LEN )); then
echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
exit 1
fi
best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
best_request_rate=0
start_server() {
local gpu_memory_utilization=$1
local max_num_seqs=$2
local max_num_batched_tokens=$3
local vllm_log=$4
local profile_dir=$5
pkill -if "vllm serve" || true
# Define the common arguments as a bash array.
# Each argument and its value are separate elements.
local common_args_array=(
"$MODEL"
"--port" "8004"
"--host" "$HOSTNAME"
"--gpu-memory-utilization" "$gpu_memory_utilization"
"--max-num-seqs" "$max_num_seqs"
"--max-num-batched-tokens" "$max_num_batched_tokens"
"--tensor-parallel-size" "$TP"
"--enable-prefix-caching"
"--load-format" "dummy"
"--download-dir" "$DOWNLOAD_DIR"
"--max-model-len" "$MAX_MODEL_LEN"
)
# Use the array expansion "${common_args_array[@]}"
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
VLLM_SERVER_DEV_MODE=1 \
vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
local server_pid=$!
# wait for 10 minutes...
server_started=0
for _ in {1..60}; do
# Check whether the server process is still alive; we launched it ourselves,
# so we always have permission to send it a signal.
kill -0 $server_pid 2> /dev/null || break
RESPONSE=$(curl -s -X GET "http://${HOSTNAME}:8004/health" -w "%{http_code}" -o /dev/stdout)
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1
break
else
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
return 1
else
return 0
fi
}
run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo
rm -f "$vllm_log"
pkill -if "vllm serve" || true
echo "starting server..."
# Call start_server without a profile_dir to avoid profiling overhead
start_server "$gpu_memory_utilization" "$max_num_seqs" "$max_num_batched_tokens" "$vllm_log" ""
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
else
echo "server started."
fi
echo
echo "run benchmark test..."
meet_latency_requirement=0
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
# --profile flag is removed from this call
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--ignore-eos \
--disable-tqdm \
--request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1
request_rate=inf
fi
if (( ! meet_latency_requirement )); then
# start the search from a request rate of int(throughput) + 1
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
# clear prefix cache
curl -X POST http://"${HOSTNAME}":8004/reset_prefix_cache
sleep 5
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--ignore-eos \
--disable-tqdm \
--request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1
break
fi
request_rate=$((request_rate-1))
done
fi
# write the results and update the best result.
if ((meet_latency_requirement)); then
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
best_throughput=$throughput
best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput
best_request_rate=$request_rate
fi
else
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
fi
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
pkill -if "vllm serve" || true
sleep 10
echo "===================="
return 0
}
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
# first find out the max gpu-memory-utilization without HBM OOM.
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
# Pass empty string for profile_dir argument
start_server "$gpu_memory_utilization" "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
break
else
gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
fi
done
if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
exit 1
fi
for num_seqs in "${num_seqs_list[@]}"; do
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
run_benchmark "$num_seqs" "$num_batched_tokens" "$gpu_memory_utilization"
done
done
echo "finish permutations"
# =================================================================================
# FINAL PROFILING RUN FOR THE BEST CONFIGURATION
# =================================================================================
if (( $(echo "$best_throughput > 0" | bc -l) )); then
echo
echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput, goodput: $best_goodput"
echo
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt"
# Start server with the best params and profiling ENABLED
echo "Starting server for profiling..."
start_server "$gpu_memory_utilization" "$best_max_num_seqs" "$best_num_batched_tokens" "$vllm_log" "$PROFILE_PATH"
# Run benchmark with the best params and the --profile flag
echo "Running benchmark with profiling..."
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
vllm bench serve \
--backend vllm \
--model "$MODEL" \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len "$OUTPUT_LEN" \
--ignore-eos \
--disable-tqdm \
--request-rate "$best_request_rate" \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:"$MAX_LATENCY_ALLOWED_MS" \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--host "$HOSTNAME" \
--port 8004 \
--profile &> "$bm_log"
else
echo "No configuration met the latency requirements. Skipping final profiling run."
fi
pkill -if "vllm serve" || true
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
#!/bin/bash
INPUT_JSON="$1"
GCS_PATH="$2" # Optional GCS path for uploading results for each run
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
if [[ -z "$INPUT_JSON" ]]; then
echo "Error: Input JSON file not provided."
echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
exit 1
fi
if [[ ! -f "$INPUT_JSON" ]]; then
echo "Error: File not found at '$INPUT_JSON'"
exit 1
fi
if ! command -v jq &> /dev/null; then
echo "Error: 'jq' command not found. Please install jq to process the JSON input."
exit 1
fi
if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
exit 1
fi
SUCCESS_COUNT=0
FAILURE_COUNT=0
FAILED_RUNS=()
SCRIPT_START_TIME=$(date +%s)
json_content=$(cat "$INPUT_JSON")
if ! num_runs=$(echo "$json_content" | jq 'length'); then
echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
exit 1
fi
echo "Found $num_runs benchmark configurations in $INPUT_JSON."
echo "Starting benchmark runs..."
echo "--------------------------------------------------"
for i in $(seq 0 $(($num_runs - 1))); do
run_object=$(echo "$json_content" | jq ".[$i]")
RUN_START_TIME=$(date +%s)
ENV_VARS_ARRAY=()
# Dynamically create env vars from the JSON object's keys
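# Hedged example (hypothetical entry): a JSON object such as
#   {"model": "meta-llama/Llama-3.1-8B-Instruct", "input_len": 1800, "max_latency_allowed_ms": 500}
# is exported to auto_tune.sh as:
#   MODEL=meta-llama/Llama-3.1-8B-Instruct INPUT_LEN=1800 MAX_LATENCY_ALLOWED_MS=500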
for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
value=$(echo "$run_object" | jq -r ".$key")
var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
ENV_VARS_ARRAY+=("${var_name}=${value}")
done
echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
# Execute auto_tune.sh and capture output
RUN_OUTPUT_FILE=$(mktemp)
if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
STATUS="SUCCESS"
((SUCCESS_COUNT++))
else
STATUS="FAILURE"
((FAILURE_COUNT++))
FAILED_RUNS+=("Run #$((i+1)): $(echo "$run_object" | jq -c .)")
fi
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
rm "$RUN_OUTPUT_FILE"
# Parse results and optionally upload them to GCS
RUN_ID=""
RESULTS=""
GCS_RESULTS_URL=""
if [[ "$STATUS" == "SUCCESS" ]]; then
RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)
if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
RESULTS=$(cat "$RESULT_FILE_PATH")
if [[ -n "$GCS_PATH" ]]; then
GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
echo "Uploading results to GCS..."
if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
echo "GCS upload successful."
else
echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
fi
fi
else
echo "Warning: Could not find result file for a successful run."
STATUS="WARNING_NO_RESULT_FILE"
fi
fi
# Add the results back into the JSON object for this run
json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
'.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
RUN_END_TIME=$(date +%s)
echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
echo "--------------------------------------------------"
# Save intermediate progress back to the file
echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
done
SCRIPT_END_TIME=$(date +%s)
echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
echo
echo "====================== SUMMARY ======================"
echo "Successful runs: $SUCCESS_COUNT"
echo "Failed runs: $FAILURE_COUNT"
echo "==================================================="
if [[ $FAILURE_COUNT -gt 0 ]]; then
echo "Details of failed runs (see JSON file for full parameters):"
for failed in "${FAILED_RUNS[@]}"; do
echo " - $failed"
done
fi
echo "Updated results have been saved to '$INPUT_JSON'."
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
# NOTE(simon): do not import vLLM here so the benchmark script
# can run without vLLM installed.
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@dataclass
class RequestFuncInput:
prompt: str
api_url: str
prompt_len: int
output_len: int
model: str
model_name: str | None = None
logprobs: int | None = None
extra_body: dict | None = None
multi_modal_content: dict | list[dict] | None = None
ignore_eos: bool = False
language: str | None = None
request_id: str | None = None
@dataclass
class RequestFuncOutput:
generated_text: str = ""
success: bool = False
latency: float = 0.0
output_tokens: int | None = 0
ttft: float = 0.0 # Time to first token
itl: list[float] = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
error: str = ""
async def async_request_tgi(
request_func_input: RequestFuncInput,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session:
params = {
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
"truncate": request_func_input.prompt_len,
"ignore_eos_token": request_func_input.ignore_eos,
}
payload = {
"inputs": request_func_input.prompt,
"parameters": params,
}
headers = None
if request_func_input.request_id:
headers = {"x-request-id": request_func_input.request_id}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
if request_func_input.ignore_eos:
output.output_tokens = request_func_input.output_len
else:
output.output_tokens = None
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(
url=api_url, json=payload, headers=headers
) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: TGI sometimes returns a ping response
# without any data; skip it.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_trt_llm(
request_func_input: RequestFuncInput,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session:
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
"temperature": 0.0,
"top_p": 1.0,
"max_tokens": request_func_input.output_len,
"stream": True,
}
if request_func_input.ignore_eos:
payload["min_length"] = request_func_input.output_len
headers = None
if request_func_input.request_id:
headers = {"x-request-id": request_func_input.request_id}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(
url=api_url, json=payload, headers=headers
) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_deepspeed_mii(
request_func_input: RequestFuncInput,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
"OpenAI Completions API URL must end with 'completions' or 'profile'."
)
async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session:
payload = {
"model": request_func_input.model,
"prompt": request_func_input.prompt,
"max_tokens": request_func_input.output_len,
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
"top_p": 1.0,
}
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
if request_func_input.request_id:
headers["x-request-id"] = request_func_input.request_id
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
# NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
# will use 0 as placeholder.
# See https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0
st = time.perf_counter()
try:
async with session.post(
url=api_url, json=payload, headers=headers
) as response:
if response.status == 200:
parsed_resp = await response.json()
output.latency = time.perf_counter() - st
if "choices" in parsed_resp:
output.generated_text = parsed_resp["choices"][0]["text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = (
"Unexpected response format: "
"neither 'choices' nor 'text' found"
)
output.success = False
output.success = True
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_completions(
request_func_input: RequestFuncInput,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("completions", "profile")), (
"OpenAI Completions API URL must end with 'completions' or 'profile'."
)
async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session:
payload = {
"model": request_func_input.model_name
if request_func_input.model_name
else request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"repetition_penalty": 1.0,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
"stream_options": {
"include_usage": True,
},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
if request_func_input.request_id:
headers["x-request-id"] = request_func_input.request_id
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(
url=api_url, json=payload, headers=headers
) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
data = json.loads(chunk)
# NOTE: Some Completions API servers send a final
# usage-summary chunk that contains no token, so we
# check that at least one token was generated.
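# Illustrative shape of such a usage-only chunk (assumed; exact fields may vary by server):
#   {"choices": [], "usage": {"prompt_tokens": 32, "completion_tokens": 128}}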
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
if usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!"
)
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), (
"OpenAI Chat Completions API URL must end with 'chat/completions'."
)
async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
mm_content = request_func_input.multi_modal_content
if isinstance(mm_content, list):
content.extend(mm_content)
elif isinstance(mm_content, dict):
content.append(mm_content)
else:
raise TypeError(
"multi_modal_content must be a dict or list[dict] for openai-chat"
)
payload = {
"model": request_func_input.model_name
if request_func_input.model_name
else request_func_input.model,
"messages": [
{"role": "user", "content": content},
],
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"stream_options": {
"include_usage": True,
},
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
if request_func_input.request_id:
headers["x-request-id"] = request_func_input.request_id
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(
url=api_url, json=payload, headers=headers
) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: SSE comments (often used as pings) start with a colon.
# These are not JSON data payloads and should be skipped.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get("completion_tokens")
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_audio(
request_func_input: RequestFuncInput,
pbar: tqdm | None = None,
) -> RequestFuncOutput:
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
api_url = request_func_input.api_url
assert api_url.endswith(("transcriptions", "translations")), (
"OpenAI Chat Completions API URL must end with 'transcriptions' "
)
"or `translations`."
async with aiohttp.ClientSession(
trust_env=True, timeout=AIOHTTP_TIMEOUT
) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
"model": request_func_input.model_name
if request_func_input.model_name
else request_func_input.model,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage": True,
"stream_continuous_usage_stats": True,
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
if request_func_input.request_id:
headers["x-request-id"] = request_func_input.request_id
# Send audio file
def to_bytes(y, sr):
buffer = io.BytesIO()
soundfile.write(buffer, y, sr, format="WAV")
buffer.seek(0)
return buffer
mm_audio = request_func_input.multi_modal_content
if not isinstance(mm_audio, dict) or "audio" not in mm_audio:
raise TypeError("multi_modal_content must be a dict containing 'audio'")
with to_bytes(*mm_audio["audio"]) as f:
form = aiohttp.FormData()
form.add_field("file", f, content_type="audio/wav")
for key, value in payload.items():
form.add_field(key, str(value))
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(
url=api_url, data=form, headers=headers
) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp
)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens"
)
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = most_recent_timestamp - st
else:
output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true":
from modelscope import snapshot_download
from vllm.model_executor.model_loader.weight_utils import get_lock
# Use file lock to prevent multiple processes from
# downloading the same model weights at the same time.
with get_lock(pretrained_model_name_or_path):
model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
)
return model_path
return pretrained_model_name_or_path
def get_tokenizer(
pretrained_model_name_or_path: str,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
**kwargs,
) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
if pretrained_model_name_or_path is not None and not os.path.exists(
pretrained_model_name_or_path
):
pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
from vllm.tokenizers.mistral import MistralTokenizer
except ImportError as e:
raise ImportError(
"MistralTokenizer requires vllm package.\n"
"Please install it with `pip install vllm` "
"to use mistral tokenizer mode."
) from e
return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
else:
return AutoTokenizer.from_pretrained(
pretrained_model_name_or_path,
trust_remote_code=trust_remote_code,
**kwargs,
)
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_openai_completions,
"lmdeploy": async_request_openai_completions,
"deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions,
"openai-audio": async_request_openai_audio,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
"llama.cpp": async_request_openai_completions,
}
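# Hedged usage sketch (illustrative only, not part of the benchmark suite): shows how a
# caller might drive one of the request functions above. The endpoint URL, port, model
# name, prompt, and token counts below are assumptions, not values used elsewhere.
if __name__ == "__main__":
    import asyncio

    async def _example() -> None:
        request = RequestFuncInput(
            prompt="Hello, world!",  # assumed prompt
            api_url="http://localhost:8000/v1/completions",  # assumed local server
            prompt_len=3,
            output_len=16,
            model="my-model",  # assumed model name
        )
        # "vllm" maps to async_request_openai_completions defined above.
        output = await ASYNC_REQUEST_FUNCS["vllm"](request)
        print(
            f"success={output.success} ttft={output.ttft:.3f}s "
            f"latency={output.latency:.3f}s error={output.error!r}"
        )

    asyncio.run(_example())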
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
This benchmark runs the same workload twice:
1. With VLLM_BATCH_INVARIANT=0 (baseline)
2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
and then reports the timing and throughput metrics for comparison.
Environment variables:
VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
Example usage:
# Benchmark qwen3 (default)
python benchmarks/benchmark_batch_invariance.py
# Benchmark deepseek with 8 GPUs
VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
python benchmarks/benchmark_batch_invariance.py
# Quick test with fewer trials
VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
python benchmarks/benchmark_batch_invariance.py
"""
import contextlib
import os
import random
import time
from vllm import LLM, SamplingParams
from vllm.platforms import current_platform
def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
"""Generate a random prompt for benchmarking."""
prompt_templates = [
"Question: What is the capital of France?\nAnswer: The capital of France is",
"Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
"User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
"Once upon a time in a distant galaxy, there lived",
"The old man walked slowly down the street, remembering",
"In the year 2157, humanity finally discovered",
"To implement a binary search tree in Python, first we need to",
"The algorithm works by iterating through the array and",
"Here's how to optimize database queries using indexing:",
"The Renaissance was a period in European history that",
"Climate change is caused by several factors including",
"The human brain contains approximately 86 billion neurons which",
"I've been thinking about getting a new laptop because",
"Yesterday I went to the store and bought",
"My favorite thing about summer is definitely",
]
base_prompt = random.choice(prompt_templates)
if max_words < min_words:
max_words = min_words
target_words = random.randint(min_words, max_words)
if target_words > 50:
padding_text = (
" This is an interesting topic that deserves more explanation. "
* (target_words // 50)
)
base_prompt = base_prompt + padding_text
return base_prompt
def run_benchmark_with_batch_invariant(
model: str,
tp_size: int,
max_batch_size: int,
num_trials: int,
min_prompt: int,
max_prompt: int,
max_tokens: int,
temperature: float,
gpu_mem_util: float,
max_model_len: int,
backend: str,
batch_invariant: bool,
seed: int = 12345,
) -> dict:
"""
Run the benchmark with the specified configuration.
Returns a dict with timing and throughput metrics.
"""
random.seed(seed)
# Set environment variables
if batch_invariant:
os.environ["VLLM_BATCH_INVARIANT"] = "1"
else:
os.environ["VLLM_BATCH_INVARIANT"] = "0"
print(f"\n{'=' * 80}")
print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
print(f" Model: {model}")
print(f" TP Size: {tp_size}")
print(f" Backend: {backend}")
print(f" Max Batch Size: {max_batch_size}")
print(f" Trials: {num_trials}")
print(f" Max Tokens: {max_tokens}")
print(f"{'=' * 80}\n")
sampling = SamplingParams(
temperature=temperature,
top_p=0.95,
max_tokens=max_tokens,
seed=20240919,
)
needle_prompt = "There once was a "
llm = None
try:
# Create LLM engine
start_init = time.perf_counter()
llm = LLM(
model=model,
max_num_seqs=max_batch_size,
gpu_memory_utilization=gpu_mem_util,
max_model_len=max_model_len,
dtype="bfloat16",
tensor_parallel_size=tp_size,
attention_config={"backend": backend},
enable_prefix_caching=False,
)
init_time = time.perf_counter() - start_init
print(f"Engine initialization time: {init_time:.2f}s\n")
# Generate baseline
print("Generating baseline (warmup)...")
baseline_out = llm.generate([needle_prompt], sampling)
assert len(baseline_out) == 1
baseline_text = baseline_out[0].outputs[0].text
print(f"Baseline output: '{baseline_text[:50]}...'\n")
# Run trials and measure timing
trial_times: list[float] = []
total_tokens = 0
total_prompts = 0
for trial in range(num_trials):
# Create a batch
prompts: list[str] = []
batch_size = random.randint(max_batch_size // 2, max_batch_size)
needle_pos = random.randint(0, batch_size - 1)
for i in range(batch_size):
if i == needle_pos:
prompts.append(needle_prompt)
else:
prompts.append(_random_prompt(min_prompt, max_prompt))
# Measure time for this trial
start_time = time.perf_counter()
outputs = llm.generate(prompts, sampling)
trial_time = time.perf_counter() - start_time
trial_times.append(trial_time)
total_prompts += len(prompts)
# Count tokens
for output in outputs:
if output.outputs:
total_tokens += len(output.outputs[0].token_ids)
print(
f"Trial {trial + 1}/{num_trials}: "
f"batch_size={batch_size}, "
f"time={trial_time:.2f}s"
)
# Verify needle output still matches
needle_output = outputs[needle_pos]
assert needle_output.prompt == needle_prompt
# Compute statistics
avg_time = sum(trial_times) / len(trial_times)
min_time = min(trial_times)
max_time = max(trial_times)
throughput = total_tokens / sum(trial_times)
prompts_per_sec = total_prompts / sum(trial_times)
print(f"\n{'=' * 80}")
print("RESULTS:")
print(f" Average time per trial: {avg_time:.2f}s")
print(f" Min time: {min_time:.2f}s")
print(f" Max time: {max_time:.2f}s")
print(f" Total tokens generated: {total_tokens}")
print(f" Total prompts processed: {total_prompts}")
print(f" Throughput: {throughput:.2f} tokens/s")
print(f" Prompts/s: {prompts_per_sec:.2f}")
print(f"{'=' * 80}\n")
return {
"init_time": init_time,
"avg_time": avg_time,
"min_time": min_time,
"max_time": max_time,
"total_tokens": total_tokens,
"total_prompts": total_prompts,
"throughput": throughput,
"prompts_per_sec": prompts_per_sec,
"trial_times": trial_times,
}
finally:
# Cleanup
if llm is not None:
with contextlib.suppress(Exception):
llm.shutdown()
def main():
# Check platform support
if not (current_platform.is_cuda() and current_platform.has_device_capability(90)):
print("ERROR: Requires CUDA and >= Hopper (SM90)")
print(f"Current platform: {current_platform.device_type}")
if current_platform.is_cuda():
print(f"Device capability: {current_platform.get_device_capability()}")
return 1
# Read configuration from environment
model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1"))
max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128"))
num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5"))
min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024"))
max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048"))
max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128"))
temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0"))
gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN")
print("\n" + "=" * 80)
print("VLLM BATCH INVARIANCE BENCHMARK")
print("=" * 80)
print("\nConfiguration:")
print(f" Model: {model}")
print(f" Tensor Parallel Size: {tp_size}")
print(f" Attention Backend: {backend}")
print(f" Max Batch Size: {max_batch_size}")
print(f" Number of Trials: {num_trials}")
print(f" Prompt Length Range: {min_prompt}-{max_prompt} words")
print(f" Max Tokens to Generate: {max_tokens}")
print(f" Temperature: {temperature}")
print(f" GPU Memory Utilization: {gpu_mem_util}")
print(f" Max Model Length: {max_model_len}")
print("=" * 80)
# Run benchmark WITHOUT batch invariance (baseline)
print("\n" + "=" * 80)
print("PHASE 1: Running WITHOUT batch invariance (baseline)")
print("=" * 80)
baseline_results = run_benchmark_with_batch_invariant(
model=model,
tp_size=tp_size,
max_batch_size=max_batch_size,
num_trials=num_trials,
min_prompt=min_prompt,
max_prompt=max_prompt,
max_tokens=max_tokens,
temperature=temperature,
gpu_mem_util=gpu_mem_util,
max_model_len=max_model_len,
backend=backend,
batch_invariant=False,
)
# Run benchmark WITH batch invariance
print("\n" + "=" * 80)
print("PHASE 2: Running WITH batch invariance")
print("=" * 80)
batch_inv_results = run_benchmark_with_batch_invariant(
model=model,
tp_size=tp_size,
max_batch_size=max_batch_size,
num_trials=num_trials,
min_prompt=min_prompt,
max_prompt=max_prompt,
max_tokens=max_tokens,
temperature=temperature,
gpu_mem_util=gpu_mem_util,
max_model_len=max_model_len,
backend=backend,
batch_invariant=True,
)
# Compare results
print("\n" + "=" * 80)
print("COMPARISON: Batch Invariance vs Baseline")
print("=" * 80)
init_overhead_pct = (
(batch_inv_results["init_time"] - baseline_results["init_time"])
/ baseline_results["init_time"]
* 100
)
time_overhead_pct = (
(batch_inv_results["avg_time"] - baseline_results["avg_time"])
/ baseline_results["avg_time"]
* 100
)
throughput_change_pct = (
(batch_inv_results["throughput"] - baseline_results["throughput"])
/ baseline_results["throughput"]
* 100
)
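# Worked example (hypothetical numbers): if the baseline avg_time is 10.0 s and the
# batch-invariant avg_time is 11.5 s, then
# time_overhead_pct = (11.5 - 10.0) / 10.0 * 100 = +15.0%.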
print("\nInitialization Time:")
print(f" Baseline: {baseline_results['init_time']:.2f}s")
print(f" Batch Invariant: {batch_inv_results['init_time']:.2f}s")
print(f" Overhead: {init_overhead_pct:+.2f}%")
print("\nAverage Trial Time:")
print(f" Baseline: {baseline_results['avg_time']:.2f}s")
print(f" Batch Invariant: {batch_inv_results['avg_time']:.2f}s")
print(f" Overhead: {time_overhead_pct:+.2f}%")
print("\nThroughput (tokens/s):")
print(f" Baseline: {baseline_results['throughput']:.2f}")
print(f" Batch Invariant: {batch_inv_results['throughput']:.2f}")
print(f" Change: {throughput_change_pct:+.2f}%")
print("\nPrompts/s:")
print(f" Baseline: {baseline_results['prompts_per_sec']:.2f}")
print(f" Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}")
print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
if time_overhead_pct > 0:
print(
f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
"overhead"
)
else:
print(
f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
"faster (unexpected!)"
)
if abs(throughput_change_pct) < 1.0:
print("Throughput difference is negligible (< 1%)")
elif throughput_change_pct < 0:
print(
f"Throughput decreased by {-throughput_change_pct:.1f}% "
"with batch invariance"
)
else:
print(
f"Throughput increased by {throughput_change_pct:.1f}% "
"with batch invariance (unexpected!)"
)
print("=" * 80 + "\n")
return 0
if __name__ == "__main__":
exit(main())
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
from benchmark_utils import TimeCollector
from tabulate import tabulate
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool
def main(args):
rows = []
for allocate_block in args.allocate_blocks:
# Force a garbage collection up front to minimize interference between runs
gc.collect()
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
get_blocks_times = TimeCollector(TimeCollector.US)
free_blocks_times = TimeCollector(TimeCollector.US)
for _ in range(args.num_iteration):
with get_blocks_times:
blocks = block_pool.get_new_blocks(allocate_block)
with free_blocks_times:
block_pool.free_blocks(blocks)
rows.append(
[get_blocks_times.cnt, args.num_gpu_blocks, allocate_block]
+ get_blocks_times.dump_avg_max()
+ free_blocks_times.dump_avg_max()
)
print(
tabulate(
rows,
headers=[
"Iterations",
"Total\nBlocks",
"Allocated\nBlocks",
"Get Blocks\nAvg (us)",
"Get Blocks\nMax (us)",
"Free Blocks\nAvg (us)",
"Free Blocks\nMax (us)",
],
tablefmt="grid",
floatfmt=".3f",
)
)
def invoke_main() -> None:
parser = FlexibleArgumentParser(
description="Benchmark the performance of BlockPool for KV Cache."
)
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
parser.add_argument(
"--num-iteration",
type=int,
default=1000,
help="Number of iterations to run to stabilize final data readings",
)
parser.add_argument(
"--allocate-blocks",
type=int,
nargs="*",
default=[10, 50, 100, 500, 1000],
help="Number of blocks to allocate",
)
args = parser.parse_args()
main(args)
if __name__ == "__main__":
invoke_main() # pragma: no cover
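# Hedged usage example (illustrative; substitute this script's actual path):
#   python <path/to/this_benchmark>.py --num-gpu-blocks 100000 --allocate-blocks 10 100 1000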
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Micro benchmark comparing built-in hash(), SHA-256, and xxHash.
This focuses on a single test payload shaped like the prefix-cache hash input:
(32-byte bytes object, 32-int tuple)
Usage:
python benchmarks/hash_micro_benchmark.py --iterations 20000
"""
from __future__ import annotations
import argparse
import random
import statistics
import time
from collections.abc import Callable, Iterable
from vllm.utils.hashing import sha256, xxhash
def _generate_test_data(seed: int) -> tuple[bytes, tuple[int, ...]]:
"""Generate a deterministic test payload."""
random.seed(seed)
bytes_data = bytes(random.getrandbits(8) for _ in range(32))
int_tuple = tuple(random.randint(1, 1_000_000) for _ in range(32))
return (bytes_data, int_tuple)
def _benchmark_func(func: Callable[[tuple], object], data: tuple, iterations: int):
"""Return (avg_seconds, std_seconds) for hashing `data` `iterations` times."""
times: list[float] = []
# Warm-up to avoid first-run noise.
for _ in range(200):
func(data)
for _ in range(iterations):
start = time.perf_counter()
func(data)
end = time.perf_counter()
times.append(end - start)
avg = statistics.mean(times)
std = statistics.stdev(times) if len(times) > 1 else 0.0
return avg, std
def _run_benchmarks(
benchmarks: Iterable[tuple[str, Callable[[tuple], object]]],
data: tuple,
iterations: int,
):
"""Yield (name, avg, std) for each benchmark, skipping unavailable ones."""
for name, func in benchmarks:
try:
avg, std = _benchmark_func(func, data, iterations)
except ModuleNotFoundError as exc:
print(f"Skipping {name}: {exc}")
continue
yield name, avg, std
def builtin_hash(data: tuple) -> int:
"""Wrapper for Python's built-in hash()."""
return hash(data)
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--iterations",
type=int,
default=10_000,
help="Number of measured iterations per hash function.",
)
parser.add_argument(
"--seed", type=int, default=42, help="Random seed for test payload."
)
args = parser.parse_args()
data = _generate_test_data(args.seed)
benchmarks = (
("SHA256 (pickle)", sha256),
("xxHash (pickle)", xxhash),
("built-in hash()", builtin_hash),
)
print("=" * 60)
print("HASH FUNCTION MICRO BENCHMARK")
print("=" * 60)
print("Test data: (32-byte bytes object, 32-int tuple)")
print(f"Iterations: {args.iterations:,}")
print("=" * 60)
results = list(_run_benchmarks(benchmarks, data, args.iterations))
builtin_entry = next((r for r in results if r[0] == "built-in hash()"), None)
print("\nResults:")
for name, avg, std in results:
print(f" {name:16s}: {avg * 1e6:8.2f} ± {std * 1e6:6.2f} μs")
if builtin_entry:
_, builtin_avg, _ = builtin_entry
print("\n" + "=" * 60)
print("SUMMARY (relative to built-in hash())")
print("=" * 60)
for name, avg, _ in results:
if name == "built-in hash()":
continue
speed_ratio = avg / builtin_avg
print(f"• {name} is {speed_ratio:.1f}x slower than built-in hash()")
else:
print("\nBuilt-in hash() result missing; cannot compute speed ratios.")
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import sys
if __name__ == "__main__":
print("""DEPRECATED: This script has been moved to the vLLM CLI.
Please use the following command instead:
vllm bench latency
For help with the new command, run:
vllm bench latency --help
Alternatively, you can run the new command directly with:
python -m vllm.entrypoints.cli.main bench latency --help
""")
sys.exit(1)