#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""
Universal vLLM Attention Benchmark

Benchmark any attention backend with the extended grammar.
Supports standard attention (Flash/Triton/FlashInfer) and MLA backends.

Examples:
    # Standard attention
    python benchmark.py --backends flash flashinfer --batch-specs "q2k" "8q1s1k"

    # MLA backends
    python benchmark.py --backends cutlass_mla flashinfer_mla --batch-specs "64q1s1k"

    # Parameter sweep (CLI)
    python benchmark.py --backend cutlass_mla \
                        --batch-specs "64q1s1k" \
                        --sweep-param num_kv_splits \
                        --sweep-values 1 4 8 16

    # Parameter sweep (YAML config - recommended)
    python benchmark.py --config configs/cutlass_numsplits.yaml
"""

import argparse
import sys
from dataclasses import replace
from pathlib import Path

import yaml
from rich.console import Console
from tqdm import tqdm

sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from batch_spec import parse_batch_spec
from common import (
    BenchmarkConfig,
    BenchmarkResult,
    ModelParameterSweep,
    ParameterSweep,
    ResultsFormatter,
    batch_spec_sort_key,
    is_mla_backend,
)

50
51
from vllm.v1.worker.workspace import init_workspace_manager

52
53
54
55
56
57
58
59
60
61
62
63

def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
    """Benchmark a standard (Flash/Triton/FlashInfer) attention backend.

    Delegates to ``runner.run_attention_benchmark``, which is imported at
    call time rather than at module import.

    Args:
        config: Benchmark configuration handed to the runner unchanged.

    Returns:
        The result produced by the runner for ``config``.
    """
    from runner import run_attention_benchmark as _run_standard

    outcome = _run_standard(config)
    return outcome


def run_mla_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
    """Run an MLA benchmark for the backend named in ``config``.

    Args:
        config: Benchmark configuration. ``config.backend`` selects the MLA
            backend and ``config.prefill_backend`` is forwarded as the
            ``prefill_backend`` keyword.
        **kwargs: Extra keyword arguments passed through unchanged to
            ``mla_runner.run_mla_benchmark``.

    Returns:
        Whatever ``mla_runner.run_mla_benchmark`` returns for this config.
    """
    # Imported at call time, under an alias because this wrapper shares the
    # runner function's name.
    from mla_runner import run_mla_benchmark as run_mla

    return run_mla(
        config.backend, config, prefill_backend=config.prefill_backend, **kwargs
    )
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225


def run_benchmark(config: BenchmarkConfig, **kwargs) -> BenchmarkResult:
    """
    Dispatch one benchmark to the MLA or the standard attention runner.

    Args:
        config: BenchmarkConfig describing backend, batch spec and model params
        **kwargs: Forwarded to the MLA runner only; not used for standard backends

    Returns:
        The runner's BenchmarkResult. If anything raises, a BenchmarkResult with
        infinite mean/min/max times, zero std and ``error`` set to ``str(exc)``.
    """
    try:
        use_mla = is_mla_backend(config.backend)
        outcome = (
            run_mla_benchmark(config, **kwargs)
            if use_mla
            else run_standard_attention_benchmark(config)
        )
    except Exception as exc:
        # A failure is reported as data (not re-raised) so callers can keep
        # iterating over the remaining configurations.
        never = float("inf")
        outcome = BenchmarkResult(
            config=config,
            mean_time=never,
            std_time=0,
            min_time=never,
            max_time=never,
            error=str(exc),
        )
    return outcome


def _sweep_value_sort_key(text: str) -> tuple:
    """Sort key for stringified sweep values: integers numerically, then text."""
    # The leading tag keeps ints and strings from being compared directly,
    # which would raise TypeError for a sweep mixing both kinds of value.
    if text.isdigit():
        return (0, int(text), "")
    return (1, 0, text)


def run_model_parameter_sweep(
    backends: list[str],
    batch_specs: list[str],
    base_config_args: dict,
    sweep: ModelParameterSweep,
    console: Console,
) -> list[BenchmarkResult]:
    """
    Run model parameter sweep for given backends and batch specs.

    Every (backend, batch_spec, value) combination is benchmarked with
    ``sweep.param_name`` overridden to ``value`` in the config arguments.
    Afterwards one table per parameter value is printed, followed by the
    fastest backend for each (parameter value, batch spec) pair.

    Args:
        backends: List of backend names
        batch_specs: List of batch specifications
        base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
        sweep: ModelParameterSweep configuration
        console: Rich console for output

    Returns:
        List of BenchmarkResult objects whose ``config.backend`` holds the
        sweep label from ``sweep.get_label(backend, value)``, in run order.
    """
    all_results = []
    # (labeled result, original backend, stringified value) in run order.
    # Recording the origin here avoids re-deriving it from the label later.
    records = []

    console.print(
        f"[yellow]Model sweep mode: testing {sweep.param_name} = {sweep.values}[/]"
    )

    total = len(backends) * len(batch_specs) * len(sweep.values)

    with tqdm(total=total, desc="Benchmarking") as pbar:
        for backend in backends:
            for spec in batch_specs:
                for value in sweep.values:
                    # Create config with modified model parameter
                    config_args = base_config_args.copy()
                    config_args[sweep.param_name] = value

                    # Run with the real backend name
                    clean_config = BenchmarkConfig(
                        backend=backend, batch_spec=spec, **config_args
                    )
                    result = run_benchmark(clean_config)

                    # Replace backend with labeled version for display
                    backend_label = sweep.get_label(backend, value)
                    labeled_config = replace(result.config, backend=backend_label)
                    result = replace(result, config=labeled_config)
                    all_results.append(result)
                    records.append((result, backend, str(value)))

                    if not result.success:
                        console.print(
                            f"[red]Error {backend} {spec} {sweep.param_name}="
                            f"{value}: {result.error}[/]"
                        )

                    pbar.update(1)

    # Display sweep results - create separate table for each parameter value
    console.print("\n[bold green]Model Parameter Sweep Results:[/]")
    formatter = ResultsFormatter(console)

    # Failed runs are kept here so they still appear in the tables.
    by_param_value = {}
    for record in records:
        by_param_value.setdefault(record[2], []).append(record)

    for param_value in sorted(by_param_value, key=_sweep_value_sort_key):
        console.print(f"\n[bold cyan]{sweep.param_name} = {param_value}[/]")

        # Tables are keyed by the original backend names, so swap the label back
        modified_results = [
            replace(r, config=replace(r.config, backend=origin))
            for r, origin, _ in by_param_value[param_value]
        ]
        formatter.print_table(modified_results, backends, compare_to_fastest=True)

    # Show optimal backend for each (param_value, batch_spec) combination
    console.print(
        f"\n[bold cyan]Optimal backend for each ({sweep.param_name}, batch_spec):[/]"
    )

    # Only successful runs compete for "optimal"
    by_param_and_spec = {}
    for record in records:
        r, _, param_value = record
        if r.success:
            key = (param_value, r.config.batch_spec)
            by_param_and_spec.setdefault(key, []).append(record)

    # Sort by param value then spec
    sorted_keys = sorted(
        by_param_and_spec,
        key=lambda k: (_sweep_value_sort_key(k[0]), batch_spec_sort_key(k[1])),
    )

    current_param_value = None
    for param_value, spec in sorted_keys:
        # Print header when param value changes
        if param_value != current_param_value:
            console.print(f"\n  [bold]{sweep.param_name}={param_value}:[/]")
            current_param_value = param_value

        candidates = by_param_and_spec[(param_value, spec)]
        best = min(candidates, key=lambda rec: rec[0].mean_time)
        backend_name = best[1]

        # Show all backends' times for comparison, fastest first
        times_str = " | ".join(
            f"{origin}: {r.mean_time:.6f}s"
            for r, origin, _ in sorted(candidates, key=lambda rec: rec[0].mean_time)
        )

        console.print(
            f"    {spec:12s} -> [bold green]{backend_name:15s}[/] ({times_str})"
        )

    return all_results


def run_parameter_sweep(
    backends: list[str],
    batch_specs: list[str],
    base_config_args: dict,
    sweep: ParameterSweep,
    console: Console,
) -> list[BenchmarkResult]:
    """
    Run parameter sweep for given backends and batch specs.

    Each swept value is passed to ``run_benchmark`` as a keyword argument
    named ``sweep.param_name``. When ``sweep.include_auto`` is set, an extra
    ``"auto"`` run is made with no such keyword argument.

    Args:
        backends: List of backend names
        batch_specs: List of batch specifications
        base_config_args: Base configuration arguments (num_layers, head_dim, etc.)
        sweep: ParameterSweep configuration
        console: Rich console for output

    Returns:
        List of BenchmarkResult objects whose ``config.backend`` holds the
        sweep label from ``sweep.get_label(backend, value)``, in run order.
    """
    all_results = []

    # Build list of values to sweep (including auto if requested)
    sweep_values = list(sweep.values)
    if sweep.include_auto:
        sweep_values.append("auto")

    console.print(f"[yellow]Sweep mode: testing {sweep.param_name} = {sweep_values}[/]")

    total = len(backends) * len(batch_specs) * len(sweep_values)

    with tqdm(total=total, desc="Benchmarking") as pbar:
        for backend in backends:
            for spec in batch_specs:
                for value in sweep_values:
                    # Create config with original backend for running
                    config = BenchmarkConfig(
                        backend=backend, batch_spec=spec, **base_config_args
                    )

                    # "auto" means: do not pass the parameter at all
                    kwargs = {}
                    if value != "auto":
                        kwargs[sweep.param_name] = value

                    result = run_benchmark(config, **kwargs)

                    # Replace backend with labeled version for display
                    backend_label = sweep.get_label(backend, value)
                    labeled_config = replace(result.config, backend=backend_label)
                    result = replace(result, config=labeled_config)
                    all_results.append(result)

                    if not result.success:
                        console.print(
                            f"[red]Error {backend} {spec} {sweep.param_name}="
                            f"{value}: {result.error}[/]"
                        )

                    pbar.update(1)

    # Display sweep results
    console.print("\n[bold green]Sweep Results:[/]")
    backend_labels = [sweep.get_label(b, v) for b in backends for v in sweep_values]
    formatter = ResultsFormatter(console)
    formatter.print_table(all_results, backend_labels)

    # Show optimal values; only successful runs are considered
    console.print(f"\n[bold cyan]Optimal {sweep.param_name} per batch spec:[/]")
    by_spec = {}
    for r in all_results:
        if r.success:
            by_spec.setdefault(r.config.batch_spec, []).append(r)

    for spec in sorted(by_spec, key=batch_spec_sort_key):
        best = min(by_spec[spec], key=lambda r: r.mean_time)
        console.print(
            f"  {spec}: [bold green]{best.config.backend}[/] ({best.mean_time:.6f}s)"
        )

    return all_results


def load_config_from_yaml(config_path: str) -> dict:
    """Load configuration from YAML file.

    Args:
        config_path: Path to the YAML config file.

    Returns:
        The parsed top-level mapping, or an empty dict when the file holds no
        YAML document (``yaml.safe_load`` returns ``None`` in that case).

    Raises:
        ValueError: If the top-level YAML node is not a mapping.
    """
    # Binary mode lets PyYAML pick the encoding (BOM, else UTF-8) instead of
    # relying on the platform's locale default.
    with open(config_path, "rb") as f:
        loaded = yaml.safe_load(f)

    if loaded is None:
        return {}
    if not isinstance(loaded, dict):
        raise ValueError(
            f"Config file {config_path!r} must contain a YAML mapping at the "
            f"top level, got {type(loaded).__name__}"
        )
    return loaded


def generate_batch_specs_from_ranges(ranges: list[dict]) -> list[str]:
    """
    Generate batch specs from range specifications.

    Args:
        ranges: List of range specifications, each containing:
            - template: Batch spec template (e.g., "q{q_len}kv1k")
            - q_len: Dict with start, stop, step, end_inclusive (optional)
            - Other parameters can also be ranges (a dict with a "start" key)
              or fixed values

    Returns:
        List of generated batch spec strings. When a spec has several
        parameters, every combination is produced, with later parameters
        varying fastest.

    Raises:
        ValueError: If a range specification has no (or an empty) 'template',
            or if a range's step is 0.
        KeyError: If a range dict has "start" but no "stop".

    Example:
        ranges = [
            {
                "template": "q{q_len}kv1k",
                "q_len": {
                    "start": 1,
                    "stop": 16,
                    "step": 1,
                    "end_inclusive": true  # Optional, defaults to true
                }
            }
        ]
        Returns: ["q1kv1k", "q2kv1k", ..., "q16kv1k"]
    """
    import itertools

    all_specs = []

    for range_spec in ranges:
        template = range_spec.get("template")
        if not template:
            raise ValueError("Range specification must include 'template'")

        # Map each parameter name to the list of values it takes
        range_params = {}
        for key, value in range_spec.items():
            if key == "template":
                continue
            if isinstance(value, dict) and "start" in value:
                # This is a range specification
                start = value["start"]
                stop = value["stop"]
                step = value.get("step", 1)
                if value.get("end_inclusive", True):
                    # Move the bound one past `stop` in the direction of
                    # travel, so descending ranges include `stop` as well.
                    stop += 1 if step > 0 else -1
                range_params[key] = list(range(start, stop, step))
            else:
                # This is a fixed value
                range_params[key] = [value]

        if not range_params:
            # No parameters, just use template as-is
            all_specs.append(template)
            continue

        # Generate all combinations (Cartesian product)
        param_names = list(range_params)
        for values in itertools.product(*range_params.values()):
            all_specs.append(template.format(**dict(zip(param_names, values))))

    return all_specs


def main():
    parser = argparse.ArgumentParser(
        description="Universal vLLM attention benchmark",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Config file
    parser.add_argument(
        "--config",
        help="Path to YAML config file (overrides other args)",
    )

    # Backend selection
    parser.add_argument(
        "--backends",
447
        "--decode-backends",
448
        nargs="+",
449
        help="Decode backends to benchmark (flash, triton, flashinfer, cutlass_mla, "
450
451
452
453
454
455
        "flashinfer_mla, flashattn_mla, flashmla)",
    )
    parser.add_argument(
        "--backend",
        help="Single backend (alternative to --backends)",
    )
456
457
458
459
460
461
    parser.add_argument(
        "--prefill-backends",
        nargs="+",
        help="Prefill backends to compare (fa2, fa3, fa4). "
        "Uses the first decode backend for impl construction.",
    )
462
463
464
465
466

    # Batch specifications
    parser.add_argument(
        "--batch-specs",
        nargs="+",
467
        default=None,
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
        help="Batch specifications using extended grammar",
    )

    # Model config
    parser.add_argument("--num-layers", type=int, default=10, help="Number of layers")
    parser.add_argument("--head-dim", type=int, default=128, help="Head dimension")
    parser.add_argument("--num-q-heads", type=int, default=32, help="Query heads")
    parser.add_argument("--num-kv-heads", type=int, default=8, help="KV heads")
    parser.add_argument("--block-size", type=int, default=16, help="Block size")

    # Benchmark settings
    parser.add_argument("--device", default="cuda:0", help="Device")
    parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
    parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
    parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
    parser.add_argument(
        "--kv-cache-dtype",
        default="auto",
        choices=["auto", "fp8"],
        help="KV cache dtype: auto or fp8",
    )
    parser.add_argument(
        "--cuda-graphs",
        action=argparse.BooleanOptionalAction,
        default=True,
        help=(
            "Launch kernels with CUDA graphs to eliminate CPU overhead"
            "in measurements (default: True)"
        ),
    )
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528

    # Parameter sweep (use YAML config for advanced sweeps)
    parser.add_argument(
        "--sweep-param",
        help="Parameter name to sweep (e.g., num_kv_splits, reorder_batch_threshold)",
    )
    parser.add_argument(
        "--sweep-values",
        type=int,
        nargs="+",
        help="Values to sweep for the parameter",
    )

    # Output
    parser.add_argument("--output-csv", help="Save to CSV")
    parser.add_argument("--output-json", help="Save to JSON")

    args = parser.parse_args()

    console = Console()
    console.print("[bold cyan]vLLM Attention Benchmark[/]")

    # Load config from YAML if provided
    if args.config:
        console.print(f"[yellow]Loading config from: {args.config}[/]")
        yaml_config = load_config_from_yaml(args.config)

        # Show description if available
        if "description" in yaml_config:
            console.print(f"[dim]{yaml_config['description']}[/]")

529
530
        # Override args with YAML values, but CLI args take precedence
        # Check if CLI provided backends (they would be non-None and not default)
531
        cli_backends_provided = args.backend is not None or args.backends is not None
532
533
534
535
536
537
538
539
540

        # Backend(s) - only use YAML if CLI didn't specify
        if not cli_backends_provided:
            if "backend" in yaml_config:
                args.backend = yaml_config["backend"]
                args.backends = None
            elif "backends" in yaml_config:
                args.backends = yaml_config["backends"]
                args.backend = None
541
542
543
544
545
546
            elif "decode_backends" in yaml_config:
                args.backends = yaml_config["decode_backends"]
                args.backend = None

        # Prefill backends (e.g., ["fa3", "fa4"])
        args.prefill_backends = yaml_config.get("prefill_backends", None)
547
548

        # Check for special modes
549
        args.mode = yaml_config.get("mode", None)
550
551
552

        # Batch specs and sizes
        # Support both explicit batch_specs and generated batch_spec_ranges
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
        # CLI --batch-specs takes precedence over YAML when provided.
        cli_batch_specs_provided = args.batch_specs is not None
        if not cli_batch_specs_provided:
            if "batch_spec_ranges" in yaml_config:
                # Generate batch specs from ranges
                generated_specs = generate_batch_specs_from_ranges(
                    yaml_config["batch_spec_ranges"]
                )
                # Combine with any explicit batch_specs
                if "batch_specs" in yaml_config:
                    args.batch_specs = yaml_config["batch_specs"] + generated_specs
                else:
                    args.batch_specs = generated_specs
                console.print(
                    f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
                )
            elif "batch_specs" in yaml_config:
                args.batch_specs = yaml_config["batch_specs"]
571

572
        args.batch_sizes = yaml_config.get("batch_sizes", None)
573
574
575
576
577
578
579
580
581
582

        # Model config
        if "model" in yaml_config:
            model = yaml_config["model"]
            args.num_layers = model.get("num_layers", args.num_layers)
            args.head_dim = model.get("head_dim", args.head_dim)
            args.num_q_heads = model.get("num_q_heads", args.num_q_heads)
            args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads)
            args.block_size = model.get("block_size", args.block_size)

583
584
585
586
587
588
589
590
591
        # Benchmark settings (top-level keys)
        if "device" in yaml_config:
            args.device = yaml_config["device"]
        if "repeats" in yaml_config:
            args.repeats = yaml_config["repeats"]
        if "warmup_iters" in yaml_config:
            args.warmup_iters = yaml_config["warmup_iters"]
        if "profile_memory" in yaml_config:
            args.profile_memory = yaml_config["profile_memory"]
592
593
594
595
        if "kv_cache_dtype" in yaml_config:
            args.kv_cache_dtype = yaml_config["kv_cache_dtype"]
        if "cuda_graphs" in yaml_config:
            args.cuda_graphs = yaml_config["cuda_graphs"]
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648

        # Parameter sweep configuration
        if "parameter_sweep" in yaml_config:
            sweep_config = yaml_config["parameter_sweep"]
            args.parameter_sweep = ParameterSweep(
                param_name=sweep_config["param_name"],
                values=sweep_config["values"],
                include_auto=sweep_config.get("include_auto", False),
                label_format=sweep_config.get(
                    "label_format", "{backend}_{param_name}_{value}"
                ),
            )
        else:
            args.parameter_sweep = None

        # Model parameter sweep configuration
        if "model_parameter_sweep" in yaml_config:
            sweep_config = yaml_config["model_parameter_sweep"]
            args.model_parameter_sweep = ModelParameterSweep(
                param_name=sweep_config["param_name"],
                values=sweep_config["values"],
                label_format=sweep_config.get(
                    "label_format", "{backend}_{param_name}_{value}"
                ),
            )
        else:
            args.model_parameter_sweep = None

        # Output
        if "output" in yaml_config:
            output = yaml_config["output"]
            if "csv" in output and not args.output_csv:
                args.output_csv = output["csv"]
            if "json" in output and not args.output_json:
                args.output_json = output["json"]

        console.print()

    # Handle CLI-based parameter sweep (if not from YAML)
    if (
        (not hasattr(args, "parameter_sweep") or args.parameter_sweep is None)
        and args.sweep_param
        and args.sweep_values
    ):
        args.parameter_sweep = ParameterSweep(
            param_name=args.sweep_param,
            values=args.sweep_values,
            include_auto=False,
            label_format="{backend}_{param_name}_{value}",
        )

    # Determine backends
    backends = args.backends or ([args.backend] if args.backend else ["flash"])
649
    prefill_backends = getattr(args, "prefill_backends", None)
650
651
    if not args.batch_specs:
        args.batch_specs = ["q2k", "8q1s1k"]
652
    console.print(f"Backends: {', '.join(backends)}")
653
654
    if prefill_backends:
        console.print(f"Prefill backends: {', '.join(prefill_backends)}")
655
    console.print(f"Batch specs: {', '.join(args.batch_specs)}")
656
657
    console.print(f"KV cache dtype: {args.kv_cache_dtype}")
    console.print(f"CUDA graphs: {args.cuda_graphs}")
658
659
    console.print()

660
661
    init_workspace_manager(args.device)

662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
    # Run benchmarks
    all_results = []

    # Handle special mode: decode_vs_prefill comparison
    if hasattr(args, "mode") and args.mode == "decode_vs_prefill":
        console.print("[yellow]Mode: Decode vs Prefill pipeline comparison[/]")
        console.print(
            "[dim]For each query length, testing both decode and prefill pipelines[/]"
        )
        console.print("[dim]Using batched execution for optimal performance[/]")

        # Extract batch sizes from config
        batch_sizes = getattr(args, "batch_sizes", [1])
        backend = backends[0]  # Use first backend (should only be one)

        # Calculate total benchmarks
        total = len(batch_sizes)

        with tqdm(total=total, desc="Benchmarking") as pbar:
            for batch_size in batch_sizes:
                # Prepare all configs for this batch size
                configs_with_thresholds = []

                for spec in args.batch_specs:
                    # Parse the batch spec to get query length
                    requests = parse_batch_spec(spec)
                    if not requests:
                        console.print(
                            f"[red]Error: Could not parse batch spec '{spec}'[/]"
                        )
                        continue

                    # Get query length from first request
                    query_length = requests[0].q_len

                    # Create batch spec for this batch size
                    # For batch_size > 1, we need to prepend the count
                    batch_spec = f"{batch_size}{spec}" if batch_size > 1 else spec

                    # Create base config (without backend name)
                    base_config = BenchmarkConfig(
                        backend=backend,  # Will be overridden later
                        batch_spec=batch_spec,
                        num_layers=args.num_layers,
                        head_dim=args.head_dim,
                        num_q_heads=args.num_q_heads,
                        num_kv_heads=args.num_kv_heads,
                        block_size=args.block_size,
                        device=args.device,
                        repeats=args.repeats,
                        warmup_iters=args.warmup_iters,
                        profile_memory=args.profile_memory,
714
715
                        kv_cache_dtype=args.kv_cache_dtype,
                        use_cuda_graphs=args.cuda_graphs,
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
                    )

                    # Add decode pipeline config
                    decode_threshold = query_length
                    config_decode = replace(
                        base_config,
                        backend=f"{backend}_decode_qlen{query_length}_bs{batch_size}",
                    )
                    configs_with_thresholds.append((config_decode, decode_threshold))

                    # Add prefill pipeline config if query_length > 1
                    if query_length > 1:
                        prefill_threshold = query_length - 1
                        config_prefill = replace(
                            base_config,
                            backend=f"{backend}_prefill_qlen{query_length}"
                            f"_bs{batch_size}",
                        )
                        configs_with_thresholds.append(
                            (config_prefill, prefill_threshold)
                        )

                # Run all benchmarks for this batch size in one go (batched mode)
                try:
                    from mla_runner import run_mla_benchmark as run_mla

                    # Use batched API: pass list of (config, threshold) tuples
                    timing_results = run_mla(backend, configs_with_thresholds)

                    # Create BenchmarkResult objects from timing results
                    for (config, _), timing in zip(
                        configs_with_thresholds, timing_results
                    ):
                        result = BenchmarkResult(
                            config=config,
                            mean_time=timing["mean"],
                            std_time=timing["std"],
                            min_time=timing["min"],
                            max_time=timing["max"],
                            throughput_tokens_per_sec=timing.get("throughput", None),
                        )
                        all_results.append(result)

                except Exception as e:
                    import traceback

                    console.print(
                        f"[red]Error running batched benchmarks for "
                        f"batch_size={batch_size}: {e}[/]"
                    )
                    console.print("[red]Traceback:[/]")
                    traceback.print_exc()
                    # Add error results for all configs
                    for config, _ in configs_with_thresholds:
                        result = BenchmarkResult(
                            config=config,
                            mean_time=float("inf"),
                            std_time=0,
                            min_time=float("inf"),
                            max_time=float("inf"),
                            error=str(e),
                        )
                        all_results.append(result)

                pbar.update(1)

        # Display decode vs prefill results
        console.print("\n[bold green]Decode vs Prefill Results:[/]")

        # Group by batch size
        by_batch_size = {}
        for r in all_results:
            if r.success:
                # Extract batch size from backend name
                parts = r.config.backend.split("_")
                bs_part = [p for p in parts if p.startswith("bs")]
                if bs_part:
                    bs = int(bs_part[0][2:])
                    if bs not in by_batch_size:
                        by_batch_size[bs] = []
                    by_batch_size[bs].append(r)

        # For each batch size, analyze crossover point
        for bs in sorted(by_batch_size.keys()):
            console.print(f"\n[bold cyan]Batch size: {bs}[/]")
            results = by_batch_size[bs]

            # Group by query length
            by_qlen = {}
            for r in results:
                parts = r.config.backend.split("_")
                qlen_part = [p for p in parts if p.startswith("qlen")]
                if qlen_part:
                    qlen = int(qlen_part[0][4:])
                    if qlen not in by_qlen:
                        by_qlen[qlen] = {}

                    pipeline = "decode" if "decode" in r.config.backend else "prefill"
                    by_qlen[qlen][pipeline] = r

            # Find crossover point
            last_decode_faster = None
            for qlen in sorted(by_qlen.keys()):
                pipelines = by_qlen[qlen]
                if "decode" in pipelines and "prefill" in pipelines:
                    decode_time = pipelines["decode"].mean_time
                    prefill_time = pipelines["prefill"].mean_time
                    faster = "decode" if decode_time < prefill_time else "prefill"

                    speedup = (
                        prefill_time / decode_time
                        if decode_time < prefill_time
                        else decode_time / prefill_time
                    )

                    console.print(
                        f"  qlen={qlen:3d}: decode={decode_time:.6f}s, "
                        f"prefill={prefill_time:.6f}s -> "
                        f"[bold]{faster}[/] ({speedup:.2f}x)"
                    )

                    if faster == "decode":
                        last_decode_faster = qlen

            if last_decode_faster is not None:
                optimal_threshold = last_decode_faster
                console.print(
                    f"\n  [bold green]Optimal threshold for batch_size={bs}: "
                    f"{optimal_threshold}[/]"
                )
                console.print(
                    f"  [dim](Use decode pipeline for query_length <= "
                    f"{optimal_threshold})[/]"
                )
            else:
                console.print(
                    f"\n  [yellow]Prefill always faster for batch_size={bs}[/]"
                )

    # Handle model parameter sweep mode
    elif hasattr(args, "model_parameter_sweep") and args.model_parameter_sweep:
        # Model parameter sweep
        base_config_args = {
            "num_layers": args.num_layers,
            "head_dim": args.head_dim,
            "num_q_heads": args.num_q_heads,
            "num_kv_heads": args.num_kv_heads,
            "block_size": args.block_size,
            "device": args.device,
            "repeats": args.repeats,
            "warmup_iters": args.warmup_iters,
            "profile_memory": args.profile_memory,
            "kv_cache_dtype": args.kv_cache_dtype,
            "use_cuda_graphs": args.cuda_graphs,
        }
        all_results = run_model_parameter_sweep(
            backends,
            args.batch_specs,
            base_config_args,
            args.model_parameter_sweep,
            console,
        )

    # Handle parameter sweep mode (unified)
    elif hasattr(args, "parameter_sweep") and args.parameter_sweep:
        # Unified parameter sweep
        base_config_args = {
            "num_layers": args.num_layers,
            "head_dim": args.head_dim,
            "num_q_heads": args.num_q_heads,
            "num_kv_heads": args.num_kv_heads,
            "block_size": args.block_size,
            "device": args.device,
            "repeats": args.repeats,
            "warmup_iters": args.warmup_iters,
            "profile_memory": args.profile_memory,
            "kv_cache_dtype": args.kv_cache_dtype,
            "use_cuda_graphs": args.cuda_graphs,
        }
        all_results = run_parameter_sweep(
            backends, args.batch_specs, base_config_args, args.parameter_sweep, console
        )

    else:
        # Normal mode: compare backends
        decode_results = []
        prefill_results = []

        # Run decode backend comparison
        if not prefill_backends:
            # No prefill backends specified: compare decode backends as before
            total = len(backends) * len(args.batch_specs)

            with tqdm(total=total, desc="Benchmarking") as pbar:
                for spec in args.batch_specs:
                    for backend in backends:
                        config = BenchmarkConfig(
                            backend=backend,
                            batch_spec=spec,
                            num_layers=args.num_layers,
                            head_dim=args.head_dim,
                            num_q_heads=args.num_q_heads,
                            num_kv_heads=args.num_kv_heads,
                            block_size=args.block_size,
                            device=args.device,
                            repeats=args.repeats,
                            warmup_iters=args.warmup_iters,
                            profile_memory=args.profile_memory,
                            kv_cache_dtype=args.kv_cache_dtype,
                            use_cuda_graphs=args.cuda_graphs,
                        )

                        result = run_benchmark(config)
                        decode_results.append(result)

                        if not result.success:
                            console.print(
                                f"[red]Error {backend} {spec}: {result.error}[/]"
                            )

                        pbar.update(1)

            console.print("\n[bold green]Results:[/]")
            formatter = ResultsFormatter(console)
            formatter.print_table(decode_results, backends)

        # Run prefill backend comparison
        if prefill_backends:
            # Use first decode backend for impl construction
            decode_backend = backends[0]
            total = len(prefill_backends) * len(args.batch_specs)

            console.print(
                f"[yellow]Prefill comparison mode: "
                f"using {decode_backend} for decode impl[/]"
            )

            with tqdm(total=total, desc="Prefill benchmarking") as pbar:
                for spec in args.batch_specs:
                    for pb in prefill_backends:
                        config = BenchmarkConfig(
                            backend=decode_backend,
                            batch_spec=spec,
                            num_layers=args.num_layers,
                            head_dim=args.head_dim,
                            num_q_heads=args.num_q_heads,
                            num_kv_heads=args.num_kv_heads,
                            block_size=args.block_size,
                            device=args.device,
                            repeats=args.repeats,
                            warmup_iters=args.warmup_iters,
                            profile_memory=args.profile_memory,
                            prefill_backend=pb,
                        )

                        result = run_benchmark(config)

                        # Label result with prefill backend name for display
                        labeled_config = replace(result.config, backend=pb)
                        result = replace(result, config=labeled_config)
                        prefill_results.append(result)

                        if not result.success:
                            console.print(f"[red]Error {pb} {spec}: {result.error}[/]")

                        pbar.update(1)

            console.print("\n[bold green]Prefill Backend Results:[/]")
            formatter = ResultsFormatter(console)
            formatter.print_table(
                prefill_results, prefill_backends, compare_to_fastest=True
            )

        all_results = decode_results + prefill_results

    # Save results
    if all_results:
        formatter = ResultsFormatter(console)
        if args.output_csv:
            formatter.save_csv(all_results, args.output_csv)
        if args.output_json:
            formatter.save_json(all_results, args.output_json)


# Script entry point: call main() only when this file is executed directly
# (e.g. `python benchmark.py ...`), not when it is imported as a module.
if __name__ == "__main__":
    main()