"fern/pages/kubernetes/README.md" did not exist on "3057af00b6ceb41e8179c177d5446917a102bdba"
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""
MLA benchmark runner - shared utilities for MLA benchmarks.

This module provides helpers for running MLA backends without
needing full VllmConfig integration.
"""

import itertools

import numpy as np
import torch
from batch_spec import parse_batch_spec
from common import (
    BenchmarkResult,
    MockHfConfig,
    MockIndexer,
    MockKVBProj,
    MockLayer,
    setup_mla_dims,
)

from vllm.config import (
    CacheConfig,
    CompilationConfig,
    ModelConfig,
    ParallelConfig,
    SchedulerConfig,
    VllmConfig,
    set_current_vllm_config,
)

# ============================================================================
# VllmConfig Creation
# ============================================================================


def _add_mock_methods_to_model_config(model_config: ModelConfig) -> None:
    """
    Attach stand-in per-layer query methods to a ModelConfig instance.

    The methods are bound directly onto ``model_config`` (the instance is
    mutated in place, nothing is returned):

    - ``get_num_layers()`` always reports a single layer.
    - ``get_sliding_window_for_layer(i)`` and
      ``get_logits_soft_cap_for_layer(i)`` always return ``None``.
    - ``get_sm_scale_for_layer(i)`` returns ``1 / sqrt(head_size)`` using
      the instance's own ``get_head_size()``.
    """
    from types import MethodType

    def _one_layer(self):
        return 1

    def _nothing_for_layer(self, _i):
        return None

    def _default_sm_scale(self, _i):
        # ``self`` is always ``model_config`` because the method is bound below.
        return 1.0 / self.get_head_size() ** 0.5

    mock_methods = {
        "get_num_layers": _one_layer,
        "get_sliding_window_for_layer": _nothing_for_layer,
        "get_logits_soft_cap_for_layer": _nothing_for_layer,
        "get_sm_scale_for_layer": _default_sm_scale,
    }
    for method_name, func in mock_methods.items():
        setattr(model_config, method_name, MethodType(func, model_config))


def create_minimal_vllm_config(
    model_name: str = "deepseek-v3",
    block_size: int = 128,
    max_num_seqs: int = 256,
    max_num_batched_tokens: int = 8192,
    mla_dims: dict | None = None,
    index_topk: int | None = None,
    prefill_backend: str | None = None,
    kv_cache_dtype: str = "auto",
) -> VllmConfig:
    """
    Create minimal VllmConfig for MLA benchmarks.

    Args:
        model_name: Model name (deepseek-v2, deepseek-v3, etc.) - used if mla_dims not
                    provided
        block_size: KV cache block size
        max_num_seqs: Maximum number of sequences
        max_num_batched_tokens: Scheduler token budget. The value handed to
                    SchedulerConfig is raised to at least max_num_seqs.
        mla_dims: Optional custom MLA dimensions dict. If not provided, uses
                  setup_mla_dims(model_name)
        index_topk: Optional topk value for sparse MLA backends. If provided,
                    the config will include index_topk for sparse attention.
        prefill_backend: Prefill backend name (e.g., "fa3", "fa4", "flashinfer",
                        "cudnn", "trtllm"). Configures the attention config to
                        force the specified prefill backend.
        kv_cache_dtype: KV cache dtype string forwarded to CacheConfig as
                        ``cache_dtype``.

    Returns:
        VllmConfig for benchmarking

    Raises:
        ValueError: If ``prefill_backend`` is not a known prefill backend name.
    """
    # Get MLA dimensions - use provided or load from model name
    if mla_dims is None:
        mla_dims = setup_mla_dims(model_name)

    # Create mock HF config first (avoids downloading from HuggingFace)
    mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk)

    # Create a temporary minimal config.json to avoid HF downloads
    # This ensures consistent ModelConfig construction without network access
    import json
    import os
    import shutil
    import tempfile

    minimal_config = {
        "architectures": ["DeepseekV2ForCausalLM"],
        "model_type": "deepseek_v2",
        "num_attention_heads": mla_dims["num_q_heads"],
        "num_key_value_heads": mla_dims["num_kv_heads"],
        "hidden_size": mla_dims["head_dim"] * mla_dims["num_q_heads"],
        "torch_dtype": "bfloat16",
        "max_position_embeddings": 163840,  # DeepSeek V3 default
        "rope_theta": 10000.0,
        "vocab_size": 128256,
    }

    # The directory is created before the try block and everything that can
    # fail afterwards (writing config.json, building ModelConfig) runs inside
    # it, so the directory is removed even when the write itself fails.
    temp_dir = tempfile.mkdtemp(prefix="vllm_bench_")
    try:
        config_path = os.path.join(temp_dir, "config.json")
        with open(config_path, "w") as f:
            json.dump(minimal_config, f)

        # Create model config using local path - no HF downloads
        model_config = ModelConfig(
            model=temp_dir,  # Use local temp directory
            tokenizer=None,
            tokenizer_mode="auto",
            trust_remote_code=True,
            dtype="bfloat16",
            seed=0,
            max_model_len=32768,
            quantization=None,
            enforce_eager=False,
            max_logprobs=20,
            disable_sliding_window=False,
            skip_tokenizer_init=True,
            served_model_name=None,
            limit_mm_per_prompt=None,
            config_format="auto",
        )
    finally:
        # Clean up temporary directory
        shutil.rmtree(temp_dir, ignore_errors=True)

    # Override with our mock config
    model_config.hf_config = mock_hf_config
    model_config.hf_text_config = mock_hf_config

    # Add mock methods for layer-specific queries
    _add_mock_methods_to_model_config(model_config)

    # Create sub-configs
    cache_config = CacheConfig(
        block_size=block_size,
        gpu_memory_utilization=0.9,
        cache_dtype=kv_cache_dtype,
        enable_prefix_caching=False,
    )

    scheduler_config = SchedulerConfig(
        max_num_seqs=max_num_seqs,
        # Never hand the scheduler a token budget smaller than max_num_seqs.
        max_num_batched_tokens=max(max_num_batched_tokens, max_num_seqs),
        max_model_len=32768,
        is_encoder_decoder=False,
        enable_chunked_prefill=True,
    )

    parallel_config = ParallelConfig(
        tensor_parallel_size=1,
    )

    compilation_config = CompilationConfig()

    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=cache_config,
        parallel_config=parallel_config,
        scheduler_config=scheduler_config,
        compilation_config=compilation_config,
    )

    if prefill_backend is not None:
        prefill_cfg = get_prefill_backend_config(prefill_backend)
        attention_config = vllm_config.attention_config
        # A None flash_attn_version means "leave the configured value alone";
        # the three boolean switches are always overwritten.
        if prefill_cfg["flash_attn_version"] is not None:
            attention_config.flash_attn_version = prefill_cfg["flash_attn_version"]
        attention_config.disable_flashinfer_prefill = prefill_cfg[
            "disable_flashinfer_prefill"
        ]
        attention_config.use_cudnn_prefill = prefill_cfg["use_cudnn_prefill"]
        attention_config.use_trtllm_ragged_deepseek_prefill = prefill_cfg[
            "use_trtllm_ragged_deepseek_prefill"
        ]

    return vllm_config


# ============================================================================
# Prefill Backend Configuration
# ============================================================================

# Maps prefill backend names to attention config overrides.
# FA backends set flash_attn_version and disable non-FA paths.
# Non-FA backends enable their specific path and disable others.
_PREFILL_BACKEND_CONFIG: dict[str, dict] = {
    "fa2": {
        "flash_attn_version": 2,
        "disable_flashinfer_prefill": True,
        "use_cudnn_prefill": False,
        "use_trtllm_ragged_deepseek_prefill": False,
    },
    "fa3": {
        "flash_attn_version": 3,
        "disable_flashinfer_prefill": True,
        "use_cudnn_prefill": False,
        "use_trtllm_ragged_deepseek_prefill": False,
    },
    "fa4": {
        "flash_attn_version": 4,
        "disable_flashinfer_prefill": True,
        "use_cudnn_prefill": False,
        "use_trtllm_ragged_deepseek_prefill": False,
    },
    "flashinfer": {
        "flash_attn_version": None,
        "disable_flashinfer_prefill": False,
        "use_cudnn_prefill": False,
        "use_trtllm_ragged_deepseek_prefill": False,
    },
    "cudnn": {
        "flash_attn_version": None,
        "disable_flashinfer_prefill": True,
        "use_cudnn_prefill": True,
        "use_trtllm_ragged_deepseek_prefill": False,
    },
    "trtllm": {
        "flash_attn_version": None,
        "disable_flashinfer_prefill": True,
        "use_cudnn_prefill": False,
        "use_trtllm_ragged_deepseek_prefill": True,
    },
}


def get_prefill_backend_config(prefill_backend: str) -> dict:
    """
    Get attention config overrides for a prefill backend.

    Args:
        prefill_backend: One of the keys of ``_PREFILL_BACKEND_CONFIG``
            ("fa2", "fa3", "fa4", "flashinfer", "cudnn", "trtllm").

    Returns:
        A fresh dict with the keys ``flash_attn_version``,
        ``disable_flashinfer_prefill``, ``use_cudnn_prefill`` and
        ``use_trtllm_ragged_deepseek_prefill``. It is a copy, so callers may
        modify it without affecting later lookups.

    Raises:
        ValueError: If ``prefill_backend`` is not a known backend name.
    """
    try:
        overrides = _PREFILL_BACKEND_CONFIG[prefill_backend]
    except KeyError:
        raise ValueError(
            f"Unknown prefill backend: {prefill_backend!r}. "
            f"Available: {list(_PREFILL_BACKEND_CONFIG)}"
        ) from None
    # Hand out a copy: the table is shared module state and must stay pristine.
    return dict(overrides)


# ============================================================================
# Decode Backend Configuration
# ============================================================================


# Backend-specific properties that can't be inferred from the backend class
# Keys are AttentionBackendEnum names (uppercase)
# Backends without an entry fall back to the "tuple" query format in
# _get_backend_config().
_BACKEND_PROPERTIES = {
    "FLASHMLA": {
        "query_format": "concat",  # Single concatenated tensor (vs tuple)
    },
    "FLASHMLA_SPARSE": {
        "query_format": "concat",  # Single concatenated tensor (vs tuple)
    },
}


def _get_backend_config(backend: str) -> dict:
    """
    Get backend configuration from AttentionBackendEnum.

    Uses the registry to get the backend class and extract configuration
    from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.).

    Args:
        backend: Backend name matching AttentionBackendEnum exactly
        (e.g., "FLASHMLA_SPARSE")

    Returns:
        Dict with backend configuration. Keys:
        - "backend_class": the resolved backend class
        - "impl_class": result of ``backend_class.get_impl_cls()``
        - "builder_class": result of ``backend_class.get_builder_cls()``
        - "query_format": "concat" or "tuple" (default when the backend has
          no entry in ``_BACKEND_PROPERTIES``)
        - "block_size": first supported kernel block size, or None when the
          backend lists none or only a ``MultipleOf`` constraint
        - "is_sparse": result of ``backend_class.is_sparse()`` when that
          attribute exists, otherwise False

    Raises:
        ValueError: If ``backend`` cannot be resolved through the enum.
    """
    from vllm.v1.attention.backend import MultipleOf
    from vllm.v1.attention.backends.registry import AttentionBackendEnum

    try:
        backend_enum = AttentionBackendEnum[backend]
        backend_class = backend_enum.get_class()
    except (KeyError, ValueError) as err:
        # Distinct names for the enum member and the caught exception keep
        # the `from err` chaining below unambiguous.
        valid_mla_backends = [
            member.name
            for member in AttentionBackendEnum
            if member.name != "CUSTOM" and "MLA" in member.name
        ]
        raise ValueError(
            f"Unknown backend: {backend}. "
            f"Valid MLA backends: {valid_mla_backends}"
        ) from err

    # Get block size from backend class
    block_sizes = backend_class.get_supported_kernel_block_sizes()
    # Use first supported block size (backends typically support one for MLA)
    block_size = block_sizes[0] if block_sizes else None
    if isinstance(block_size, MultipleOf):
        # No fixed block size; fall back to config value
        block_size = None

    # Check if sparse via class method if available
    is_sparse = getattr(backend_class, "is_sparse", lambda: False)()

    # Get properties that can't be inferred
    props = _BACKEND_PROPERTIES.get(backend, {})

    return {
        "backend_class": backend_class,
        "impl_class": backend_class.get_impl_cls(),
        "builder_class": backend_class.get_builder_cls(),
        "query_format": props.get("query_format", "tuple"),
        "block_size": block_size,
        "is_sparse": is_sparse,
    }


# ============================================================================
# Metadata Building Helpers
# ============================================================================


def _build_attention_metadata(
    requests: list,
    block_size: int,
    device: torch.device,
    builder_instance,
) -> tuple:
    """
    Build attention metadata from batch requests.

    Each request is given its own run of consecutively numbered KV-cache
    blocks (request 0 starts at block 0, request 1 continues where request 0
    stopped, and so on). The slot mapping covers only the query tokens of
    each request, i.e. KV positions ``kv_len - q_len`` up to ``kv_len - 1``.

    Args:
        requests: Non-empty list of BatchRequest objects (only ``q_len`` and
            ``kv_len`` are read here)
        block_size: KV cache block size
        device: Target device
        builder_instance: Metadata builder instance; its ``build()`` method
            produces the returned metadata

    Returns:
        Tuple of (metadata, kv_cache_num_blocks)

    Raises:
        ValueError: If ``requests`` is empty.
    """
    if not requests:
        raise ValueError("requests must contain at least one batch request")

    q_lens = [r.q_len for r in requests]
    kv_lens = [r.kv_len for r in requests]
    total_q = sum(q_lens)
    max_kv = max(kv_lens)

    # Build query start locations: a running sum of the query lengths with a
    # leading zero (single pass instead of re-summing every prefix).
    q_start_cpu = torch.tensor(
        [0, *itertools.accumulate(q_lens)],
        dtype=torch.int32,
    )
    q_start_gpu = q_start_cpu.to(device)

    # Build sequence lengths
    seq_lens_cpu = torch.tensor(kv_lens, dtype=torch.int32)
    seq_lens_gpu = seq_lens_cpu.to(device)

    # Build num_computed_tokens (context length for each request)
    context_lens = [kv_len - q_len for q_len, kv_len in zip(q_lens, kv_lens)]
    num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32)

    # Build block table: row i holds the consecutive block ids of request i,
    # zero-padded on the right up to the widest request.
    num_blocks_per_req = [(kv + block_size - 1) // block_size for kv in kv_lens]
    first_block_ids = [0, *itertools.accumulate(num_blocks_per_req)]
    total_blocks = first_block_ids[-1]

    block_table_cpu = np.zeros(
        (len(requests), max(num_blocks_per_req)), dtype=np.int32
    )
    for row, (first_block, num_blocks) in enumerate(
        zip(first_block_ids, num_blocks_per_req)
    ):
        block_table_cpu[row, :num_blocks] = np.arange(
            first_block, first_block + num_blocks, dtype=np.int32
        )

    block_table_gpu = torch.from_numpy(block_table_cpu).to(device)

    # Build slot mapping for the query tokens of every request. The
    # arithmetic is done in int64 so large caches cannot overflow int32.
    slot_chunks = []
    for row, (context_len, kv_len) in enumerate(zip(context_lens, kv_lens)):
        token_positions = np.arange(context_len, kv_len, dtype=np.int64)
        block_ids = block_table_cpu[row, token_positions // block_size].astype(
            np.int64
        )
        slot_chunks.append(block_ids * block_size + token_positions % block_size)

    slot_mapping = torch.from_numpy(np.concatenate(slot_chunks)).to(device)

    # Create CommonAttentionMetadata
    from vllm.v1.attention.backends.utils import CommonAttentionMetadata

    common_attn_metadata = CommonAttentionMetadata(
        num_reqs=len(requests),
        max_query_len=max(q_lens),
        max_seq_len=max_kv,
        num_actual_tokens=total_q,
        query_start_loc=q_start_gpu,
        query_start_loc_cpu=q_start_cpu,
        seq_lens=seq_lens_gpu,
        seq_lens_cpu_upper_bound=seq_lens_cpu,
        _seq_lens_cpu=seq_lens_cpu,
        _num_computed_tokens_cpu=num_computed_tokens_cpu,
        slot_mapping=slot_mapping,
        block_table_tensor=block_table_gpu,
        dcp_local_seq_lens=None,
    )

    # Use the production build() method
    metadata = builder_instance.build(
        common_prefix_len=0,
        common_attn_metadata=common_attn_metadata,
        fast_build=False,
    )

    return metadata, total_blocks


def _create_input_tensors(
    total_q: int,
    mla_dims: dict,
    query_format: str,
    device: torch.device,
    dtype: torch.dtype,
):
    """
    Allocate random query/key inputs for the decode and prefill code paths.

    The two paths need differently sized queries:
    - Decode queries use ``kv_lora_rank`` for the no-position part.
    - Prefill queries use ``qk_nope_head_dim`` for the no-position part.

    Args:
        total_q: Total number of query tokens
        mla_dims: MLA dimension configuration (reads ``num_q_heads``,
            ``kv_lora_rank``, ``qk_nope_head_dim``, ``qk_rope_head_dim``
            and ``v_head_dim``)
        query_format: ``"tuple"`` for a ``(q_nope, q_pe)`` decode query; any
            other value is treated as the concatenated format
        device: Target device
        dtype: Tensor dtype for all random tensors and the output buffer

    Returns:
        Tuple of (decode_inputs, prefill_inputs)
        - decode_inputs: ``(q_nope, q_pe)`` tuple, or a single tensor whose
          last dimension is ``kv_lora_rank + qk_rope_head_dim``
        - prefill_inputs: Dict with 'q', 'k_c_normed', 'k_pe', 'k_scale'
          (float32 ones of shape ``(1,)``) and 'output' (zero-filled buffer
          of shape ``(total_q, num_q_heads * v_head_dim)``)
    """
    heads = mla_dims["num_q_heads"]
    lora_rank = mla_dims["kv_lora_rank"]
    nope_dim = mla_dims["qk_nope_head_dim"]
    rope_dim = mla_dims["qk_rope_head_dim"]

    def _rand(*shape):
        return torch.randn(*shape, device=device, dtype=dtype)

    # NOTE: the order of the random draws below is kept fixed so that a
    # seeded generator yields the same tensors.
    if query_format == "tuple":
        decode_q_nope = _rand(total_q, heads, lora_rank)
        shared_q_pe = _rand(total_q, heads, rope_dim)
        decode_inputs = (decode_q_nope, shared_q_pe)

        # Prefill reuses the rotary part and swaps in a narrower no-position part.
        prefill_q_nope = _rand(total_q, heads, nope_dim)
        prefill_q = torch.cat([prefill_q_nope, shared_q_pe], dim=-1)
    else:
        # Concatenated format: one tensor per path.
        decode_inputs = _rand(total_q, heads, lora_rank + rope_dim)
        prefill_q = _rand(total_q, heads, nope_dim + rope_dim)

    prefill_inputs = {
        "q": prefill_q,
        "k_c_normed": _rand(total_q, lora_rank),
        # Middle dimension of 1: a single key head.
        "k_pe": _rand(total_q, 1, rope_dim),
        "k_scale": torch.ones(1, device=device, dtype=torch.float32),
        "output": torch.zeros(
            total_q,
            heads * mla_dims["v_head_dim"],
            device=device,
            dtype=dtype,
        ),
    }

    return decode_inputs, prefill_inputs


# ============================================================================
# Backend Initialization
# ============================================================================


def _create_backend_impl(
    backend_cfg: dict,
    mla_dims: dict,
    vllm_config: VllmConfig,
    device: torch.device,
    max_num_tokens: int = 8192,
    index_topk: int | None = None,
    kv_cache_dtype: str = "auto",
):
    """
    Create backend implementation instance.

    Args:
        backend_cfg: Backend configuration dict from _get_backend_config()
        mla_dims: MLA dimension configuration
        vllm_config: VllmConfig instance. When a builder is created, its
            ``compilation_config.static_forward_context`` is replaced with a
            single "placeholder" entry pointing at the mock layer.
        device: Target device
        max_num_tokens: Maximum number of tokens for sparse indexer buffer
        index_topk: Topk value for sparse MLA backends (2048 is used when a
            sparse backend is requested and this is None)
        kv_cache_dtype: KV cache dtype string passed to the implementation
            class as ``kv_cache_dtype``

    Returns:
        Tuple of (impl, layer, builder_instance, indexer). ``indexer`` is None
        for non-sparse backends; ``builder_instance`` is None when the backend
        config carries no builder class.
    """
    # Get classes from backend config (already resolved by _get_backend_config)
    impl_class = backend_cfg["impl_class"]
    builder_class = backend_cfg["builder_class"]

    # Calculate scale
    qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]
    scale = 1.0 / np.sqrt(qk_head_dim)

    # Create mock kv_b_proj layer for prefill mode
    mock_kv_b_proj = MockKVBProj(
        num_heads=mla_dims["num_q_heads"],
        qk_nope_head_dim=mla_dims["qk_nope_head_dim"],
        v_head_dim=mla_dims["v_head_dim"],
    )

    # Create indexer for sparse backends
    indexer = None
    if backend_cfg.get("is_sparse", False):
        if index_topk is None:
            index_topk = 2048  # Default topk for sparse MLA
        indexer = MockIndexer(
            max_num_tokens=max_num_tokens,
            topk_tokens=index_topk,
            device=device,
        )

    # Build impl kwargs
    impl_kwargs = {
        "num_heads": mla_dims["num_q_heads"],
        "head_size": mla_dims["head_dim"],
        "scale": scale,
        "num_kv_heads": mla_dims["num_kv_heads"],
        "alibi_slopes": None,
        "sliding_window": None,
        "kv_cache_dtype": kv_cache_dtype,
        "logits_soft_cap": None,
        "attn_type": "decoder",
        "kv_sharing_target_layer_name": None,
        "q_lora_rank": None,
        "kv_lora_rank": mla_dims["kv_lora_rank"],
        "qk_nope_head_dim": mla_dims["qk_nope_head_dim"],
        "qk_rope_head_dim": mla_dims["qk_rope_head_dim"],
        "qk_head_dim": qk_head_dim,
        "v_head_dim": mla_dims["v_head_dim"],
        "kv_b_proj": mock_kv_b_proj,
    }

    # Add indexer for sparse backends
    if indexer is not None:
        impl_kwargs["indexer"] = indexer

    # Create impl
    impl = impl_class(**impl_kwargs)

    # Initialize DCP attributes
    if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1):
        impl.dcp_world_size = 1
        impl.dcp_rank = 0

    # Create KV cache spec for MockLayer
    from vllm.v1.kv_cache_interface import FullAttentionSpec

    kv_cache_spec = FullAttentionSpec(
        block_size=backend_cfg["block_size"] or vllm_config.cache_config.block_size,
        num_kv_heads=1,  # MLA uses 1 KV head
        # Same per-token width that _run_single_benchmark uses for the KV
        # cache tensor; equals 576 for the standard DeepSeek dims (512 + 64).
        head_size=mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
        dtype=torch.bfloat16,
    )

    # Create mock layer
    layer = MockLayer(device, impl=impl, kv_cache_spec=kv_cache_spec)

    # Create builder instance if needed
    builder_instance = None
    if builder_class:
        # Populate static_forward_context so builder can find the layer
        # MockLayer inherits from AttentionLayerBase, so isinstance checks pass
        vllm_config.compilation_config.static_forward_context = {"placeholder": layer}

        builder_instance = builder_class(
            kv_cache_spec=kv_cache_spec,
            layer_names=["placeholder"],
            vllm_config=vllm_config,
            device=device,
        )

    return impl, layer, builder_instance, indexer


# ============================================================================
# Config Helpers
# ============================================================================


def _extract_mla_dims_from_config(config) -> dict | None:
    """
    Extract MLA dimensions from BenchmarkConfig if all required fields are present.

    Args:
        config: BenchmarkConfig instance

    Returns:
        Dict with MLA dimensions if all fields are provided, None otherwise
    """
    # Check if all MLA-specific fields are provided
    if all(
        [
            config.kv_lora_rank is not None,
            config.qk_nope_head_dim is not None,
            config.qk_rope_head_dim is not None,
            config.v_head_dim is not None,
        ]
    ):
        return {
            "kv_lora_rank": config.kv_lora_rank,
            "qk_nope_head_dim": config.qk_nope_head_dim,
            "qk_rope_head_dim": config.qk_rope_head_dim,
            "v_head_dim": config.v_head_dim,
            "num_q_heads": config.num_q_heads,
            "num_kv_heads": config.num_kv_heads,
            "head_dim": config.head_dim,
        }
    # Fallback: if MLA fields not fully specified, try to construct from basic fields
    elif config.head_dim == 576:
        # This looks like a DeepSeek MLA config, use standard dimensions with custom
        # head count
        return {
            "kv_lora_rank": 512,
            "qk_nope_head_dim": 128,
            "qk_rope_head_dim": 64,
            "v_head_dim": 128,
            "num_q_heads": config.num_q_heads,
            "num_kv_heads": config.num_kv_heads,
            "head_dim": config.head_dim,
        }
    return None


# ============================================================================
# Benchmark Execution
# ============================================================================


def _run_single_benchmark(
    config,
    impl,
    layer,
    builder_instance,
    backend_cfg: dict,
    mla_dims: dict,
    device: torch.device,
    indexer=None,
    kv_cache_dtype: str | None = None,
) -> BenchmarkResult:
    """
    Run a single benchmark iteration.

    Args:
        config: BenchmarkConfig instance
        impl: Backend implementation instance
        layer: MockLayer instance
        builder_instance: Metadata builder instance
        backend_cfg: Backend configuration dict
        mla_dims: MLA dimension configuration
        device: Target device
        indexer: Optional MockIndexer for sparse backends
        kv_cache_dtype: KV cache dtype string. When None, falls back to
            ``config.kv_cache_dtype`` and then to "auto".

    Returns:
        BenchmarkResult with timing statistics (seconds per layer)

    Raises:
        ValueError: If the batch spec yields no requests.
        RuntimeError: If the built metadata has neither decode nor prefill
            metadata.
    """
    # Parse batch spec
    requests = parse_batch_spec(config.batch_spec)
    if not requests:
        raise ValueError(f"Batch spec {config.batch_spec!r} produced no requests")
    q_lens = [r.q_len for r in requests]
    kv_lens = [r.kv_len for r in requests]
    total_q = sum(q_lens)
    max_kv_len = max(kv_lens)

    # Determine block size
    block_size = backend_cfg["block_size"] or config.block_size

    # Build metadata
    metadata, num_blocks = _build_attention_metadata(
        requests, block_size, device, builder_instance
    )

    # Create KV cache
    if kv_cache_dtype is None:
        # `or "auto"` also covers a config whose kv_cache_dtype attribute
        # exists but is None, which would otherwise break .startswith() below.
        kv_cache_dtype = getattr(config, "kv_cache_dtype", None) or "auto"
    head_size = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"]
    if kv_cache_dtype == "fp8_ds_mla":
        # FlashMLA sparse custom format: 656 bytes per token, stored as uint8.
        # Layout: kv_lora_rank fp8 bytes + 4 float32 tile scales
        #         + 2*rope_dim bf16 bytes
        # = 512 + 16 + 128 = 656 bytes for DeepSeek dims.
        kv_cache = torch.zeros(
            num_blocks,
            block_size,
            656,
            device=device,
            dtype=torch.uint8,
        )
    elif kv_cache_dtype == "fp8":
        from vllm.platforms import current_platform

        kv_cache = torch.zeros(
            num_blocks,
            block_size,
            head_size,
            device=device,
            dtype=torch.uint8,
        ).view(current_platform.fp8_dtype())
    else:
        # NOTE(review): other "fp8*" strings also land here and get a bf16
        # cache while still counting as fp8 for query quantization below;
        # confirm whether that is intended.
        kv_cache = torch.zeros(
            num_blocks,
            block_size,
            head_size,
            device=device,
            dtype=torch.bfloat16,
        )

    # Fill indexer with random indices for sparse backends
    is_sparse = backend_cfg.get("is_sparse", False)
    if is_sparse and indexer is not None:
        indexer.fill_random_indices(total_q, max_kv_len)

    # Determine which forward methods to use based on metadata.
    # Sparse MLA backends always use forward_mqa
    has_decode = is_sparse or getattr(metadata, "decode", None) is not None
    has_prefill = not is_sparse and getattr(metadata, "prefill", None) is not None
    if not has_decode and not has_prefill:
        raise RuntimeError("Metadata has neither decode nor prefill metadata")

    # Split the query tokens between the two paths. Only a mixed batch needs
    # the builder's own count; otherwise one path takes every token.
    if has_decode and has_prefill:
        num_decode = metadata.num_decode_tokens
    elif has_decode:
        num_decode = total_q
    else:
        num_decode = 0
    num_prefill = total_q - num_decode

    # Some backends requires fp8 queries when using fp8 KV cache.
    is_fp8_kvcache = kv_cache_dtype.startswith("fp8")
    quantize_query = is_fp8_kvcache and getattr(
        impl, "supports_quant_query_input", False
    )

    # quantize_query forces concat format
    query_fmt = "concat" if quantize_query else backend_cfg["query_format"]

    # Create decode query tensors
    if has_decode:
        decode_inputs, _ = _create_input_tensors(
            num_decode, mla_dims, query_fmt, device, torch.bfloat16
        )
        # Cast decode query to fp8 if the backend supports it
        if quantize_query:
            from vllm.platforms import current_platform

            if isinstance(decode_inputs, tuple):
                decode_inputs = torch.cat(list(decode_inputs), dim=-1)
            decode_inputs = decode_inputs.to(current_platform.fp8_dtype())

    # Create prefill input tensors
    if has_prefill:
        _, prefill_inputs = _create_input_tensors(
            num_prefill, mla_dims, query_fmt, device, torch.bfloat16
        )

    # Build forward function. decode_inputs / prefill_inputs are only bound
    # when the matching has_* flag is set, and only read under that flag.
    def forward_fn():
        results = []
        if has_decode:
            results.append(impl.forward_mqa(decode_inputs, kv_cache, metadata, layer))
        if has_prefill:
            results.append(
                impl.forward_mha(
                    prefill_inputs["q"],
                    prefill_inputs["k_c_normed"],
                    prefill_inputs["k_pe"],
                    kv_cache,
                    metadata,
                    prefill_inputs["k_scale"],
                    prefill_inputs["output"],
                )
            )
        return results[0] if len(results) == 1 else tuple(results)

    # Warmup
    for _ in range(config.warmup_iters):
        forward_fn()
    torch.accelerator.synchronize()

    # Optionally capture a CUDA graph after warmup.
    # Graph replay eliminates CPU launch overhead so timings reflect pure
    # kernel time.
    if config.use_cuda_graphs:
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph):
            forward_fn()
        benchmark_fn = graph.replay
    else:
        benchmark_fn = forward_fn

    # Benchmark: each repeat times num_layers back-to-back calls and records
    # the average per-layer time in seconds.
    times = []
    for _ in range(config.repeats):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)

        start.record()
        for _layer_idx in range(config.num_layers):
            benchmark_fn()
        end.record()

        torch.accelerator.synchronize()
        elapsed_ms = start.elapsed_time(end)
        times.append(elapsed_ms / 1000.0 / config.num_layers)

    mean_time = float(np.mean(times))
    return BenchmarkResult(
        config=config,
        mean_time=mean_time,
        std_time=float(np.std(times)),
        min_time=float(np.min(times)),
        max_time=float(np.max(times)),
        throughput_tokens_per_sec=total_q / mean_time if mean_time > 0 else 0,
    )


def _run_mla_benchmark_batched(
    backend: str,
    configs_with_params: list[tuple],  # [(config, threshold, num_splits), ...]
889
    index_topk: int = 2048,
890
    prefill_backend: str | None = None,
891
892
893
894
) -> list[BenchmarkResult]:
    """
    Unified batched MLA benchmark runner for all backends.

895
896
    Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
               flashinfer_mla_sparse, flashmla_sparse
897
898
899
900
901

    This function reuses backend initialization across multiple benchmarks
    to avoid setup/teardown overhead.

    Args:
902
        backend: Backend name (decode backend used for impl construction)
903
904
905
        configs_with_params: List of (config, threshold, num_splits) tuples
            - threshold: reorder_batch_threshold (FlashAttn/FlashMLA only)
            - num_splits: num_kv_splits (CUTLASS only)
906
        index_topk: Topk value for sparse MLA backends (default 2048)
907
908
        prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
            When set, forces the specified FlashAttention version for prefill.
909
910
911
912
913
914
915
916
917

    Returns:
        List of BenchmarkResult objects
    """
    if not configs_with_params:
        return []

    backend_cfg = _get_backend_config(backend)
    device = torch.device(configs_with_params[0][0].device)
918
    torch.accelerator.set_device_index(device)
919
920
921
922
923
924
925
926
927
928
929
930
931

    # Determine block size
    config_block_size = configs_with_params[0][0].block_size
    block_size = backend_cfg["block_size"] or config_block_size

    # Extract MLA dimensions from the first config
    first_config = configs_with_params[0][0]
    mla_dims = _extract_mla_dims_from_config(first_config)

    # If config didn't provide MLA dims, fall back to default model
    if mla_dims is None:
        mla_dims = setup_mla_dims("deepseek-v3")

932
933
934
    # Determine if this is a sparse backend
    is_sparse = backend_cfg.get("is_sparse", False)

935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
    # Extract kv_cache_dtype from the first config
    kv_cache_dtype = getattr(first_config, "kv_cache_dtype", "auto")

    # FlashMLA sparse only supports "fp8_ds_mla" internally (not generic "fp8").
    # Remap here so the user can pass --kv-cache-dtype fp8 regardless of backend.
    if backend.upper() == "FLASHMLA_SPARSE" and kv_cache_dtype == "fp8":
        kv_cache_dtype = "fp8_ds_mla"

    # Compute max total_q across all configs so the metadata builder buffer
    # and scheduler config are large enough for all batch specs.
    max_total_q = max(
        sum(r.q_len for r in parse_batch_spec(cfg.batch_spec))
        for cfg, *_ in configs_with_params
    )

950
951
952
953
    # Create and set vLLM config for MLA (reused across all benchmarks)
    vllm_config = create_minimal_vllm_config(
        model_name="deepseek-v3",  # Used only for model path
        block_size=block_size,
954
        max_num_batched_tokens=max_total_q,
955
        mla_dims=mla_dims,  # Use custom dims from config or default
956
        index_topk=index_topk if is_sparse else None,
957
        prefill_backend=prefill_backend,
958
        kv_cache_dtype=kv_cache_dtype,
959
960
961
962
963
    )

    results = []

    with set_current_vllm_config(vllm_config):
964
965
966
967
968
969
970
971
972
973
974
975
976
        # Clear cached prefill backend detection functions so they re-evaluate
        # with the current VllmConfig. These are @functools.cache decorated and
        # would otherwise return stale results from a previous backend's config.
        from vllm.model_executor.layers.attention.mla_attention import (
            use_cudnn_prefill,
            use_flashinfer_prefill,
            use_trtllm_ragged_deepseek_prefill,
        )

        use_flashinfer_prefill.cache_clear()
        use_cudnn_prefill.cache_clear()
        use_trtllm_ragged_deepseek_prefill.cache_clear()

977
978
979
980
981
982
        # Create backend impl, layer, builder, and indexer (reused across benchmarks)
        impl, layer, builder_instance, indexer = _create_backend_impl(
            backend_cfg,
            mla_dims,
            vllm_config,
            device,
983
            max_num_tokens=max_total_q,
984
            index_topk=index_topk if is_sparse else None,
985
            kv_cache_dtype=kv_cache_dtype,
986
987
        )

988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
        # Verify the actual prefill backend matches what was requested
        if prefill_backend is not None:
            prefill_cfg = get_prefill_backend_config(prefill_backend)
            fa_version = prefill_cfg["flash_attn_version"]

            if fa_version is not None:
                # FA backend: verify the impl's FA version
                actual_fa_version = getattr(impl, "vllm_flash_attn_version", None)
                if actual_fa_version != fa_version:
                    raise RuntimeError(
                        f"Prefill backend '{prefill_backend}' requested FA "
                        f"version {fa_version}, but the impl is using FA "
                        f"version {actual_fa_version}. Check "
                        f"vllm/v1/attention/backends/fa_utils.py."
                    )
            else:
                # Non-FA backend: verify the builder picked the right path
                expected_flags = {
                    "flashinfer": "_use_fi_prefill",
                    "cudnn": "_use_cudnn_prefill",
                    "trtllm": "_use_trtllm_ragged_prefill",
                }
                flag_name = expected_flags.get(prefill_backend)
                if flag_name and not getattr(builder_instance, flag_name, False):
                    raise RuntimeError(
                        f"Prefill backend '{prefill_backend}' was requested "
                        f"but the metadata builder did not enable it. This "
                        f"usually means a dependency is missing (e.g., "
                        f"flashinfer not installed) or the platform doesn't "
                        f"support it."
                    )

1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
        # Run each benchmark with the shared impl
        for config, threshold, num_splits in configs_with_params:
            # Set threshold for this benchmark (FlashAttn/FlashMLA only)
            original_threshold = None
            if threshold is not None and builder_instance:
                original_threshold = builder_instance.reorder_batch_threshold
                builder_instance.reorder_batch_threshold = threshold

            # Set num_splits for CUTLASS
            original_num_splits = None
            if num_splits is not None and hasattr(impl, "_num_kv_splits"):
                original_num_splits = impl._num_kv_splits
                impl._num_kv_splits = num_splits

            try:
                result = _run_single_benchmark(
                    config,
                    impl,
                    layer,
                    builder_instance,
                    backend_cfg,
                    mla_dims,
                    device,
1043
                    indexer=indexer,
1044
                    kv_cache_dtype=kv_cache_dtype,
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
                )
                results.append(result)

            finally:
                # Restore original threshold
                if original_threshold is not None:
                    builder_instance.reorder_batch_threshold = original_threshold

                # Restore original num_splits
                if original_num_splits is not None:
                    impl._num_kv_splits = original_num_splits

    return results


# ============================================================================
# Public API
# ============================================================================


def run_mla_benchmark(
    backend: str,
    config,
    reorder_batch_threshold: int | None = None,
    num_kv_splits: int | None = None,
    index_topk: int = 2048,
    prefill_backend: str | None = None,
) -> BenchmarkResult | list[BenchmarkResult]:
    """
    Unified MLA benchmark runner for all backends.

    Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
               flashinfer_mla_sparse, flashmla_sparse

    Always uses batched execution internally: every input form is normalized
    to a list of (config, threshold, num_splits) tuples and handed to
    _run_mla_benchmark_batched.

    Args:
        backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla,
                 flashinfer_mla_sparse, flashmla_sparse). Matched
                 case-insensitively when deciding how to interpret ``param``.
        config: One of
            - a single BenchmarkConfig (single mode),
            - a list of BenchmarkConfig (batched, no per-config params),
            - a list of (BenchmarkConfig, param) tuples (batched), where
              ``param`` is the reorder_batch_threshold for flashattn_mla /
              flashmla / flashmla_sparse and num_kv_splits for every other
              backend.
        reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA
                                 (single config mode only)
        num_kv_splits: Number of KV splits for CUTLASS (single config mode only)
        index_topk: Topk value for sparse MLA backends (default 2048)
        prefill_backend: Prefill backend name (e.g., "fa3", "fa4").
            Forwarded unchanged to the batched runner.

    Returns:
        BenchmarkResult (single mode) or list of BenchmarkResult (batched mode)
    """
    # Normalize to batched mode: (config, threshold, num_splits)
    if isinstance(config, list):
        # Already in batched format
        if len(config) > 0 and isinstance(config[0], tuple):
            # Format: [(cfg, param), ...] where param is threshold or num_splits.
            # Compare case-insensitively so upper-case backend names route the
            # param the same way as their lower-case spelling.
            if backend.lower() in ("flashattn_mla", "flashmla", "flashmla_sparse"):
                configs_with_params = [(cfg, param, None) for cfg, param in config]
            else:  # cutlass_mla, flashinfer_mla, flashinfer_mla_sparse, ...
                configs_with_params = [(cfg, None, param) for cfg, param in config]
        else:
            # Format: [cfg, ...] - just configs
            configs_with_params = [(cfg, None, None) for cfg in config]
        return_single = False
    else:
        # Single config: convert to batched format
        configs_with_params = [(config, reorder_batch_threshold, num_kv_splits)]
        return_single = True

    # Use unified batched execution
    results = _run_mla_benchmark_batched(
        backend, configs_with_params, index_topk, prefill_backend=prefill_backend
    )

    # Return single result or list based on input
    return results[0] if return_single else results