[Bugfix] Fix DeepSeek R1 with CUTLASS MLA broken on B200 (#33637)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>

[Bugfix] Fix DeepSeek R1 with CUTLASS MLA broken on B200 (#33637)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
a7be77be · Chauncey · GitHub · bbe0574d · a7be77be
Unverified commit a7be77be, authored Feb 05, 2026 by Chauncey; committed by GitHub on Feb 05, 2026.
Hide whitespace changes
Inline Side-by-side

Showing with 1 addition and 4 deletions

vllm/model_executor/layers/attention/mla_attention.py vllm/model_executor/layers/attention/mla_attention.py +1 -4

No files found.
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -293,7 +293,6 @@ class MLAAttention(nn.Module, AttentionLayerBase):
        prefix: str = "",
        use_sparse: bool = False,
        indexer: object | None = None,
-        q_pad_num_heads: int | None = None,
        **extra_impl_args,
    ):
        super().__init__()
@@ -308,7 +307,6 @@ class MLAAttention(nn.Module, AttentionLayerBase):
        self.head_size = kv_lora_rank + qk_rope_head_dim
        self.layer_name = prefix
        self.indexer = indexer
-        self.q_pad_num_heads = q_pad_num_heads
        self.num_kv_heads = 1
        self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
@@ -375,10 +373,9 @@ class MLAAttention(nn.Module, AttentionLayerBase):
            v_head_dim=self.v_head_dim,
            kv_b_proj=kv_b_proj,
            indexer=indexer,
-            q_pad_num_heads=q_pad_num_heads,
            **extra_impl_args,
        )
+        self.q_pad_num_heads = getattr(self.impl, "q_pad_num_heads", None)
        self.use_direct_call = not current_platform.opaque_attention_op()
        compilation_config = get_current_vllm_config().compilation_config