Fix: GLM-5 量化模型 mla_attention layout 修复 && sparse_attn fp8 支持

5b9ad722 · lixh6 · a56e3da7 · 5b9ad722 · 5b9ad722
Commit 5b9ad722 authored Mar 12, 2026 by lixh6
Showing with 24 additions and 9 deletions

vllm/model_executor/layers/attention/mla_attention.py +13 -7

vllm/model_executor/layers/sparse_attn_indexer.py +11 -2

No files found.
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -1258,11 +1258,17 @@ class MLACommonBaseImpl(MLAAttentionImpl[A], Generic[A]):
                self.kv_b_proj, out_dtype=act_dtype
            ).T

-        assert kv_b_proj_weight.shape == (
+        expected_shape = (
            self.kv_lora_rank,
            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
-        ), (
-            f"{kv_b_proj_weight.shape=}, "
+        )
+        if kv_b_proj_weight.shape != expected_shape:
+            if kv_b_proj_weight.T.shape == expected_shape:
+                kv_b_proj_weight = kv_b_proj_weight.T.contiguous()
+            else:
+                raise ValueError(
+                    f"kv_b_proj_weight.shape={kv_b_proj_weight.shape}, "
+                    f"expected={expected_shape}, "
                    f"{self.kv_lora_rank=}, "
                    f"{self.num_heads=}, "
                    f"{self.qk_nope_head_dim=}, "

--- a/vllm/model_executor/layers/sparse_attn_indexer.py
+++ b/vllm/model_executor/layers/sparse_attn_indexer.py
@@ -94,7 +94,7 @@ def sparse_attn_indexer(
                ((total_seq_lens, 4), torch.uint8),
            )
        for chunk in prefill_metadata.chunks:
-            if not current_platform.is_rocm() or torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938":                       
+            if not current_platform.is_rocm(): # or torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938":                       
                k_fp8 = k_fp8_full[: chunk.total_seq_lens]
                k_scale = k_scale_full[: chunk.total_seq_lens]
                ops.cp_gather_indexer_k_quant_cache(
@@ -113,6 +113,15 @@ def sparse_attn_indexer(
                    chunk.cu_seqlen_ke,
                )
            else:
+                k_fp8 = k_fp8_full[: chunk.total_seq_lens]
+                k_scale = k_scale_full[: chunk.total_seq_lens]
+                ops.cp_gather_indexer_k_quant_cache(
+                    kv_cache,
+                    k_fp8,
+                    k_scale,
+                    chunk.block_table,
+                    chunk.cu_seq_lens,
+                )                
                logits = op.mqa_logits(
                    q_fp8[chunk.token_start:chunk.token_end],  
                    k,