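支持 fp8 MQA（gfx938）；跳过 VLLM_USE_FUSED_FILL_RMS_CAT 融合路径；跳过部分权重的 load_error 检查 — Support fp8 MQA logits on gfx938; bypass the VLLM_USE_FUSED_FILL_RMS_CAT fused path in DeepSeek MTP; skip the "weights not initialized" load error for selected weights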

c5fa1992 · liuchy5 · 3824b261 · c5fa1992 · c5fa1992 · c5fa1992
Commit c5fa1992 authored Mar 11, 2026 by liuchy5
3 changed files
--- a/vllm/model_executor/layers/sparse_attn_indexer.py
+++ b/vllm/model_executor/layers/sparse_attn_indexer.py
@@ -112,6 +112,29 @@ def sparse_attn_indexer(
                    chunk.cu_seqlen_ks,
                    chunk.cu_seqlen_ke,
                )
+            elif torch.cuda.get_device_properties("cuda").gcnArchName.split(':')[0] == "gfx938":
+                k_fp8 = k_fp8_full[: chunk.total_seq_lens]
+                k_scale = k_scale_full[: chunk.total_seq_lens]
+                ops.cp_gather_indexer_k_quant_cache(
+                    kv_cache,
+                    k_fp8,
+                    k_scale,
+                    chunk.block_table,
+                    chunk.cu_seq_lens,
+                )
+                logits = op.mqa_logits(
+                    q_fp8[chunk.token_start:chunk.token_end],  
+                    k_fp8, 
+                    weights[chunk.token_start:chunk.token_end], 
+                    chunk.cu_seqlen_ks, 
+                    chunk.cu_seqlen_ke,
+                    q_fp8[chunk.token_start:chunk.token_end].shape[0],
+                    k_fp8.shape[0],
+                    q_fp8.shape[1],
+                    q_fp8.shape[2],
+                    k_scale.view(torch.float32).flatten(),
+                    True
+                    )
            else:                
                logits = op.mqa_logits(
                    q_fp8[chunk.token_start:chunk.token_end],  

--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -298,6 +298,7 @@ class DefaultModelLoader(BaseModelLoader):
        if model_config.quantization is None and loaded_weights is not None:
            weights_not_loaded = weights_to_load - loaded_weights
            weights_not_loaded = {k for k in weights_not_loaded if not k.endswith("indexer.weights_proj.bias")}
+            weights_not_loaded = {k for k in weights_not_loaded if k not in ['model.layers.78.shared_head.head.weight', 'model.embed_tokens.weight']}
            if weights_not_loaded:
                raise ValueError(
                    "Following weights were not initialized from "

--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -112,7 +112,7 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module):
    ) -> torch.Tensor:
        assert inputs_embeds is not None
        # masking inputs at position 0, as not needed by MTP
-        if envs.VLLM_USE_FUSED_FILL_RMS_CAT:
+        if False:
            hidden_states_fuse = torch.empty(inputs_embeds.shape[0], inputs_embeds.shape[1]*2, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
            torch.ops.vllm.fuse_fill_rms_x2_concat(hidden_states_fuse, positions, inputs_embeds, previous_hidden_states, self.enorm.weight, self.hnorm.weight, self.enorm.variance_epsilon)
            hidden_states = self.eh_proj(hidden_states_fuse)