support deepseek_v2 nn layout

a8b2d878 · zhuwenwen · 211835ef · a8b2d878 · a8b2d878
Commit a8b2d878 authored Feb 13, 2025 by zhuwenwen
Showing with 5 additions and 7 deletions

vllm/model_executor/model_loader/utils.py +1 -1

vllm/model_executor/models/deepseek_v2.py +4 -6

No files found.
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -80,7 +80,7 @@ def get_model_architecture(
    architectures = getattr(model_config.hf_config, "architectures", [])
    visions = getattr(model_config.hf_config, "visual", []) or getattr(model_config.hf_config, "vision_config", [])
    support_nn_architectures = ['LlamaForCausalLM', 'QWenLMHeadModel', 'Qwen2ForCausalLM', 'Qwen2VLForConditionalGeneration', 'Qwen2MoeForCausalLM', 'ChatGLMModel', 'ChatGLMForConditionalGeneration', 
-                                'BaichuanForCausalLM', 'BloomForCausalLM', 'MedusaModel', 'MixtralForCausalLM', 'MLPSpeculatorPreTrainedModel', 'FalconForCausalLM', 'DeepseekV3ForCausalLM']  
+                                'BaichuanForCausalLM', 'BloomForCausalLM', 'MedusaModel', 'MixtralForCausalLM', 'MLPSpeculatorPreTrainedModel', 'FalconForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM']  
    if any(arch in architectures for arch in support_nn_architectures): 
        if os.getenv('LLAMA_NN') != '0': 
             if (architectures == ['QWenLMHeadModel'] or architectures == ['ChatGLMModel'] ) and visions != []:

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -843,17 +843,15 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
            if self.use_llama_nn and self.quant_method is None:
                lay_key_words = [
-                    "self_attn.q_a_proj.weight",
+                    "self_attn.q_proj.weight",
                    "self_attn.kv_a_proj_with_mqa.weight",
-                    "mlp.gate.weight",
+                    "self_attn.kv_b_proj.weight",
+                    "self_attn.o_proj.weight",
                    "mlp.gate_up_proj.weight",
                    "mlp.down_proj",
+                    "mlp.gate.weight",
                    "shared_experts.gate_up_proj",
                    "shared_experts.down_proj",
-                    "self_attn.q_proj.weight",
-                    "self_attn.q_b_proj.weight",
-                    "self_attn.kv_b_proj.weight",
-                    "self_attn.o_proj.weight",
                    "lm_head.weight"
                ]