适配 DeepSeek-V3/V2 MoE AWQ 的推理支持

47bd229c · yangql · 4a734b9d · 47bd229c · 47bd229c · 47bd229c
Commit 47bd229c authored Feb 20, 2025 by yangql
11 changed files
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2048_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2048_K100_AI.json
+{
+    "7168_2048": {
+        "1": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "2": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "3": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "4": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "5": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "6": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "7": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "8": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "9": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "10": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "11": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "12": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "13": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "14": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "15": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "16": {
+            "BLOCK_SIZE_M": 16,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 8,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "32": {
+            "BLOCK_SIZE_M": 32,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 4,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 0
+        },
+        "64": {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "128": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 32,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 0,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "256": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "512": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 4,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "1024": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 64,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        },
+        "2048": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 64,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 8,
+            "num_ldmatrixes": 1
+        },
+        "4096": {
+            "BLOCK_SIZE_M": 128,
+            "BLOCK_SIZE_N": 128,
+            "BLOCK_SIZE_K": 32,
+            "GROUP_SIZE_M": 8,
+            "SPLIT_K": 1,
+            "num_stages": 1,
+            "num_warps": 4,
+            "num_ldmatrixes": 1
+        }
+    }
+}
\ No newline at end of file
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2304_BW200.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2304_BW200.json
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2304_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_2304_K100_AI.json
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_256_BW200.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_256_BW200.json
--- a/vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_256_K100_AI.json
+++ b/vllm/model_executor/layers/quantization/configs/awq/AWQ_7168_256_K100_AI.json
--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -277,6 +277,10 @@ class MoeWNA16Method(FusedMoEMethodBase):
        custom_routing_function: Optional[Callable] = None,
        scoring_func: str = "softmax",
        e_score_correction_bias: Optional[torch.Tensor] = None,
+        use_nn_moe: Optional[bool] = False,
+        moe_ep_size: Optional[int] = None,
+        start_expert: Optional[int] = None,
+        end_expert: Optional[int] = None,
    ) -> torch.Tensor:
        from vllm.model_executor.layers.fused_moe import fused_experts

@@ -307,7 +311,9 @@ class MoeWNA16Method(FusedMoEMethodBase):
                             w2_scale=layer.w2_scales,
                             w1_zp=layer.w13_qzeros if has_zp else None,
                             w2_zp=layer.w2_qzeros if has_zp else None,
-                             block_shape=[0, layer.group_size])
+                             block_shape=[0, layer.group_size],
+                             use_nn_moe=False,
+                             )

    @staticmethod
    def get_weight_loader(layer, weight_loader):

--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -27,7 +27,7 @@ from torch import nn
 from transformers import PretrainedConfig
 import os
 import re
-
+import vllm.envs as envs
 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
@@ -517,9 +517,11 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                    weight.data.copy_(_weight)
                    
                    weight.data=weight.data.reshape(ori_shape[1], -1)
-
-        if self.quant_method == "awq":
+        else:
            os.environ['LM_NN'] = '0'
+            os.environ['LLAMA_NN'] = '0'
+            
+        if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
            lay_key_words = [
                "self_attn.W_pack.qweight",
                "self_attn.o_proj.qweight",

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -25,7 +25,7 @@
 import os
 import re
 from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
-
+import vllm.envs as envs
 import torch
 from torch import nn
 from transformers import PretrainedConfig
@@ -666,8 +666,17 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
+        # AWQ does not support cutlass for now
+        envs.VLLM_USE_TRITON_AWQ = True
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
+
+        self.quant_method = None
+        if quant_config is not None:
+            self.quant_method = quant_config.get_name()
+            os.environ['LLAMA_NN'] = '0'
+            os.environ['LM_NN'] = '0'
+
        self.config = config
        self.quant_config = quant_config
        self.parallel_config = vllm_config.parallel_config
@@ -683,12 +692,8 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)
-        
-        self.quant_method = None
-        if quant_config is not None:
-            self.quant_method=quant_config.get_name()
        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
-
+        self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
        
    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.get_input_embeddings(input_ids)
@@ -870,6 +875,53 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
                    weight.data.copy_(_weight)
                    
                    weight.data=weight.data.reshape(ori_shape[1],-1)
+        # TN is not supported for now
+        if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
+            lay_key_words = [
+                "self_attn.q_a_proj.qweight",
+                "self_attn.q_b_proj.qweight",
+                "self_attn.kv_a_proj_with_mqa.qweight",
+                "self_attn.kv_b_proj.qweight",
+                "self_attn.o_proj.qweight",
+                "mlp.gate_up_proj.qweight",
+                "mlp.down_proj.qweight",
+                "mlp.shared_experts.gate_up_proj.qweight",
+                "mlp.shared_experts.down_proj.qweight"
+            ]
+            combined_words = "|".join(lay_key_words)
+            
+            for layername in loaded_params:
+                weight = params_dict[layername]
+                
+                matches = re.findall(combined_words, layername)
+                if matches:
+                    qweight =params_dict[layername]
+                    qzeros=params_dict[layername.replace("qweight", "qzeros")]
+                    scales=params_dict[layername.replace("qweight", "scales")]
+                    zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
+                    
+                    group_size= self.quant_config.group_size 
+                   
+                    dim_n = scales.data.shape[1]
+                    dim_k = qweight.data.shape[0]
+                    pad_group=2              
+                    
+                    _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) 
+                    
+                    sz = ops.sz_permute(_sz).reshape(-1,dim_n)       
+                    
+                    zeros_and_scalse.data.copy_(sz)
+                    qweight.data.copy_(_qw)
+                    
+                    #reshape
+                    zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1)    #[k/greop_size,n]------>[n,k/group_size]
+                    qweight.data=qweight.data.reshape(dim_n,-1)                      #[k,n/8]---->[n,k/8]  
+                
+                    if dim_k % 4096==0 and self.use_awq_pad:
+                        zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
+                        zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
+                        qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
+                        qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()

        return loaded_params


--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py