Merge remote-tracking branch 'origin/v0.8.5.post1-dev_yql' into v0.8.5.post1-dev

5fa14eef · zhuwenwen · 3b5d646e · a94ed3ea · 5fa14eef · 5fa14eef
Commit 5fa14eef authored May 22, 2025 by zhuwenwen
6 changed files
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -237,14 +237,35 @@ class AWQLinearMethod(LinearMethodBase):
            default_execution(input_size_per_partition,output_size_per_partition)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Finalize the AWQ tensors on ``layer`` after checkpoint loading.
+
+        When ``envs.VLLM_USE_TRITON_AWQ`` is falsy, ``layer.qweight``,
+        ``layer.qzeros`` and ``layer.scales`` are passed through
+        ``ops.convert_s4`` / ``ops.sz_permute`` and reshaped to ``dim_n`` rows.
+        The results are stored as ``layer.qweight`` and
+        ``layer.zeros_and_scales``, and ``layer.qzeros`` / ``layer.scales``
+        are set to ``None``. Otherwise the three original tensors are
+        re-registered unchanged as non-trainable parameters.
+
+        Args:
+            layer: Module holding the loaded ``qweight``, ``qzeros`` and
+                ``scales`` tensors.
+        """
+        if not envs.VLLM_USE_TRITON_AWQ:
+            group_size = int(self.quant_config.group_size)
+            pad_group = 2
+            dim_n = layer.scales.data.shape[1]
+            dim_k = layer.qweight.data.shape[0]
+            _qw, _sz = ops.convert_s4(layer.qweight, layer.qzeros,
+                                      layer.scales, group_size)
+            # A single reshape is enough: reshape(-1, dim_n) followed by
+            # reshape(dim_n, -1) yields the same row-major layout.
+            sz = ops.sz_permute(_sz).reshape(dim_n, -1)
+            _qw = _qw.reshape(dim_n, -1)
+
+            if dim_k % 4096 == 0 and self.use_awq_pad:
+                # Allocate the zero padding directly on the device of the
+                # tensor it is concatenated with, instead of building it on
+                # the CPU and copying it to the current CUDA device.
+                sz_pad = torch.zeros(dim_n, pad_group, dtype=torch.int32,
+                                     device=sz.device)
+                sz = torch.cat((sz, sz_pad), dim=1).contiguous()
+                qweight_pad = torch.zeros(dim_n, group_size // 4,
+                                          dtype=torch.int32,
+                                          device=_qw.device)
+                _qw = torch.cat((_qw, qweight_pad), dim=1).contiguous()
+
+            layer.qweight = torch.nn.Parameter(_qw, requires_grad=False)
+            layer.zeros_and_scales = torch.nn.Parameter(sz,
+                                                        requires_grad=False)
+            # The separate zero-point and scale tensors are dropped here.
+            layer.qzeros = None
+            layer.scales = None
+        else:
            layer.qweight = torch.nn.Parameter(layer.qweight.data,
                                            requires_grad=False)
            layer.qzeros = torch.nn.Parameter(layer.qzeros.data,
                                            requires_grad=False)
            layer.scales = torch.nn.Parameter(layer.scales.data,
                                            requires_grad=False)
-        layer.zeros_and_scales = torch.nn.Parameter(layer.zeros_and_scales.data,
-                                          requires_grad=False)

    def apply(self,
              layer: torch.nn.Module,

--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -424,47 +424,47 @@ class BaiChuanModel(nn.Module):
            os.environ['LM_NN'] = '0'
            os.environ['LLAMA_NN'] = '0'
            
-        if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
-            lay_key_words = [
-                "self_attn.W_pack.qweight",
-                "self_attn.o_proj.qweight",
-                "mlp.gate_up_proj.qweight",
-                "mlp.down_proj.qweight"
-            ]
-            combined_words = "|".join(lay_key_words)
-            
-            for layername in loaded_params:
-                weight = params_dict[layername]
-                
-                matches = re.findall(combined_words, layername)
-                if matches:
-                    qweight =params_dict[layername]
-                    qzeros=params_dict[layername.replace("qweight", "qzeros")]
-                    scales=params_dict[layername.replace("qweight", "scales")]
-                    zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
-                    
-                    group_size= self.quant_config.group_size 
-                   
-                    dim_n = scales.data.shape[1]
-                    dim_k = qweight.data.shape[0]
-                    pad_group=2              
-                    
-                    _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) 
-                    
-                    sz = ops.sz_permute(_sz).reshape(-1,dim_n)       
-                    
-                    zeros_and_scalse.data.copy_(sz)
-                    qweight.data.copy_(_qw)
-                    
-                    #reshape
-                    zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1)    #[k/greop_size,n]------>[n,k/group_size]
-                    qweight.data=qweight.data.reshape(dim_n,-1)                      #[k,n/8]---->[n,k/8]  
-                
-                    if dim_k % 4096==0 and self.use_awq_pad:
-                        zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
-                        zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
-                        qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
-                        qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()  
+        # if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
+        #     lay_key_words = [
+        #         "self_attn.W_pack.qweight",
+        #         "self_attn.o_proj.qweight",
+        #         "mlp.gate_up_proj.qweight",
+        #         "mlp.down_proj.qweight"
+        #     ]
+        #     combined_words = "|".join(lay_key_words)
+            
+        #     for layername in loaded_params:
+        #         weight = params_dict[layername]
+                
+        #         matches = re.findall(combined_words, layername)
+        #         if matches:
+        #             qweight =params_dict[layername]
+        #             qzeros=params_dict[layername.replace("qweight", "qzeros")]
+        #             scales=params_dict[layername.replace("qweight", "scales")]
+        #             zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
+                    
+        #             group_size= self.quant_config.group_size 
+                   
+        #             dim_n = scales.data.shape[1]
+        #             dim_k = qweight.data.shape[0]
+        #             pad_group=2              
+                    
+        #             _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) 
+                    
+        #             sz = ops.sz_permute(_sz).reshape(-1,dim_n)       
+                    
+        #             zeros_and_scalse.data.copy_(sz)
+        #             qweight.data.copy_(_qw)
+                    
+        #             #reshape
+        #             zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1)    #[k/greop_size,n]------>[n,k/group_size]
+        #             qweight.data=qweight.data.reshape(dim_n,-1)                      #[k,n/8]---->[n,k/8]  
+                
+        #             if dim_k % 4096==0 and self.use_awq_pad:
+        #                 zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
+        #                 zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
+        #                 qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
+        #                 qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()  
        return loaded_params



--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -911,34 +911,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
                weight = params_dict[layername]
                
                matches = re.findall(combined_words, layername)
-                if matches:
-                    qweight =params_dict[layername]
-                    qzeros=params_dict[layername.replace("qweight", "qzeros")]
-                    scales=params_dict[layername.replace("qweight", "scales")]
-                    zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
-                    
-                    group_size= self.quant_config.group_size 
-                   
-                    dim_n = scales.data.shape[1]
-                    dim_k = qweight.data.shape[0]
-                    pad_group=2              
-                    
-                    _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) 
-                    
-                    sz = ops.sz_permute(_sz).reshape(-1,dim_n)       
-                    
-                    zeros_and_scalse.data.copy_(sz)
-                    qweight.data.copy_(_qw)
-                    
-                    #reshape
-                    zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1)    #[k/greop_size,n]------>[n,k/group_size]
-                    qweight.data=qweight.data.reshape(dim_n,-1)                      #[k,n/8]---->[n,k/8]  

-                    if dim_k % 4096==0 and self.use_awq_pad:
-                        zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
-                        zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
-                        qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
-                        qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
                        
                if self.use_w4a16_moe_sz:
                    matches_moe = re.findall(moe_combined_words, layername)

--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -518,47 +518,47 @@ class LlamaModel(nn.Module):
            os.environ['LM_NN'] = '0'
            os.environ['LLAMA_NN'] = '0'
            
-        if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
-            lay_key_words = [
-                "self_attn.qkv_proj.qweight",
-                "self_attn.o_proj.qweight",
-                "mlp.gate_up_proj.qweight",
-                "mlp.down_proj.qweight"
-            ]
-            combined_words = "|".join(lay_key_words)
+        # if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
+        #     lay_key_words = [
+        #         "self_attn.qkv_proj.qweight",
+        #         "self_attn.o_proj.qweight",
+        #         "mlp.gate_up_proj.qweight",
+        #         "mlp.down_proj.qweight"
+        #     ]
+        #     combined_words = "|".join(lay_key_words)
            
-            for layername in loaded_params:
-                weight = params_dict[layername]
+        #     for layername in loaded_params:
+        #         weight = params_dict[layername]
                
-                matches = re.findall(combined_words, layername)
-                if matches:
-                    qweight =params_dict[layername]
-                    qzeros=params_dict[layername.replace("qweight", "qzeros")]
-                    scales=params_dict[layername.replace("qweight", "scales")]
-                    zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
+        #         matches = re.findall(combined_words, layername)
+        #         if matches:
+        #             qweight =params_dict[layername]
+        #             qzeros=params_dict[layername.replace("qweight", "qzeros")]
+        #             scales=params_dict[layername.replace("qweight", "scales")]
+        #             zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
                    
-                    group_size= self.quant_config.group_size 
+        #             group_size= self.quant_config.group_size 
                   
-                    dim_n = scales.data.shape[1]
-                    dim_k = qweight.data.shape[0]
-                    pad_group=2              
+        #             dim_n = scales.data.shape[1]
+        #             dim_k = qweight.data.shape[0]
+        #             pad_group=2              
                    
-                    _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) 
+        #             _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) 
                    
-                    sz = ops.sz_permute(_sz).reshape(-1,dim_n)       
+        #             sz = ops.sz_permute(_sz).reshape(-1,dim_n)       
                    
-                    zeros_and_scalse.data.copy_(sz)
-                    qweight.data.copy_(_qw)
+        #             zeros_and_scalse.data.copy_(sz)
+        #             qweight.data.copy_(_qw)
                    
-                    #reshape
-                    zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1)    #[k/greop_size,n]------>[n,k/group_size]
-                    qweight.data=qweight.data.reshape(dim_n,-1)                      #[k,n/8]---->[n,k/8]  
+        #             #reshape
+        #             zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1)    #[k/greop_size,n]------>[n,k/group_size]
+        #             qweight.data=qweight.data.reshape(dim_n,-1)                      #[k,n/8]---->[n,k/8]  
                
-                    if dim_k % 4096==0 and self.use_awq_pad:
-                        zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
-                        zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
-                        qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
-                        qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
+        #             if dim_k % 4096==0 and self.use_awq_pad:
+        #                 zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
+        #                 zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
+        #                 qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
+        #                 qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
            
        # This processing must not be applied when inference runs through the Triton path.
        if self.quant_method == "compressed_tensors":

--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -385,48 +385,48 @@ class QWenBaseModel(nn.Module):
                    
                    weight.data=weight.data.reshape(ori_shape[1],-1)
                    
-        if self.quant_method == "awq":
-            os.environ['LM_NN'] = '0'
-            lay_key_words = [
-                "attn.c_attn.qweight",
-                "attn.c_proj.qweight",
-                "mlp.gate_up_proj.qweight",
-                "mlp.c_proj.qweight"
-            ]
-            combined_words = "|".join(lay_key_words)
-            
-            for layername in loaded_params:
-                weight = params_dict[layername]
-                
-                matches = re.findall(combined_words, layername)
-                if matches:
-                    qweight =params_dict[layername]
-                    qzeros=params_dict[layername.replace("qweight", "qzeros")]
-                    scales=params_dict[layername.replace("qweight", "scales")]
-                    zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
-                    
-                    group_size= self.quant_config.group_size 
-                   
-                    dim_n = scales.data.shape[1]
-                    dim_k = qweight.data.shape[0]
-                    pad_group=2              
-                    
-                    _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) 
-                    
-                    sz = ops.sz_permute(_sz).reshape(-1,dim_n)       
-                    
-                    zeros_and_scalse.data.copy_(sz)
-                    qweight.data.copy_(_qw)
-                    
-                    #reshape
-                    zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1)    #[k/greop_size,n]------>[n,k/group_size]
-                    qweight.data=qweight.data.reshape(dim_n,-1)                      #[k,n/8]---->[n,k/8]  
-                
-                    if dim_k % 4096==0 and self.use_awq_pad:
-                        zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
-                        zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
-                        qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
-                        qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
+        # if self.quant_method == "awq":
+        #     os.environ['LM_NN'] = '0'
+        #     lay_key_words = [
+        #         "attn.c_attn.qweight",
+        #         "attn.c_proj.qweight",
+        #         "mlp.gate_up_proj.qweight",
+        #         "mlp.c_proj.qweight"
+        #     ]
+        #     combined_words = "|".join(lay_key_words)
+            
+        #     for layername in loaded_params:
+        #         weight = params_dict[layername]
+                
+        #         matches = re.findall(combined_words, layername)
+        #         if matches:
+        #             qweight =params_dict[layername]
+        #             qzeros=params_dict[layername.replace("qweight", "qzeros")]
+        #             scales=params_dict[layername.replace("qweight", "scales")]
+        #             zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
+                    
+        #             group_size= self.quant_config.group_size 
+                   
+        #             dim_n = scales.data.shape[1]
+        #             dim_k = qweight.data.shape[0]
+        #             pad_group=2              
+                    
+        #             _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) 
+                    
+        #             sz = ops.sz_permute(_sz).reshape(-1,dim_n)       
+                    
+        #             zeros_and_scalse.data.copy_(sz)
+        #             qweight.data.copy_(_qw)
+                    
+        #             #reshape
+        #             zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1)    #[k/greop_size,n]------>[n,k/group_size]
+        #             qweight.data=qweight.data.reshape(dim_n,-1)                      #[k,n/8]---->[n,k/8]  
+                
+        #             if dim_k % 4096==0 and self.use_awq_pad:
+        #                 zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
+        #                 zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
+        #                 qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
+        #                 qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
                       
        if self.quant_method == "compressed_tensors":
            os.environ['LM_NN'] = '0'

--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -486,47 +486,47 @@ class Qwen2Model(nn.Module):
            os.environ['LM_NN'] = '0'
            os.environ['LLAMA_NN'] = '0'
            
-        if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
-            lay_key_words = [
-                "self_attn.qkv_proj.qweight",
-                "self_attn.o_proj.qweight",
-                "mlp.gate_up_proj.qweight",
-                "mlp.down_proj.qweight"
-            ]
-            combined_words = "|".join(lay_key_words)
+        # if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
+        #     lay_key_words = [
+        #         "self_attn.qkv_proj.qweight",
+        #         "self_attn.o_proj.qweight",
+        #         "mlp.gate_up_proj.qweight",
+        #         "mlp.down_proj.qweight"
+        #     ]
+        #     combined_words = "|".join(lay_key_words)
            
-            for layername in loaded_params:
-                weight = params_dict[layername]
+        #     for layername in loaded_params:
+        #         weight = params_dict[layername]
                
-                matches = re.findall(combined_words, layername)
-                if matches:
-                    qweight =params_dict[layername]
-                    qzeros=params_dict[layername.replace("qweight", "qzeros")]
-                    scales=params_dict[layername.replace("qweight", "scales")]
-                    zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
+        #         matches = re.findall(combined_words, layername)
+        #         if matches:
+        #             qweight =params_dict[layername]
+        #             qzeros=params_dict[layername.replace("qweight", "qzeros")]
+        #             scales=params_dict[layername.replace("qweight", "scales")]
+        #             zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
                    
-                    group_size= self.quant_config.group_size 
+        #             group_size= self.quant_config.group_size 
                   
-                    dim_n = scales.data.shape[1]
-                    dim_k = qweight.data.shape[0]
-                    pad_group=2              
+        #             dim_n = scales.data.shape[1]
+        #             dim_k = qweight.data.shape[0]
+        #             pad_group=2              
                    
-                    _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) 
+        #             _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) 
                    
-                    sz = ops.sz_permute(_sz).reshape(-1,dim_n)       
+        #             sz = ops.sz_permute(_sz).reshape(-1,dim_n)       
                    
-                    zeros_and_scalse.data.copy_(sz)
-                    qweight.data.copy_(_qw)
+        #             zeros_and_scalse.data.copy_(sz)
+        #             qweight.data.copy_(_qw)
                    
-                    #reshape
-                    zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1)    #[k/greop_size,n]------>[n,k/group_size]
-                    qweight.data=qweight.data.reshape(dim_n,-1)                      #[k,n/8]---->[n,k/8]  
+        #             #reshape
+        #             zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1)    #[k/greop_size,n]------>[n,k/group_size]
+        #             qweight.data=qweight.data.reshape(dim_n,-1)                      #[k,n/8]---->[n,k/8]  
                
-                    if dim_k % 4096==0 and self.use_awq_pad:
-                        zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
-                        zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
-                        qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
-                        qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
+        #             if dim_k % 4096==0 and self.use_awq_pad:
+        #                 zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
+        #                 zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
+        #                 qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
+        #                 qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
                   
        if self.quant_method == "compressed_tensors":
            lay_key_words = [