Commit 5fa14eef authored by zhuwenwen
Browse files

Merge remote-tracking branch 'origin/v0.8.5.post1-dev_yql' into v0.8.5.post1-dev

parents 3b5d646e a94ed3ea
...@@ -237,14 +237,35 @@ class AWQLinearMethod(LinearMethodBase): ...@@ -237,14 +237,35 @@ class AWQLinearMethod(LinearMethodBase):
default_execution(input_size_per_partition,output_size_per_partition) default_execution(input_size_per_partition,output_size_per_partition)
def process_weights_after_loading(self, layer: torch.nn.Module) -> None: def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
layer.qweight = torch.nn.Parameter(layer.qweight.data, if not envs.VLLM_USE_TRITON_AWQ:
requires_grad=False)
layer.qzeros = torch.nn.Parameter(layer.qzeros.data, group_size= self.quant_config.group_size
requires_grad=False) pad_group=2
layer.scales = torch.nn.Parameter(layer.scales.data, dim_n = layer.scales.data.shape[1]
requires_grad=False) dim_k = layer.qweight.data.shape[0]
layer.zeros_and_scales = torch.nn.Parameter(layer.zeros_and_scales.data, _qw, _sz=ops.convert_s4(layer.qweight,layer.qzeros,layer.scales,int(group_size))
requires_grad=False) sz = ops.sz_permute(_sz).reshape(-1,dim_n)
sz = sz.reshape(dim_n,-1)
_qw = _qw.reshape(dim_n,-1)
if dim_k % 4096==0 and self.use_awq_pad:
zeros_and_scalse_pad = torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
sz = torch.cat((sz,zeros_and_scalse_pad),dim=1).contiguous()
qweight_pad = torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
_qw=torch.cat((_qw,qweight_pad),dim=1).contiguous()
layer.qweight = torch.nn.Parameter(_qw, requires_grad=False)
layer.zeros_and_scales = torch.nn.Parameter(sz, requires_grad=False)
layer.qzeros = None
layer.scales = None
else:
layer.qweight = torch.nn.Parameter(layer.qweight.data,
requires_grad=False)
layer.qzeros = torch.nn.Parameter(layer.qzeros.data,
requires_grad=False)
layer.scales = torch.nn.Parameter(layer.scales.data,
requires_grad=False)
def apply(self, def apply(self,
layer: torch.nn.Module, layer: torch.nn.Module,
......
...@@ -424,47 +424,47 @@ class BaiChuanModel(nn.Module): ...@@ -424,47 +424,47 @@ class BaiChuanModel(nn.Module):
os.environ['LM_NN'] = '0' os.environ['LM_NN'] = '0'
os.environ['LLAMA_NN'] = '0' os.environ['LLAMA_NN'] = '0'
if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ: # if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
lay_key_words = [ # lay_key_words = [
"self_attn.W_pack.qweight", # "self_attn.W_pack.qweight",
"self_attn.o_proj.qweight", # "self_attn.o_proj.qweight",
"mlp.gate_up_proj.qweight", # "mlp.gate_up_proj.qweight",
"mlp.down_proj.qweight" # "mlp.down_proj.qweight"
] # ]
combined_words = "|".join(lay_key_words) # combined_words = "|".join(lay_key_words)
for layername in loaded_params: # for layername in loaded_params:
weight = params_dict[layername] # weight = params_dict[layername]
matches = re.findall(combined_words, layername) # matches = re.findall(combined_words, layername)
if matches: # if matches:
qweight =params_dict[layername] # qweight =params_dict[layername]
qzeros=params_dict[layername.replace("qweight", "qzeros")] # qzeros=params_dict[layername.replace("qweight", "qzeros")]
scales=params_dict[layername.replace("qweight", "scales")] # scales=params_dict[layername.replace("qweight", "scales")]
zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")] # zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
group_size= self.quant_config.group_size # group_size= self.quant_config.group_size
dim_n = scales.data.shape[1] # dim_n = scales.data.shape[1]
dim_k = qweight.data.shape[0] # dim_k = qweight.data.shape[0]
pad_group=2 # pad_group=2
_qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) # _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size))
sz = ops.sz_permute(_sz).reshape(-1,dim_n) # sz = ops.sz_permute(_sz).reshape(-1,dim_n)
zeros_and_scalse.data.copy_(sz) # zeros_and_scalse.data.copy_(sz)
qweight.data.copy_(_qw) # qweight.data.copy_(_qw)
#reshape # #reshape
zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size] # zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size]
qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8] # qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8]
if dim_k % 4096==0 and self.use_awq_pad: # if dim_k % 4096==0 and self.use_awq_pad:
zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda() # zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous() # zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda() # qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous() # qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
return loaded_params return loaded_params
......
...@@ -911,34 +911,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): ...@@ -911,34 +911,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
weight = params_dict[layername] weight = params_dict[layername]
matches = re.findall(combined_words, layername) matches = re.findall(combined_words, layername)
if matches:
qweight =params_dict[layername]
qzeros=params_dict[layername.replace("qweight", "qzeros")]
scales=params_dict[layername.replace("qweight", "scales")]
zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
group_size= self.quant_config.group_size
dim_n = scales.data.shape[1]
dim_k = qweight.data.shape[0]
pad_group=2
_qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size))
sz = ops.sz_permute(_sz).reshape(-1,dim_n)
zeros_and_scalse.data.copy_(sz)
qweight.data.copy_(_qw)
#reshape
zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size]
qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8]
if dim_k % 4096==0 and self.use_awq_pad:
zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
if self.use_w4a16_moe_sz: if self.use_w4a16_moe_sz:
matches_moe = re.findall(moe_combined_words, layername) matches_moe = re.findall(moe_combined_words, layername)
......
...@@ -518,47 +518,47 @@ class LlamaModel(nn.Module): ...@@ -518,47 +518,47 @@ class LlamaModel(nn.Module):
os.environ['LM_NN'] = '0' os.environ['LM_NN'] = '0'
os.environ['LLAMA_NN'] = '0' os.environ['LLAMA_NN'] = '0'
if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ: # if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
lay_key_words = [ # lay_key_words = [
"self_attn.qkv_proj.qweight", # "self_attn.qkv_proj.qweight",
"self_attn.o_proj.qweight", # "self_attn.o_proj.qweight",
"mlp.gate_up_proj.qweight", # "mlp.gate_up_proj.qweight",
"mlp.down_proj.qweight" # "mlp.down_proj.qweight"
] # ]
combined_words = "|".join(lay_key_words) # combined_words = "|".join(lay_key_words)
for layername in loaded_params: # for layername in loaded_params:
weight = params_dict[layername] # weight = params_dict[layername]
matches = re.findall(combined_words, layername) # matches = re.findall(combined_words, layername)
if matches: # if matches:
qweight =params_dict[layername] # qweight =params_dict[layername]
qzeros=params_dict[layername.replace("qweight", "qzeros")] # qzeros=params_dict[layername.replace("qweight", "qzeros")]
scales=params_dict[layername.replace("qweight", "scales")] # scales=params_dict[layername.replace("qweight", "scales")]
zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")] # zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
group_size= self.quant_config.group_size # group_size= self.quant_config.group_size
dim_n = scales.data.shape[1] # dim_n = scales.data.shape[1]
dim_k = qweight.data.shape[0] # dim_k = qweight.data.shape[0]
pad_group=2 # pad_group=2
_qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) # _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size))
sz = ops.sz_permute(_sz).reshape(-1,dim_n) # sz = ops.sz_permute(_sz).reshape(-1,dim_n)
zeros_and_scalse.data.copy_(sz) # zeros_and_scalse.data.copy_(sz)
qweight.data.copy_(_qw) # qweight.data.copy_(_qw)
#reshape # #reshape
zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size] # zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size]
qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8] # qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8]
if dim_k % 4096==0 and self.use_awq_pad: # if dim_k % 4096==0 and self.use_awq_pad:
zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda() # zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous() # zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda() # qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous() # qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
#当为triton支持推理的时候不能进行处理 #当为triton支持推理的时候不能进行处理
if self.quant_method == "compressed_tensors": if self.quant_method == "compressed_tensors":
......
...@@ -385,48 +385,48 @@ class QWenBaseModel(nn.Module): ...@@ -385,48 +385,48 @@ class QWenBaseModel(nn.Module):
weight.data=weight.data.reshape(ori_shape[1],-1) weight.data=weight.data.reshape(ori_shape[1],-1)
if self.quant_method == "awq": # if self.quant_method == "awq":
os.environ['LM_NN'] = '0' # os.environ['LM_NN'] = '0'
lay_key_words = [ # lay_key_words = [
"attn.c_attn.qweight", # "attn.c_attn.qweight",
"attn.c_proj.qweight", # "attn.c_proj.qweight",
"mlp.gate_up_proj.qweight", # "mlp.gate_up_proj.qweight",
"mlp.c_proj.qweight" # "mlp.c_proj.qweight"
] # ]
combined_words = "|".join(lay_key_words) # combined_words = "|".join(lay_key_words)
for layername in loaded_params: # for layername in loaded_params:
weight = params_dict[layername] # weight = params_dict[layername]
matches = re.findall(combined_words, layername) # matches = re.findall(combined_words, layername)
if matches: # if matches:
qweight =params_dict[layername] # qweight =params_dict[layername]
qzeros=params_dict[layername.replace("qweight", "qzeros")] # qzeros=params_dict[layername.replace("qweight", "qzeros")]
scales=params_dict[layername.replace("qweight", "scales")] # scales=params_dict[layername.replace("qweight", "scales")]
zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")] # zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
group_size= self.quant_config.group_size # group_size= self.quant_config.group_size
dim_n = scales.data.shape[1] # dim_n = scales.data.shape[1]
dim_k = qweight.data.shape[0] # dim_k = qweight.data.shape[0]
pad_group=2 # pad_group=2
_qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) # _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size))
sz = ops.sz_permute(_sz).reshape(-1,dim_n) # sz = ops.sz_permute(_sz).reshape(-1,dim_n)
zeros_and_scalse.data.copy_(sz) # zeros_and_scalse.data.copy_(sz)
qweight.data.copy_(_qw) # qweight.data.copy_(_qw)
#reshape # #reshape
zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size] # zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size]
qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8] # qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8]
if dim_k % 4096==0 and self.use_awq_pad: # if dim_k % 4096==0 and self.use_awq_pad:
zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda() # zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous() # zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda() # qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous() # qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
if self.quant_method == "compressed_tensors": if self.quant_method == "compressed_tensors":
os.environ['LM_NN'] = '0' os.environ['LM_NN'] = '0'
......
...@@ -486,47 +486,47 @@ class Qwen2Model(nn.Module): ...@@ -486,47 +486,47 @@ class Qwen2Model(nn.Module):
os.environ['LM_NN'] = '0' os.environ['LM_NN'] = '0'
os.environ['LLAMA_NN'] = '0' os.environ['LLAMA_NN'] = '0'
if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ: # if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
lay_key_words = [ # lay_key_words = [
"self_attn.qkv_proj.qweight", # "self_attn.qkv_proj.qweight",
"self_attn.o_proj.qweight", # "self_attn.o_proj.qweight",
"mlp.gate_up_proj.qweight", # "mlp.gate_up_proj.qweight",
"mlp.down_proj.qweight" # "mlp.down_proj.qweight"
] # ]
combined_words = "|".join(lay_key_words) # combined_words = "|".join(lay_key_words)
for layername in loaded_params: # for layername in loaded_params:
weight = params_dict[layername] # weight = params_dict[layername]
matches = re.findall(combined_words, layername) # matches = re.findall(combined_words, layername)
if matches: # if matches:
qweight =params_dict[layername] # qweight =params_dict[layername]
qzeros=params_dict[layername.replace("qweight", "qzeros")] # qzeros=params_dict[layername.replace("qweight", "qzeros")]
scales=params_dict[layername.replace("qweight", "scales")] # scales=params_dict[layername.replace("qweight", "scales")]
zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")] # zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
group_size= self.quant_config.group_size # group_size= self.quant_config.group_size
dim_n = scales.data.shape[1] # dim_n = scales.data.shape[1]
dim_k = qweight.data.shape[0] # dim_k = qweight.data.shape[0]
pad_group=2 # pad_group=2
_qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) # _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size))
sz = ops.sz_permute(_sz).reshape(-1,dim_n) # sz = ops.sz_permute(_sz).reshape(-1,dim_n)
zeros_and_scalse.data.copy_(sz) # zeros_and_scalse.data.copy_(sz)
qweight.data.copy_(_qw) # qweight.data.copy_(_qw)
#reshape # #reshape
zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size] # zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size]
qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8] # qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8]
if dim_k % 4096==0 and self.use_awq_pad: # if dim_k % 4096==0 and self.use_awq_pad:
zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda() # zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous() # zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda() # qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous() # qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
if self.quant_method == "compressed_tensors": if self.quant_method == "compressed_tensors":
lay_key_words = [ lay_key_words = [
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment