Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5fa14eef
Commit
5fa14eef
authored
May 22, 2025
by
zhuwenwen
Browse files
Merge remote-tracking branch 'origin/v0.8.5.post1-dev_yql' into v0.8.5.post1-dev
parents
3b5d646e
a94ed3ea
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
159 additions
and
165 deletions
+159
-165
vllm/model_executor/layers/quantization/awq.py
vllm/model_executor/layers/quantization/awq.py
+29
-8
vllm/model_executor/models/baichuan.py
vllm/model_executor/models/baichuan.py
+32
-32
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+1
-28
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+32
-32
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+33
-33
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+32
-32
No files found.
vllm/model_executor/layers/quantization/awq.py
View file @
5fa14eef
...
...
@@ -237,14 +237,35 @@ class AWQLinearMethod(LinearMethodBase):
default_execution
(
input_size_per_partition
,
output_size_per_partition
)
def
process_weights_after_loading
(
self
,
layer
:
torch
.
nn
.
Module
)
->
None
:
layer
.
qweight
=
torch
.
nn
.
Parameter
(
layer
.
qweight
.
data
,
requires_grad
=
False
)
layer
.
qzeros
=
torch
.
nn
.
Parameter
(
layer
.
qzeros
.
data
,
requires_grad
=
False
)
layer
.
scales
=
torch
.
nn
.
Parameter
(
layer
.
scales
.
data
,
requires_grad
=
False
)
layer
.
zeros_and_scales
=
torch
.
nn
.
Parameter
(
layer
.
zeros_and_scales
.
data
,
requires_grad
=
False
)
if
not
envs
.
VLLM_USE_TRITON_AWQ
:
group_size
=
self
.
quant_config
.
group_size
pad_group
=
2
dim_n
=
layer
.
scales
.
data
.
shape
[
1
]
dim_k
=
layer
.
qweight
.
data
.
shape
[
0
]
_qw
,
_sz
=
ops
.
convert_s4
(
layer
.
qweight
,
layer
.
qzeros
,
layer
.
scales
,
int
(
group_size
))
sz
=
ops
.
sz_permute
(
_sz
).
reshape
(
-
1
,
dim_n
)
sz
=
sz
.
reshape
(
dim_n
,
-
1
)
_qw
=
_qw
.
reshape
(
dim_n
,
-
1
)
if
dim_k
%
4096
==
0
and
self
.
use_awq_pad
:
zeros_and_scalse_pad
=
torch
.
zeros
(
dim_n
,
pad_group
,
dtype
=
torch
.
int32
).
cuda
()
sz
=
torch
.
cat
((
sz
,
zeros_and_scalse_pad
),
dim
=
1
).
contiguous
()
qweight_pad
=
torch
.
zeros
(
dim_n
,
int
(
group_size
//
4
),
dtype
=
torch
.
int32
).
cuda
()
_qw
=
torch
.
cat
((
_qw
,
qweight_pad
),
dim
=
1
).
contiguous
()
layer
.
qweight
=
torch
.
nn
.
Parameter
(
_qw
,
requires_grad
=
False
)
layer
.
zeros_and_scales
=
torch
.
nn
.
Parameter
(
sz
,
requires_grad
=
False
)
layer
.
qzeros
=
None
layer
.
scales
=
None
else
:
layer
.
qweight
=
torch
.
nn
.
Parameter
(
layer
.
qweight
.
data
,
requires_grad
=
False
)
layer
.
qzeros
=
torch
.
nn
.
Parameter
(
layer
.
qzeros
.
data
,
requires_grad
=
False
)
layer
.
scales
=
torch
.
nn
.
Parameter
(
layer
.
scales
.
data
,
requires_grad
=
False
)
def
apply
(
self
,
layer
:
torch
.
nn
.
Module
,
...
...
vllm/model_executor/models/baichuan.py
View file @
5fa14eef
...
...
@@ -424,47 +424,47 @@ class BaiChuanModel(nn.Module):
os
.
environ
[
'LM_NN'
]
=
'0'
os
.
environ
[
'LLAMA_NN'
]
=
'0'
if
self
.
quant_method
==
"awq"
and
not
envs
.
VLLM_USE_TRITON_AWQ
:
lay_key_words
=
[
"self_attn.W_pack.qweight"
,
"self_attn.o_proj.qweight"
,
"mlp.gate_up_proj.qweight"
,
"mlp.down_proj.qweight"
]
combined_words
=
"|"
.
join
(
lay_key_words
)
#
if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
#
lay_key_words = [
#
"self_attn.W_pack.qweight",
#
"self_attn.o_proj.qweight",
#
"mlp.gate_up_proj.qweight",
#
"mlp.down_proj.qweight"
#
]
#
combined_words = "|".join(lay_key_words)
for
layername
in
loaded_params
:
weight
=
params_dict
[
layername
]
#
for layername in loaded_params:
#
weight = params_dict[layername]
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
qweight
=
params_dict
[
layername
]
qzeros
=
params_dict
[
layername
.
replace
(
"qweight"
,
"qzeros"
)]
scales
=
params_dict
[
layername
.
replace
(
"qweight"
,
"scales"
)]
zeros_and_scalse
=
params_dict
[
layername
.
replace
(
"qweight"
,
"zeros_and_scales"
)]
#
matches = re.findall(combined_words, layername)
#
if matches:
#
qweight =params_dict[layername]
#
qzeros=params_dict[layername.replace("qweight", "qzeros")]
#
scales=params_dict[layername.replace("qweight", "scales")]
#
zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
group_size
=
self
.
quant_config
.
group_size
#
group_size= self.quant_config.group_size
dim_n
=
scales
.
data
.
shape
[
1
]
dim_k
=
qweight
.
data
.
shape
[
0
]
pad_group
=
2
#
dim_n = scales.data.shape[1]
#
dim_k = qweight.data.shape[0]
#
pad_group=2
_qw
,
_sz
=
ops
.
convert_s4
(
qweight
,
qzeros
,
scales
,
int
(
group_size
))
#
_qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size))
sz
=
ops
.
sz_permute
(
_sz
).
reshape
(
-
1
,
dim_n
)
#
sz = ops.sz_permute(_sz).reshape(-1,dim_n)
zeros_and_scalse
.
data
.
copy_
(
sz
)
qweight
.
data
.
copy_
(
_qw
)
#
zeros_and_scalse.data.copy_(sz)
#
qweight.data.copy_(_qw)
#reshape
zeros_and_scalse
.
data
=
zeros_and_scalse
.
reshape
(
dim_n
,
-
1
)
#[k/greop_size,n]------>[n,k/group_size]
qweight
.
data
=
qweight
.
data
.
reshape
(
dim_n
,
-
1
)
#[k,n/8]---->[n,k/8]
#
#reshape
#
zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size]
#
qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8]
if
dim_k
%
4096
==
0
and
self
.
use_awq_pad
:
zeros_and_scalse_pad
=
torch
.
zeros
(
dim_n
,
pad_group
,
dtype
=
torch
.
int32
).
cuda
()
zeros_and_scalse
.
data
=
torch
.
cat
((
zeros_and_scalse
.
data
,
zeros_and_scalse_pad
),
dim
=
1
).
contiguous
()
qweight_pad
=
torch
.
zeros
(
dim_n
,
int
(
group_size
//
4
),
dtype
=
torch
.
int32
).
cuda
()
qweight
.
data
=
torch
.
cat
((
qweight
.
data
,
qweight_pad
),
dim
=
1
).
contiguous
()
#
if dim_k % 4096==0 and self.use_awq_pad:
#
zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
#
zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
#
qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
#
qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
return
loaded_params
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
5fa14eef
...
...
@@ -911,34 +911,7 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP):
weight
=
params_dict
[
layername
]
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
qweight
=
params_dict
[
layername
]
qzeros
=
params_dict
[
layername
.
replace
(
"qweight"
,
"qzeros"
)]
scales
=
params_dict
[
layername
.
replace
(
"qweight"
,
"scales"
)]
zeros_and_scalse
=
params_dict
[
layername
.
replace
(
"qweight"
,
"zeros_and_scales"
)]
group_size
=
self
.
quant_config
.
group_size
dim_n
=
scales
.
data
.
shape
[
1
]
dim_k
=
qweight
.
data
.
shape
[
0
]
pad_group
=
2
_qw
,
_sz
=
ops
.
convert_s4
(
qweight
,
qzeros
,
scales
,
int
(
group_size
))
sz
=
ops
.
sz_permute
(
_sz
).
reshape
(
-
1
,
dim_n
)
zeros_and_scalse
.
data
.
copy_
(
sz
)
qweight
.
data
.
copy_
(
_qw
)
#reshape
zeros_and_scalse
.
data
=
zeros_and_scalse
.
reshape
(
dim_n
,
-
1
)
#[k/greop_size,n]------>[n,k/group_size]
qweight
.
data
=
qweight
.
data
.
reshape
(
dim_n
,
-
1
)
#[k,n/8]---->[n,k/8]
if
dim_k
%
4096
==
0
and
self
.
use_awq_pad
:
zeros_and_scalse_pad
=
torch
.
zeros
(
dim_n
,
pad_group
,
dtype
=
torch
.
int32
).
cuda
()
zeros_and_scalse
.
data
=
torch
.
cat
((
zeros_and_scalse
.
data
,
zeros_and_scalse_pad
),
dim
=
1
).
contiguous
()
qweight_pad
=
torch
.
zeros
(
dim_n
,
int
(
group_size
//
4
),
dtype
=
torch
.
int32
).
cuda
()
qweight
.
data
=
torch
.
cat
((
qweight
.
data
,
qweight_pad
),
dim
=
1
).
contiguous
()
if
self
.
use_w4a16_moe_sz
:
matches_moe
=
re
.
findall
(
moe_combined_words
,
layername
)
...
...
vllm/model_executor/models/llama.py
View file @
5fa14eef
...
...
@@ -518,47 +518,47 @@ class LlamaModel(nn.Module):
os
.
environ
[
'LM_NN'
]
=
'0'
os
.
environ
[
'LLAMA_NN'
]
=
'0'
if
self
.
quant_method
==
"awq"
and
not
envs
.
VLLM_USE_TRITON_AWQ
:
lay_key_words
=
[
"self_attn.qkv_proj.qweight"
,
"self_attn.o_proj.qweight"
,
"mlp.gate_up_proj.qweight"
,
"mlp.down_proj.qweight"
]
combined_words
=
"|"
.
join
(
lay_key_words
)
#
if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
#
lay_key_words = [
#
"self_attn.qkv_proj.qweight",
#
"self_attn.o_proj.qweight",
#
"mlp.gate_up_proj.qweight",
#
"mlp.down_proj.qweight"
#
]
#
combined_words = "|".join(lay_key_words)
for
layername
in
loaded_params
:
weight
=
params_dict
[
layername
]
#
for layername in loaded_params:
#
weight = params_dict[layername]
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
qweight
=
params_dict
[
layername
]
qzeros
=
params_dict
[
layername
.
replace
(
"qweight"
,
"qzeros"
)]
scales
=
params_dict
[
layername
.
replace
(
"qweight"
,
"scales"
)]
zeros_and_scalse
=
params_dict
[
layername
.
replace
(
"qweight"
,
"zeros_and_scales"
)]
#
matches = re.findall(combined_words, layername)
#
if matches:
#
qweight =params_dict[layername]
#
qzeros=params_dict[layername.replace("qweight", "qzeros")]
#
scales=params_dict[layername.replace("qweight", "scales")]
#
zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
group_size
=
self
.
quant_config
.
group_size
#
group_size= self.quant_config.group_size
dim_n
=
scales
.
data
.
shape
[
1
]
dim_k
=
qweight
.
data
.
shape
[
0
]
pad_group
=
2
#
dim_n = scales.data.shape[1]
#
dim_k = qweight.data.shape[0]
#
pad_group=2
_qw
,
_sz
=
ops
.
convert_s4
(
qweight
,
qzeros
,
scales
,
int
(
group_size
))
#
_qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size))
sz
=
ops
.
sz_permute
(
_sz
).
reshape
(
-
1
,
dim_n
)
#
sz = ops.sz_permute(_sz).reshape(-1,dim_n)
zeros_and_scalse
.
data
.
copy_
(
sz
)
qweight
.
data
.
copy_
(
_qw
)
#
zeros_and_scalse.data.copy_(sz)
#
qweight.data.copy_(_qw)
#reshape
zeros_and_scalse
.
data
=
zeros_and_scalse
.
reshape
(
dim_n
,
-
1
)
#[k/greop_size,n]------>[n,k/group_size]
qweight
.
data
=
qweight
.
data
.
reshape
(
dim_n
,
-
1
)
#[k,n/8]---->[n,k/8]
#
#reshape
#
zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size]
#
qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8]
if
dim_k
%
4096
==
0
and
self
.
use_awq_pad
:
zeros_and_scalse_pad
=
torch
.
zeros
(
dim_n
,
pad_group
,
dtype
=
torch
.
int32
).
cuda
()
zeros_and_scalse
.
data
=
torch
.
cat
((
zeros_and_scalse
.
data
,
zeros_and_scalse_pad
),
dim
=
1
).
contiguous
()
qweight_pad
=
torch
.
zeros
(
dim_n
,
int
(
group_size
//
4
),
dtype
=
torch
.
int32
).
cuda
()
qweight
.
data
=
torch
.
cat
((
qweight
.
data
,
qweight_pad
),
dim
=
1
).
contiguous
()
#
if dim_k % 4096==0 and self.use_awq_pad:
#
zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
#
zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
#
qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
#
qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
#当为triton支持推理的时候不能进行处理
if
self
.
quant_method
==
"compressed_tensors"
:
...
...
vllm/model_executor/models/qwen.py
View file @
5fa14eef
...
...
@@ -385,48 +385,48 @@ class QWenBaseModel(nn.Module):
weight
.
data
=
weight
.
data
.
reshape
(
ori_shape
[
1
],
-
1
)
if
self
.
quant_method
==
"awq"
:
os
.
environ
[
'LM_NN'
]
=
'0'
lay_key_words
=
[
"attn.c_attn.qweight"
,
"attn.c_proj.qweight"
,
"mlp.gate_up_proj.qweight"
,
"mlp.c_proj.qweight"
]
combined_words
=
"|"
.
join
(
lay_key_words
)
#
if self.quant_method == "awq":
#
os.environ['LM_NN'] = '0'
#
lay_key_words = [
#
"attn.c_attn.qweight",
#
"attn.c_proj.qweight",
#
"mlp.gate_up_proj.qweight",
#
"mlp.c_proj.qweight"
#
]
#
combined_words = "|".join(lay_key_words)
for
layername
in
loaded_params
:
weight
=
params_dict
[
layername
]
#
for layername in loaded_params:
#
weight = params_dict[layername]
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
qweight
=
params_dict
[
layername
]
qzeros
=
params_dict
[
layername
.
replace
(
"qweight"
,
"qzeros"
)]
scales
=
params_dict
[
layername
.
replace
(
"qweight"
,
"scales"
)]
zeros_and_scalse
=
params_dict
[
layername
.
replace
(
"qweight"
,
"zeros_and_scales"
)]
#
matches = re.findall(combined_words, layername)
#
if matches:
#
qweight =params_dict[layername]
#
qzeros=params_dict[layername.replace("qweight", "qzeros")]
#
scales=params_dict[layername.replace("qweight", "scales")]
#
zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
group_size
=
self
.
quant_config
.
group_size
#
group_size= self.quant_config.group_size
dim_n
=
scales
.
data
.
shape
[
1
]
dim_k
=
qweight
.
data
.
shape
[
0
]
pad_group
=
2
#
dim_n = scales.data.shape[1]
#
dim_k = qweight.data.shape[0]
#
pad_group=2
_qw
,
_sz
=
ops
.
convert_s4
(
qweight
,
qzeros
,
scales
,
int
(
group_size
))
#
_qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size))
sz
=
ops
.
sz_permute
(
_sz
).
reshape
(
-
1
,
dim_n
)
#
sz = ops.sz_permute(_sz).reshape(-1,dim_n)
zeros_and_scalse
.
data
.
copy_
(
sz
)
qweight
.
data
.
copy_
(
_qw
)
#
zeros_and_scalse.data.copy_(sz)
#
qweight.data.copy_(_qw)
#reshape
zeros_and_scalse
.
data
=
zeros_and_scalse
.
reshape
(
dim_n
,
-
1
)
#[k/greop_size,n]------>[n,k/group_size]
qweight
.
data
=
qweight
.
data
.
reshape
(
dim_n
,
-
1
)
#[k,n/8]---->[n,k/8]
#
#reshape
#
zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size]
#
qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8]
if
dim_k
%
4096
==
0
and
self
.
use_awq_pad
:
zeros_and_scalse_pad
=
torch
.
zeros
(
dim_n
,
pad_group
,
dtype
=
torch
.
int32
).
cuda
()
zeros_and_scalse
.
data
=
torch
.
cat
((
zeros_and_scalse
.
data
,
zeros_and_scalse_pad
),
dim
=
1
).
contiguous
()
qweight_pad
=
torch
.
zeros
(
dim_n
,
int
(
group_size
//
4
),
dtype
=
torch
.
int32
).
cuda
()
qweight
.
data
=
torch
.
cat
((
qweight
.
data
,
qweight_pad
),
dim
=
1
).
contiguous
()
#
if dim_k % 4096==0 and self.use_awq_pad:
#
zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
#
zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
#
qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
#
qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
if
self
.
quant_method
==
"compressed_tensors"
:
os
.
environ
[
'LM_NN'
]
=
'0'
...
...
vllm/model_executor/models/qwen2.py
View file @
5fa14eef
...
...
@@ -486,47 +486,47 @@ class Qwen2Model(nn.Module):
os
.
environ
[
'LM_NN'
]
=
'0'
os
.
environ
[
'LLAMA_NN'
]
=
'0'
if
self
.
quant_method
==
"awq"
and
not
envs
.
VLLM_USE_TRITON_AWQ
:
lay_key_words
=
[
"self_attn.qkv_proj.qweight"
,
"self_attn.o_proj.qweight"
,
"mlp.gate_up_proj.qweight"
,
"mlp.down_proj.qweight"
]
combined_words
=
"|"
.
join
(
lay_key_words
)
#
if self.quant_method == "awq" and not envs.VLLM_USE_TRITON_AWQ:
#
lay_key_words = [
#
"self_attn.qkv_proj.qweight",
#
"self_attn.o_proj.qweight",
#
"mlp.gate_up_proj.qweight",
#
"mlp.down_proj.qweight"
#
]
#
combined_words = "|".join(lay_key_words)
for
layername
in
loaded_params
:
weight
=
params_dict
[
layername
]
#
for layername in loaded_params:
#
weight = params_dict[layername]
matches
=
re
.
findall
(
combined_words
,
layername
)
if
matches
:
qweight
=
params_dict
[
layername
]
qzeros
=
params_dict
[
layername
.
replace
(
"qweight"
,
"qzeros"
)]
scales
=
params_dict
[
layername
.
replace
(
"qweight"
,
"scales"
)]
zeros_and_scalse
=
params_dict
[
layername
.
replace
(
"qweight"
,
"zeros_and_scales"
)]
#
matches = re.findall(combined_words, layername)
#
if matches:
#
qweight =params_dict[layername]
#
qzeros=params_dict[layername.replace("qweight", "qzeros")]
#
scales=params_dict[layername.replace("qweight", "scales")]
#
zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
group_size
=
self
.
quant_config
.
group_size
#
group_size= self.quant_config.group_size
dim_n
=
scales
.
data
.
shape
[
1
]
dim_k
=
qweight
.
data
.
shape
[
0
]
pad_group
=
2
#
dim_n = scales.data.shape[1]
#
dim_k = qweight.data.shape[0]
#
pad_group=2
_qw
,
_sz
=
ops
.
convert_s4
(
qweight
,
qzeros
,
scales
,
int
(
group_size
))
#
_qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size))
sz
=
ops
.
sz_permute
(
_sz
).
reshape
(
-
1
,
dim_n
)
#
sz = ops.sz_permute(_sz).reshape(-1,dim_n)
zeros_and_scalse
.
data
.
copy_
(
sz
)
qweight
.
data
.
copy_
(
_qw
)
#
zeros_and_scalse.data.copy_(sz)
#
qweight.data.copy_(_qw)
#reshape
zeros_and_scalse
.
data
=
zeros_and_scalse
.
reshape
(
dim_n
,
-
1
)
#[k/greop_size,n]------>[n,k/group_size]
qweight
.
data
=
qweight
.
data
.
reshape
(
dim_n
,
-
1
)
#[k,n/8]---->[n,k/8]
#
#reshape
#
zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1) #[k/greop_size,n]------>[n,k/group_size]
#
qweight.data=qweight.data.reshape(dim_n,-1) #[k,n/8]---->[n,k/8]
if
dim_k
%
4096
==
0
and
self
.
use_awq_pad
:
zeros_and_scalse_pad
=
torch
.
zeros
(
dim_n
,
pad_group
,
dtype
=
torch
.
int32
).
cuda
()
zeros_and_scalse
.
data
=
torch
.
cat
((
zeros_and_scalse
.
data
,
zeros_and_scalse_pad
),
dim
=
1
).
contiguous
()
qweight_pad
=
torch
.
zeros
(
dim_n
,
int
(
group_size
//
4
),
dtype
=
torch
.
int32
).
cuda
()
qweight
.
data
=
torch
.
cat
((
qweight
.
data
,
qweight_pad
),
dim
=
1
).
contiguous
()
#
if dim_k % 4096==0 and self.use_awq_pad:
#
zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
#
zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
#
qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
#
qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
if
self
.
quant_method
==
"compressed_tensors"
:
lay_key_words
=
[
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment