Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2cfcb974
Commit
2cfcb974
authored
Sep 11, 2025
by
zhuwenwen
Browse files
新增dpsk-v3.1-awq的支持
parent
a02a1c83
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
24 additions
and
10 deletions
+24
-10
vllm/model_executor/layers/quantization/awq.py
vllm/model_executor/layers/quantization/awq.py
+2
-2
vllm/model_executor/layers/quantization/awq_marlin.py
vllm/model_executor/layers/quantization/awq_marlin.py
+5
-2
vllm/model_executor/layers/quantization/moe_wna16.py
vllm/model_executor/layers/quantization/moe_wna16.py
+10
-4
vllm/model_executor/layers/quantization/utils/marlin_utils.py
.../model_executor/layers/quantization/utils/marlin_utils.py
+7
-2
No files found.
vllm/model_executor/layers/quantization/awq.py
View file @
2cfcb974
...
@@ -130,7 +130,7 @@ class AWQConfig(QuantizationConfig):
...
@@ -130,7 +130,7 @@ class AWQConfig(QuantizationConfig):
return
"awq"
return
"awq"
def
get_supported_act_dtypes
(
self
)
->
list
[
torch
.
dtype
]:
def
get_supported_act_dtypes
(
self
)
->
list
[
torch
.
dtype
]:
return
[
torch
.
half
]
return
[
torch
.
half
,
torch
.
bfloat16
]
@
classmethod
@
classmethod
def
get_min_capability
(
cls
)
->
int
:
def
get_min_capability
(
cls
)
->
int
:
...
@@ -293,7 +293,7 @@ class AWQLinearMethod(LinearMethodBase):
...
@@ -293,7 +293,7 @@ class AWQLinearMethod(LinearMethodBase):
pad_group
=
2
pad_group
=
2
dim_n
=
layer
.
scales
.
data
.
shape
[
1
]
dim_n
=
layer
.
scales
.
data
.
shape
[
1
]
dim_k
=
layer
.
qweight
.
data
.
shape
[
0
]
dim_k
=
layer
.
qweight
.
data
.
shape
[
0
]
_qw
,
_sz
=
ops
.
convert_s4
(
layer
.
qweight
,
layer
.
qzeros
,
layer
.
scales
,
int
(
group_size
))
_qw
,
_sz
=
ops
.
convert_s4
(
layer
.
qweight
,
layer
.
qzeros
,
layer
.
scales
.
to
(
torch
.
float16
)
,
int
(
group_size
))
sz
=
ops
.
sz_permute
(
_sz
).
reshape
(
-
1
,
dim_n
)
sz
=
ops
.
sz_permute
(
_sz
).
reshape
(
-
1
,
dim_n
)
sz
=
sz
.
reshape
(
dim_n
,
-
1
)
sz
=
sz
.
reshape
(
dim_n
,
-
1
)
_qw
=
_qw
.
reshape
(
dim_n
,
-
1
)
_qw
=
_qw
.
reshape
(
dim_n
,
-
1
)
...
...
vllm/model_executor/layers/quantization/awq_marlin.py
View file @
2cfcb974
...
@@ -141,6 +141,9 @@ class AWQMarlinConfig(QuantizationConfig):
...
@@ -141,6 +141,9 @@ class AWQMarlinConfig(QuantizationConfig):
self
.
full_config
).
get_quant_method
(
layer
,
prefix
)
self
.
full_config
).
get_quant_method
(
layer
,
prefix
)
return
AWQMarlinLinearMethod
(
self
)
return
AWQMarlinLinearMethod
(
self
)
elif
isinstance
(
layer
,
FusedMoE
):
elif
isinstance
(
layer
,
FusedMoE
):
if
is_layer_skipped_awq
(
prefix
,
getattr
(
self
,
"modules_to_not_convert"
,
[])):
return
UnquantizedFusedMoEMethod
(
layer
.
moe_config
)
from
vllm.model_executor.layers.quantization.moe_wna16
import
(
from
vllm.model_executor.layers.quantization.moe_wna16
import
(
MoeWNA16Config
)
MoeWNA16Config
)
if
is_layer_skipped_awq
(
if
is_layer_skipped_awq
(
...
@@ -448,7 +451,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
...
@@ -448,7 +451,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
# Why does this take the intermediate size for size_k?
# Why does this take the intermediate size for size_k?
marlin_w13_scales
=
marlin_moe_permute_scales
(
marlin_w13_scales
=
marlin_moe_permute_scales
(
s
=
layer
.
w13_scales
,
s
=
layer
.
w13_scales
.
to
(
torch
.
float16
)
,
size_k
=
layer
.
intermediate_size_per_partition
,
size_k
=
layer
.
intermediate_size_per_partition
,
size_n
=
layer
.
w13_scales
.
shape
[
2
],
size_n
=
layer
.
w13_scales
.
shape
[
2
],
group_size
=
self
.
quant_config
.
group_size
,
group_size
=
self
.
quant_config
.
group_size
,
...
@@ -457,7 +460,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
...
@@ -457,7 +460,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
#replace_parameter(layer, "w13_scales", marlin_w13_scales)
#replace_parameter(layer, "w13_scales", marlin_w13_scales)
marlin_w2_scales
=
marlin_moe_permute_scales
(
marlin_w2_scales
=
marlin_moe_permute_scales
(
s
=
layer
.
w2_scales
,
s
=
layer
.
w2_scales
.
to
(
torch
.
float16
)
,
size_k
=
layer
.
intermediate_size_per_partition
,
size_k
=
layer
.
intermediate_size_per_partition
,
size_n
=
layer
.
w2_scales
.
shape
[
2
],
size_n
=
layer
.
w2_scales
.
shape
[
2
],
group_size
=
self
.
quant_config
.
group_size
,
group_size
=
self
.
quant_config
.
group_size
,
...
...
vllm/model_executor/layers/quantization/moe_wna16.py
View file @
2cfcb974
...
@@ -7,7 +7,8 @@ import torch
...
@@ -7,7 +7,8 @@ import torch
import
os
import
os
from
vllm.distributed
import
get_tensor_model_parallel_rank
,
get_tp_group
from
vllm.distributed
import
get_tensor_model_parallel_rank
,
get_tp_group
from
vllm.model_executor.layers.fused_moe.layer
import
(
from
vllm.model_executor.layers.fused_moe.layer
import
(
FusedMoE
,
FusedMoEConfig
,
FusedMoEMethodBase
,
FusedMoeWeightScaleSupported
)
FusedMoE
,
FusedMoEConfig
,
FusedMoEMethodBase
,
FusedMoeWeightScaleSupported
,
UnquantizedFusedMoEMethod
)
from
vllm.model_executor.layers.linear
import
(
LinearBase
,
from
vllm.model_executor.layers.linear
import
(
LinearBase
,
UnquantizedLinearMethod
)
UnquantizedLinearMethod
)
from
vllm.model_executor.layers.quantization
import
QuantizationMethods
from
vllm.model_executor.layers.quantization
import
QuantizationMethods
...
@@ -19,6 +20,8 @@ from vllm.model_executor.utils import set_weight_attrs
...
@@ -19,6 +20,8 @@ from vllm.model_executor.utils import set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
vllm.model_executor.layers.quantization.awq
import
(
is_layer_skipped_awq
)
from
lmslim.layers.fused_moe.fuse_moe_int4
import
fused_experts_w4a16
from
lmslim.layers.fused_moe.fuse_moe_int4
import
fused_experts_w4a16
os
.
environ
[
'W4A16_MOE_CUDA'
]
=
os
.
environ
.
get
(
'W4A16_MOE_CUDA'
,
'0'
)
os
.
environ
[
'W4A16_MOE_CUDA'
]
=
os
.
environ
.
get
(
'W4A16_MOE_CUDA'
,
'0'
)
...
@@ -139,9 +142,9 @@ class MoeWNA16Config(QuantizationConfig):
...
@@ -139,9 +142,9 @@ class MoeWNA16Config(QuantizationConfig):
def
get_quant_method
(
self
,
layer
:
torch
.
nn
.
Module
,
def
get_quant_method
(
self
,
layer
:
torch
.
nn
.
Module
,
prefix
:
str
)
->
Optional
[
"QuantizeMethodBase"
]:
prefix
:
str
)
->
Optional
[
"QuantizeMethodBase"
]:
if
isinstance
(
layer
,
LinearBase
):
if
is_layer_skipped_quant
(
prefix
,
self
.
modules_to_not_convert
):
if
is_layer_skipped_quant
(
prefix
,
self
.
modules_to_not_convert
):
return
UnquantizedLinearMethod
()
return
UnquantizedLinearMethod
()
elif
isinstance
(
layer
,
LinearBase
):
# Avoid circular import
# Avoid circular import
from
vllm.model_executor.layers.quantization.awq
import
AWQConfig
from
vllm.model_executor.layers.quantization.awq
import
AWQConfig
from
vllm.model_executor.layers.quantization.awq_marlin
import
(
from
vllm.model_executor.layers.quantization.awq_marlin
import
(
...
@@ -167,6 +170,9 @@ class MoeWNA16Config(QuantizationConfig):
...
@@ -167,6 +170,9 @@ class MoeWNA16Config(QuantizationConfig):
else
:
else
:
raise
ValueError
(
"moe_wna16 only support gptq and awq."
)
raise
ValueError
(
"moe_wna16 only support gptq and awq."
)
elif
isinstance
(
layer
,
FusedMoE
):
elif
isinstance
(
layer
,
FusedMoE
):
if
is_layer_skipped_awq
(
prefix
,
getattr
(
self
,
"modules_to_not_convert"
,
[])):
return
UnquantizedFusedMoEMethod
(
layer
.
moe_config
)
return
MoeWNA16Method
(
self
,
layer
.
moe_config
)
return
MoeWNA16Method
(
self
,
layer
.
moe_config
)
return
None
return
None
...
...
vllm/model_executor/layers/quantization/utils/marlin_utils.py
View file @
2cfcb974
...
@@ -177,14 +177,19 @@ def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \
...
@@ -177,14 +177,19 @@ def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \
# moe marlin requires the activation to be silu
# moe marlin requires the activation to be silu
supports_activation
=
layer
.
activation
==
"silu"
supports_activation
=
layer
.
activation
==
"silu"
#暂时只支持bw
device_name
=
torch
.
cuda
.
get_device_properties
(
torch
.
cuda
.
current_device
()).
name
supports_device
=
"BW"
in
device_name
# gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
# gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
# down: (n, k) = (hidden_size, intermediate_size_per_partition)
# down: (n, k) = (hidden_size, intermediate_size_per_partition)
# moe marlin requires n % 128 == 0 and k % 64 == 0
# moe marlin requires n % 128 == 0 and k % 64 == 0
supports_shape
=
hidden_size
%
128
==
0
and
\
supports_shape
=
hidden_size
%
128
==
0
and
\
intermediate_size_per_partition
%
max
(
64
,
group_size
)
==
0
intermediate_size_per_partition
%
max
(
64
,
group_size
)
==
0
supports_group_size
=
group_size
in
[
-
1
,
32
,
64
,
128
]
#暂时只支持64
supports_group_size
=
group_size
in
[
64
]
return
supports_shape
and
supports_group_size
and
\
return
supports_shape
and
supports_group_size
and
\
supports_router_weight
and
supports_activation
supports_router_weight
and
supports_activation
and
supports_device
def
marlin_make_workspace
(
output_size_per_partition
:
int
,
def
marlin_make_workspace
(
output_size_per_partition
:
int
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment