Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e5f51b79
Commit
e5f51b79
authored
Sep 11, 2025
by
yangql
Browse files
新增dpsk-v3.1-awq的支持
parent
5c288d91
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
26 additions
and
13 deletions
+26
-13
vllm/model_executor/layers/quantization/awq.py
vllm/model_executor/layers/quantization/awq.py
+2
-2
vllm/model_executor/layers/quantization/awq_marlin.py
vllm/model_executor/layers/quantization/awq_marlin.py
+7
-3
vllm/model_executor/layers/quantization/moe_wna16.py
vllm/model_executor/layers/quantization/moe_wna16.py
+10
-5
vllm/model_executor/layers/quantization/utils/marlin_utils.py
.../model_executor/layers/quantization/utils/marlin_utils.py
+7
-3
No files found.
vllm/model_executor/layers/quantization/awq.py
View file @
e5f51b79
...
...
@@ -130,7 +130,7 @@ class AWQConfig(QuantizationConfig):
return
"awq"
def
get_supported_act_dtypes
(
self
)
->
list
[
torch
.
dtype
]:
return
[
torch
.
half
]
return
[
torch
.
half
,
torch
.
bfloat16
]
@
classmethod
def
get_min_capability
(
cls
)
->
int
:
...
...
@@ -293,7 +293,7 @@ class AWQLinearMethod(LinearMethodBase):
pad_group
=
2
dim_n
=
layer
.
scales
.
data
.
shape
[
1
]
dim_k
=
layer
.
qweight
.
data
.
shape
[
0
]
_qw
,
_sz
=
ops
.
convert_s4
(
layer
.
qweight
,
layer
.
qzeros
,
layer
.
scales
,
int
(
group_size
))
_qw
,
_sz
=
ops
.
convert_s4
(
layer
.
qweight
,
layer
.
qzeros
,
layer
.
scales
.
to
(
torch
.
float16
)
,
int
(
group_size
))
sz
=
ops
.
sz_permute
(
_sz
).
reshape
(
-
1
,
dim_n
)
sz
=
sz
.
reshape
(
dim_n
,
-
1
)
_qw
=
_qw
.
reshape
(
dim_n
,
-
1
)
...
...
vllm/model_executor/layers/quantization/awq_marlin.py
View file @
e5f51b79
...
...
@@ -10,7 +10,8 @@ import vllm.model_executor.layers.fused_moe # noqa
from
vllm
import
_custom_ops
as
ops
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.fused_moe.layer
import
(
FusedMoE
,
FusedMoEMethodBase
,
FusedMoeWeightScaleSupported
)
FusedMoE
,
FusedMoEMethodBase
,
FusedMoeWeightScaleSupported
,
UnquantizedFusedMoEMethod
)
from
vllm.model_executor.layers.linear
import
(
LinearBase
,
LinearMethodBase
,
UnquantizedLinearMethod
,
set_weight_attrs
)
...
...
@@ -140,6 +141,9 @@ class AWQMarlinConfig(QuantizationConfig):
self
.
full_config
).
get_quant_method
(
layer
,
prefix
)
return
AWQMarlinLinearMethod
(
self
)
elif
isinstance
(
layer
,
FusedMoE
):
if
is_layer_skipped_awq
(
prefix
,
getattr
(
self
,
"modules_to_not_convert"
,
[])):
return
UnquantizedFusedMoEMethod
(
layer
.
moe_config
)
from
vllm.model_executor.layers.quantization.moe_wna16
import
(
MoeWNA16Config
)
if
not
check_moe_marlin_supports_layer
(
layer
,
self
.
group_size
):
...
...
@@ -436,7 +440,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
# Why does this take the intermediate size for size_k?
marlin_w13_scales
=
marlin_moe_permute_scales
(
s
=
layer
.
w13_scales
,
s
=
layer
.
w13_scales
.
to
(
torch
.
float16
)
,
size_k
=
layer
.
intermediate_size_per_partition
,
size_n
=
layer
.
w13_scales
.
shape
[
2
],
group_size
=
self
.
quant_config
.
group_size
,
...
...
@@ -445,7 +449,7 @@ class AWQMoEMethod(FusedMoEMethodBase):
#replace_parameter(layer, "w13_scales", marlin_w13_scales)
marlin_w2_scales
=
marlin_moe_permute_scales
(
s
=
layer
.
w2_scales
,
s
=
layer
.
w2_scales
.
to
(
torch
.
float16
)
,
size_k
=
layer
.
intermediate_size_per_partition
,
size_n
=
layer
.
w2_scales
.
shape
[
2
],
group_size
=
self
.
quant_config
.
group_size
,
...
...
vllm/model_executor/layers/quantization/moe_wna16.py
View file @
e5f51b79
...
...
@@ -7,7 +7,8 @@ import torch
import
os
from
vllm.distributed
import
get_tensor_model_parallel_rank
,
get_tp_group
from
vllm.model_executor.layers.fused_moe.layer
import
(
FusedMoE
,
FusedMoEMethodBase
,
FusedMoeWeightScaleSupported
)
FusedMoE
,
FusedMoEMethodBase
,
FusedMoeWeightScaleSupported
,
UnquantizedFusedMoEMethod
)
from
vllm.model_executor.layers.linear
import
(
LinearBase
,
UnquantizedLinearMethod
)
from
vllm.model_executor.layers.quantization
import
QuantizationMethods
...
...
@@ -18,7 +19,8 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
vllm.model_executor.layers.quantization.awq
import
(
is_layer_skipped_awq
)
from
lmslim.layers.fused_moe.fuse_moe_int4
import
fused_experts_w4a16
os
.
environ
[
'W4A16_MOE_CUDA'
]
=
os
.
environ
.
get
(
'W4A16_MOE_CUDA'
,
'0'
)
...
...
@@ -139,9 +141,9 @@ class MoeWNA16Config(QuantizationConfig):
def
get_quant_method
(
self
,
layer
:
torch
.
nn
.
Module
,
prefix
:
str
)
->
Optional
[
"QuantizeMethodBase"
]:
if
is_layer_skipped_quant
(
prefix
,
self
.
modules_to_not_convert
):
return
UnquantizedLinearMethod
()
elif
isinstance
(
layer
,
LinearBase
):
if
isinstance
(
layer
,
LinearBase
):
if
is_layer_skipped_quant
(
prefix
,
self
.
modules_to_not_convert
):
return
UnquantizedLinearMethod
()
# Avoid circular import
from
vllm.model_executor.layers.quantization.awq
import
AWQConfig
from
vllm.model_executor.layers.quantization.awq_marlin
import
(
...
...
@@ -167,6 +169,9 @@ class MoeWNA16Config(QuantizationConfig):
else
:
raise
ValueError
(
"moe_wna16 only support gptq and awq."
)
elif
isinstance
(
layer
,
FusedMoE
):
if
is_layer_skipped_awq
(
prefix
,
getattr
(
self
,
"modules_to_not_convert"
,
[])):
return
UnquantizedFusedMoEMethod
(
layer
.
moe_config
)
return
MoeWNA16Method
(
self
)
return
None
...
...
vllm/model_executor/layers/quantization/utils/marlin_utils.py
View file @
e5f51b79
...
...
@@ -176,15 +176,19 @@ def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \
supports_router_weight
=
not
layer
.
apply_router_weight_on_input
# moe marlin requires the activation to be silu
supports_activation
=
layer
.
activation
==
"silu"
#暂时只支持bw
device_name
=
torch
.
cuda
.
get_device_properties
(
torch
.
cuda
.
current_device
()).
name
supports_device
=
"BW"
in
device_name
# gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
# down: (n, k) = (hidden_size, intermediate_size_per_partition)
# moe marlin requires n % 128 == 0 and k % 64 == 0
supports_shape
=
hidden_size
%
128
==
0
and
\
intermediate_size_per_partition
%
max
(
64
,
group_size
)
==
0
supports_group_size
=
group_size
in
[
-
1
,
32
,
64
,
128
]
#暂时只支持64
supports_group_size
=
group_size
in
[
64
]
return
supports_shape
and
supports_group_size
and
\
supports_router_weight
and
supports_activation
supports_router_weight
and
supports_activation
and
supports_device
def
marlin_make_workspace
(
output_size_per_partition
:
int
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment