Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b3312eec
Commit
b3312eec
authored
Jun 16, 2025
by
yangql
Browse files
分离fuse moe awq算子到lmslim上
parent
483acdc4
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
34 additions
and
6 deletions
+34
-6
vllm/model_executor/layers/quantization/moe_wna16.py
vllm/model_executor/layers/quantization/moe_wna16.py
+27
-1
vllm/model_executor/layers/quantization/utils/fused_moe_cuda.py
...odel_executor/layers/quantization/utils/fused_moe_cuda.py
+7
-5
No files found.
vllm/model_executor/layers/quantization/moe_wna16.py
View file @
b3312eec
...
...
@@ -17,7 +17,10 @@ from vllm.model_executor.utils import set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
lmslim.layers.fused_moe.fuse_moe_int4
import
fused_experts_w4a16
# Make sure both backend-selection variables exist in the process
# environment, leaving any value that was already set untouched:
#   W4A16_MOE_CUDA   falls back to '0'
#   W4A16_MOE_LMSLIM falls back to '1'
os.environ.setdefault('W4A16_MOE_CUDA', '0')
os.environ.setdefault('W4A16_MOE_LMSLIM', '1')

# The CUDA fused-experts implementation is imported only when it has been
# explicitly requested through the environment.
if os.environ['W4A16_MOE_CUDA'] == '1':
    from vllm.model_executor.layers.quantization.utils.fused_moe_cuda import fused_experts_cuda
...
...
@@ -180,7 +183,11 @@ class MoeWNA16Method(FusedMoEMethodBase):
def __init__(self, quant_config: MoeWNA16Config):
    """Store the quantization config and read the backend-selection flags.

    The flags are taken from environment variables:

    * ``AWQ_MOE_SZ == '1'`` sets ``use_w4a16_moe_sz``; while it is off,
      both backend flags below stay ``False``.
    * ``W4A16_MOE_CUDA == '1'`` sets ``use_w4a16_cuda``.
    * ``W4A16_MOE_LMSLIM == '1'`` sets ``use_moe_lmslim``.

    Args:
        quant_config: Quantization settings, kept as ``self.quant_config``.
    """
    self.quant_config = quant_config
    self.use_w4a16_moe_sz = os.environ.get('AWQ_MOE_SZ') == '1'

    # Keep the flags as real booleans on every path (previously they were
    # the int 0 unless AWQ_MOE_SZ was enabled, so their type varied).
    self.use_w4a16_cuda = False
    self.use_moe_lmslim = False

    if self.use_w4a16_moe_sz:
        # .get() with the same fallbacks the module installs at import time
        # ('0' and '1'), so a variable that is missing from the environment
        # at construction time cannot raise KeyError here.
        self.use_w4a16_cuda = os.environ.get('W4A16_MOE_CUDA', '0') == '1'
        self.use_moe_lmslim = os.environ.get('W4A16_MOE_LMSLIM', '1') == '1'
def
create_weights
(
self
,
layer
:
torch
.
nn
.
Module
,
num_experts
:
int
,
hidden_size
:
int
,
intermediate_size_per_partition
:
int
,
params_dtype
:
torch
.
dtype
,
**
extra_weight_attrs
):
...
...
@@ -352,6 +359,24 @@ class MoeWNA16Method(FusedMoEMethodBase):
weight_bits
=
self
.
quant_config
.
weight_bits
has_zp
=
self
.
quant_config
.
has_zp
if
self
.
use_moe_lmslim
:
return
fused_experts_w4a16
(
x
,
layer
.
w13_qweight
,
layer
.
w2_qweight
,
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
inplace
=
True
,
activation
=
activation
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
use_int4_w4a16
=
True
,
global_num_experts
=
global_num_experts
,
expert_map
=
expert_map
,
w1_scale
=
layer
.
w13_scales
,
w2_scale
=
layer
.
w2_scales
,
block_shape
=
[
0
,
layer
.
group_size
])
if
self
.
use_w4a16_cuda
:
m
=
topk_ids
.
shape
[
0
]
if
m
<=
512
:
...
...
@@ -380,6 +405,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
inplace
=
True
,
activation
=
activation
,
use_int4_w4a16
=
weight_bits
==
4
,
use_int8_w8a16
=
weight_bits
==
8
,
global_num_experts
=
global_num_experts
,
...
...
vllm/model_executor/layers/quantization/utils/fused_moe_cuda.py
View file @
b3312eec
...
...
@@ -17,8 +17,8 @@ from vllm.platforms import current_platform
from
vllm.utils
import
direct_register_custom_op
from
vllm.model_executor.layers.fused_moe.moe_align_block_size
import
(
moe_align_block_size
)
from
grouped_gemm
import
moe_gemm_w4a16
from
grouped_gemm.ops
import
permute
as
permute_topK
,
unpermute
as
unpermute_topK
from
grouped_gemm
_int4
import
moe_gemm_w4a16
from
grouped_gemm
_int4
.ops
import
permute
as
permute_topK
,
unpermute
as
unpermute_topK
import
torch.nn.functional
as
F
logger
=
init_logger
(
__name__
)
device_name
=
current_platform
.
get_device_name
()
...
...
@@ -315,7 +315,7 @@ def fused_experts_impl_cuda(hidden_states: torch.Tensor,
num_tokens_post_padded
,
# 实际专家数
expert_ids
,
# expert_id_vec
w1_scale
,
# scale_zero
64
,
# group_size
block_shape
[
1
]
,
# group_size
topk
=
topk
,
# topk
mode
=
mode_1
)
# mode=gemm1_mode
...
...
@@ -329,10 +329,12 @@ def fused_experts_impl_cuda(hidden_states: torch.Tensor,
expert_ids
,
# expert_id_vec
w2_scale
,
# scale_zero
topk_weights
,
# topk_weights
64
,
# group_size
block_shape
[
1
]
,
# group_size
topk
=
topk
,
# topk
mode
=
mode_2
)
# mode=gemm2_mode
ops
.
moe_sum
(
intermediate_cache3
.
view
(
*
intermediate_cache3
.
shape
),
out_hidden_states
)
return
out_hidden_states
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment