Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6372a1f3
"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "84e23d103d3483f944780d0d42bcf0993fd27e3a"
Commit
6372a1f3
authored
Sep 07, 2025
by
王敏
Browse files
support w4a8 ep
parent
ffa325e0
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
81 additions
and
64 deletions
+81
-64
vllm/model_executor/layers/fused_moe/ep_moe/layer.py
vllm/model_executor/layers/fused_moe/ep_moe/layer.py
+2
-2
vllm/model_executor/layers/quantization/slimquant_w4a8.py
vllm/model_executor/layers/quantization/slimquant_w4a8.py
+79
-62
No files found.
vllm/model_executor/layers/fused_moe/ep_moe/layer.py
View file @
6372a1f3
...
@@ -34,7 +34,7 @@ class UnquantizedEPGroupedGemmMethod(UnquantizedFusedMoEMethod):
...
@@ -34,7 +34,7 @@ class UnquantizedEPGroupedGemmMethod(UnquantizedFusedMoEMethod):
self
.
rocm_aiter_moe_enabled
=
False
# is_rocm_aiter_moe_enabled()
self
.
rocm_aiter_moe_enabled
=
False
# is_rocm_aiter_moe_enabled()
def
apply
(
def
apply
_ep
(
self
,
self
,
layer
:
torch
.
nn
.
Module
,
layer
:
torch
.
nn
.
Module
,
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
...
@@ -254,7 +254,7 @@ class EPMoE(FusedMoE):
...
@@ -254,7 +254,7 @@ class EPMoE(FusedMoE):
)
)
# Matrix multiply.
# Matrix multiply.
expert_output
=
self
.
quant_method
.
apply
(
expert_output
=
self
.
quant_method
.
apply
_ep
(
layer
=
self
,
layer
=
self
,
hidden_states
=
dispatched_input
,
hidden_states
=
dispatched_input
,
tokens_per_expert
=
tokens_per_expert
tokens_per_expert
=
tokens_per_expert
...
...
vllm/model_executor/layers/quantization/slimquant_w4a8.py
View file @
6372a1f3
...
@@ -21,7 +21,10 @@ from vllm.utils import W8a8GetCacheJSON
...
@@ -21,7 +21,10 @@ from vllm.utils import W8a8GetCacheJSON
import
os
import
os
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
try
:
from
lmslim.layers.fused_moe.fuse_moe_w4a8
import
fused_experts_impl_w4a8_ep
except
Exception
:
print
(
"INFO: Please install lmslim if you want to infer the quantitative model of moe.
\n
"
)
W8A8_TRITONJSON
=
W8a8GetCacheJSON
()
W8A8_TRITONJSON
=
W8a8GetCacheJSON
()
def
baseline_scaled_mm
(
a
:
torch
.
Tensor
,
def
baseline_scaled_mm
(
a
:
torch
.
Tensor
,
...
@@ -328,65 +331,79 @@ class SlimQuantW4A8Int8MoEMethod:
...
@@ -328,65 +331,79 @@ class SlimQuantW4A8Int8MoEMethod:
layer
.
w2_weight_scale
.
data
,
requires_grad
=
False
layer
.
w2_weight_scale
.
data
,
requires_grad
=
False
)
)
def
apply
(
def
apply_ep
(
#dp+ep
self
,
self
,
layer
:
torch
.
nn
.
Module
,
layer
:
torch
.
nn
.
Module
,
x
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
router_logits
:
torch
.
Tensor
,
tokens_per_expert
:
torch
.
Tensor
,
top_k
:
int
,
)
->
torch
.
Tensor
:
renormalize
:
bool
,
return
fused_experts_impl_w4a8_ep
(
hidden_states
,
use_grouped_topk
:
bool
=
False
,
layer
.
w13_weight
,
topk_group
:
Optional
[
int
]
=
None
,
layer
.
w2_weight
,
num_expert_group
:
Optional
[
int
]
=
None
,
layer
.
w13_weight_scale
,
global_num_experts
:
int
=
-
1
,
layer
.
w2_weight_scale
,
expert_map
:
Optional
[
torch
.
Tensor
]
=
None
,
tokens_per_expert
)
custom_routing_function
:
Optional
[
Callable
]
=
None
,
scoring_func
:
str
=
"softmax"
,
e_score_correction_bias
:
Optional
[
torch
.
Tensor
]
=
None
,
def
apply
(
# tp
apply_router_weight_on_input
:
bool
=
False
,
self
,
activation
:
str
=
"silu"
,
layer
:
torch
.
nn
.
Module
,
enable_eplb
:
bool
=
False
,
x
:
torch
.
Tensor
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
router_logits
:
torch
.
Tensor
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
top_k
:
int
,
use_fused_gate
:
Optional
[
bool
]
=
False
,
renormalize
:
bool
,
**
_
use_grouped_topk
:
bool
=
False
,
)
->
torch
.
Tensor
:
topk_group
:
Optional
[
int
]
=
None
,
from
vllm.model_executor.layers.fused_moe
import
fused_experts
num_expert_group
:
Optional
[
int
]
=
None
,
if
enable_eplb
:
global_num_experts
:
int
=
-
1
,
raise
NotImplementedError
(
expert_map
:
Optional
[
torch
.
Tensor
]
=
None
,
"EPLB not supported for `SlimQuantW4A8Int8MoEMethod` yet."
)
custom_routing_function
:
Optional
[
Callable
]
=
None
,
# Expert selection
scoring_func
:
str
=
"softmax"
,
topk_weights
,
topk_ids
=
FusedMoE
.
select_experts
(
e_score_correction_bias
:
Optional
[
torch
.
Tensor
]
=
None
,
hidden_states
=
x
,
apply_router_weight_on_input
:
bool
=
False
,
router_logits
=
router_logits
,
activation
:
str
=
"silu"
,
use_grouped_topk
=
use_grouped_topk
,
enable_eplb
:
bool
=
False
,
top_k
=
top_k
,
use_nn_moe
:
Optional
[
bool
]
=
False
,
renormalize
=
renormalize
,
routed_scaling_factor
:
Optional
[
float
]
=
None
,
topk_group
=
topk_group
,
use_fused_gate
:
Optional
[
bool
]
=
False
,
num_expert_group
=
num_expert_group
,
**
_
custom_routing_function
=
custom_routing_function
,
)
->
torch
.
Tensor
:
scoring_func
=
scoring_func
,
from
vllm.model_executor.layers.fused_moe
import
fused_experts
e_score_correction_bias
=
e_score_correction_bias
,
if
enable_eplb
:
routed_scaling_factor
=
routed_scaling_factor
,
raise
NotImplementedError
(
use_fused_gate
=
use_fused_gate
"EPLB not supported for `SlimQuantW4A8Int8MoEMethod` yet."
)
)
# Expert selection
topk_weights
,
topk_ids
=
FusedMoE
.
select_experts
(
hidden_states
=
x
,
router_logits
=
router_logits
,
use_grouped_topk
=
use_grouped_topk
,
top_k
=
top_k
,
renormalize
=
renormalize
,
topk_group
=
topk_group
,
num_expert_group
=
num_expert_group
,
custom_routing_function
=
custom_routing_function
,
scoring_func
=
scoring_func
,
e_score_correction_bias
=
e_score_correction_bias
,
routed_scaling_factor
=
routed_scaling_factor
,
use_fused_gate
=
use_fused_gate
)
return
fused_experts
(
return
fused_experts
(
x
,
x
,
layer
.
w13_weight
,
layer
.
w13_weight
,
layer
.
w2_weight
,
layer
.
w2_weight
,
topk_weights
=
topk_weights
,
topk_weights
=
topk_weights
,
topk_ids
=
topk_ids
,
topk_ids
=
topk_ids
,
inplace
=
True
,
inplace
=
True
,
use_int4_w4a8
=
True
,
use_int4_w4a8
=
True
,
per_channel_quant
=
True
,
per_channel_quant
=
True
,
activation
=
activation
,
activation
=
activation
,
expert_map
=
expert_map
,
expert_map
=
expert_map
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
global_num_experts
=
global_num_experts
,
global_num_experts
=
global_num_experts
,
w1_scale
=
(
layer
.
w13_weight_scale
),
w1_scale
=
(
layer
.
w13_weight_scale
),
w2_scale
=
(
layer
.
w2_weight_scale
),
w2_scale
=
(
layer
.
w2_weight_scale
),
a1_scale
=
layer
.
w13_input_scale
,
a1_scale
=
layer
.
w13_input_scale
,
a2_scale
=
layer
.
w2_input_scale
,
a2_scale
=
layer
.
w2_input_scale
,
use_nn_moe
=
use_nn_moe
,
use_nn_moe
=
use_nn_moe
,
)
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment