Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6adf9d12
Commit
6adf9d12
authored
Mar 27, 2026
by
flyingdown
Browse files
use tuning w4a16 moe
parent
54e03934
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
10 additions
and
1 deletion
+10
-1
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+10
-1
No files found.
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
6adf9d12
...
@@ -36,6 +36,7 @@ try:
...
@@ -36,6 +36,7 @@ try:
per_token_group_quant_int8
,
per_token_quant_int8
)
per_token_group_quant_int8
,
per_token_quant_int8
)
from
lmslim.layers.fused_moe.fuse_moe_int8
import
(
fused_experts_impl_int8
,
get_w8a8moe_json
)
from
lmslim.layers.fused_moe.fuse_moe_int8
import
(
fused_experts_impl_int8
,
get_w8a8moe_json
)
from
lmslim.layers.fused_moe.fuse_moe_w4a8
import
fused_experts_impl_w4a8
from
lmslim.layers.fused_moe.fuse_moe_w4a8
import
fused_experts_impl_w4a8
from
lmslim.layers.fused_moe.fuse_moe_w4a16
import
get_moe_triton_config_w4a16
except
Exception
:
except
Exception
:
print
(
"INFO: Please install lmslim if you want to infer the quantitative model of moe.
\n
"
)
print
(
"INFO: Please install lmslim if you want to infer the quantitative model of moe.
\n
"
)
...
@@ -1984,7 +1985,15 @@ def fused_experts_impl(
...
@@ -1984,7 +1985,15 @@ def fused_experts_impl(
use_nn_moe
=
use_nn_moe
,
use_nn_moe
=
use_nn_moe
,
)
)
config
=
get_config_func
(
M
)
# config = get_config_func(M)
_
,
N1
,
_
=
w1
.
shape
_
,
N2
,
K2
=
w2
.
shape
config
,
_
,
status
=
get_moe_triton_config_w4a16
(
M
,
E
,
N1
,
N2
,
K2
*
2
,
top_k_num
,
block_shape
[
1
],
hidden_states
.
dtype
)
# debug
# print(f"M:{M}, E:{E}, N1:{N1}, N2:{N2}, K2:{K2}, top_k_num:{top_k_num}, block_shape:{block_shape}, dtype:{hidden_states.dtype}, status:{status}")
assert
status
,
"config not found."
# We can reuse the memory between these because by the time we need
# We can reuse the memory between these because by the time we need
# cache3, we're done with cache1
# cache3, we're done with cache1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment