Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ea6ae8cb
Unverified
Commit
ea6ae8cb
authored
May 13, 2025
by
Michael Goin
Committed by
GitHub
May 13, 2025
Browse files
[Bugfix] Fix marlin moe fallback logic for llama4 (#18042)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
2ff297dc
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
12 additions
and
5 deletions
+12
-5
tests/weight_loading/models-large.txt
tests/weight_loading/models-large.txt
+2
-1
vllm/model_executor/layers/fused_moe/layer.py
vllm/model_executor/layers/fused_moe/layer.py
+1
-1
vllm/model_executor/layers/quantization/utils/marlin_utils.py
.../model_executor/layers/quantization/utils/marlin_utils.py
+9
-3
No files found.
tests/weight_loading/models-large.txt
View file @
ea6ae8cb
...
@@ -5,3 +5,4 @@ compressed-tensors, nm-testing/test-w4a16-mixtral-actorder-group, main
...
@@ -5,3 +5,4 @@ compressed-tensors, nm-testing/test-w4a16-mixtral-actorder-group, main
gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, gptq-8bit-128g-actorder_True
gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, gptq-8bit-128g-actorder_True
awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main
awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main
compressed-tensors, RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16, main
\ No newline at end of file
vllm/model_executor/layers/fused_moe/layer.py
View file @
ea6ae8cb
...
@@ -480,6 +480,7 @@ class FusedMoE(torch.nn.Module):
...
@@ -480,6 +480,7 @@ class FusedMoE(torch.nn.Module):
self
.
custom_routing_function
=
custom_routing_function
self
.
custom_routing_function
=
custom_routing_function
self
.
scoring_func
=
scoring_func
self
.
scoring_func
=
scoring_func
self
.
e_score_correction_bias
=
e_score_correction_bias
self
.
e_score_correction_bias
=
e_score_correction_bias
self
.
apply_router_weight_on_input
=
apply_router_weight_on_input
self
.
activation
=
activation
self
.
activation
=
activation
if
self
.
scoring_func
!=
"softmax"
and
not
self
.
use_grouped_topk
:
if
self
.
scoring_func
!=
"softmax"
and
not
self
.
use_grouped_topk
:
...
@@ -498,7 +499,6 @@ class FusedMoE(torch.nn.Module):
...
@@ -498,7 +499,6 @@ class FusedMoE(torch.nn.Module):
self
.
quant_method
=
quant_config
.
get_quant_method
(
self
,
prefix
)
self
.
quant_method
=
quant_config
.
get_quant_method
(
self
,
prefix
)
assert
self
.
quant_method
is
not
None
assert
self
.
quant_method
is
not
None
self
.
apply_router_weight_on_input
=
apply_router_weight_on_input
moe_quant_params
=
{
moe_quant_params
=
{
"num_experts"
:
self
.
local_num_experts
,
"num_experts"
:
self
.
local_num_experts
,
"hidden_size"
:
hidden_size
,
"hidden_size"
:
hidden_size
,
...
...
vllm/model_executor/layers/quantization/utils/marlin_utils.py
View file @
ea6ae8cb
...
@@ -171,13 +171,19 @@ def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \
...
@@ -171,13 +171,19 @@ def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \
->
bool
:
->
bool
:
hidden_size
=
layer
.
hidden_size
hidden_size
=
layer
.
hidden_size
intermediate_size_per_partition
=
layer
.
intermediate_size_per_partition
intermediate_size_per_partition
=
layer
.
intermediate_size_per_partition
# apply_router_weight_on_input is not supported for moe marlin
supports_router_weight
=
not
layer
.
apply_router_weight_on_input
# moe marlin requires the activation to be silu
supports_activation
=
layer
.
activation
==
"silu"
# gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
# gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
# down: (n, k) = (hidden_size, intermediate_size_per_partition)
# down: (n, k) = (hidden_size, intermediate_size_per_partition)
# moe marlin requires n % 128 == 0 and k % 64 == 0
# moe marlin requires n % 128 == 0 and k % 64 == 0
return
hidden_size
%
128
==
0
and
\
supports_shape
=
hidden_size
%
128
==
0
and
\
intermediate_size_per_partition
%
max
(
64
,
group_size
)
==
0
and
\
intermediate_size_per_partition
%
max
(
64
,
group_size
)
==
0
group_size
in
[
-
1
,
32
,
64
,
128
]
supports_group_size
=
group_size
in
[
-
1
,
32
,
64
,
128
]
return
supports_shape
and
supports_group_size
and
\
supports_router_weight
and
supports_activation
def
marlin_make_workspace
(
output_size_per_partition
:
int
,
def
marlin_make_workspace
(
output_size_per_partition
:
int
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment