[Bugfix] Fix marlin moe fallback logic for llama4 (#18042)

Signed-off-by: mgoin <mgoin64@gmail.com>

[Bugfix] Fix marlin moe fallback logic for llama4 (#18042)
Signed-off-by: mgoin <mgoin64@gmail.com>
ea6ae8cb · Michael Goin · GitHub · 2ff297dc · ea6ae8cb · ea6ae8cb
Unverified commit ea6ae8cb, authored May 13, 2025 by Michael Goin; committed by GitHub on May 13, 2025
3 changed files
--- a/tests/weight_loading/models-large.txt
+++ b/tests/weight_loading/models-large.txt
@@ -5,3 +5,4 @@ compressed-tensors, nm-testing/test-w4a16-mixtral-actorder-group, main
 gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main
 gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, gptq-8bit-128g-actorder_True
 awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main
+compressed-tensors, RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16, main
\ No newline at end of file
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -480,6 +480,7 @@ class FusedMoE(torch.nn.Module):
        self.custom_routing_function = custom_routing_function
        self.scoring_func = scoring_func
        self.e_score_correction_bias = e_score_correction_bias
+        self.apply_router_weight_on_input = apply_router_weight_on_input
        self.activation = activation
        if self.scoring_func != "softmax" and not self.use_grouped_topk:
@@ -498,7 +499,6 @@ class FusedMoE(torch.nn.Module):
            self.quant_method = quant_config.get_quant_method(self, prefix)
        assert self.quant_method is not None
-        self.apply_router_weight_on_input = apply_router_weight_on_input
        moe_quant_params = {
            "num_experts": self.local_num_experts,
            "hidden_size": hidden_size,

--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -171,13 +171,19 @@ def check_moe_marlin_supports_layer(layer: LinearBase, group_size: int) \
                                    -> bool:
    hidden_size = layer.hidden_size
    intermediate_size_per_partition = layer.intermediate_size_per_partition
+    # apply_router_weight_on_input is not supported for moe marlin
+    supports_router_weight = not layer.apply_router_weight_on_input
+    # moe marlin requires the activation to be silu
+    supports_activation = layer.activation == "silu"
    # gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
    # down: (n, k) = (hidden_size, intermediate_size_per_partition)
    # moe marlin requires n % 128 == 0 and k % 64 == 0
-    return hidden_size % 128 == 0 and \
+    supports_shape = hidden_size % 128 == 0 and \
-        intermediate_size_per_partition % max(64, group_size) == 0 and \
+        intermediate_size_per_partition % max(64, group_size) == 0
-        group_size in [-1, 32, 64, 128]
+    supports_group_size = group_size in [-1, 32, 64, 128]
+    return supports_shape and supports_group_size and \
+        supports_router_weight and supports_activation
 def marlin_make_workspace(output_size_per_partition: int,