[Bugfix][MoE] Unpad routed output before shared expert add [Fixes #35949] (#40794)

Signed-off-by: Netanel Haber <nhaber@nvidia.com>

[Bugfix][MoE] Unpad routed output before shared expert add [Fixes #35949] (#40794)
Signed-off-by: Netanel Haber <nhaber@nvidia.com>
e8eb0490 · Netanel Haber · GitHub · e8ee2a78 · e8eb0490
Unverified Commit e8eb0490 authored Apr 24, 2026 by Netanel Haber Committed by GitHub Apr 24, 2026
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 0 deletions

vllm/model_executor/layers/fused_moe/runner/moe_runner.py vllm/model_executor/layers/fused_moe/runner/moe_runner.py +6 -0

No files found.
--- a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py
+++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py
@@ -550,10 +550,14 @@ class MoERunner(MoERunnerInterface):
            hidden_states
        )

+        # Save the unpadded width; `_maybe_pad_hidden_states` may pad activations to
+        # `moe_config.hidden_dim`, e.g. after `align_trtllm_fp4_moe_hidden_dim_for_fi`.
+        routed_hidden_dim = hidden_states.shape[-1]
        hidden_states, og_hidden_dim = self._maybe_pad_hidden_states(
            shared_experts_input,
            hidden_states,
        )
+        hidden_dim_was_padded = hidden_states.shape[-1] > routed_hidden_dim

        result = self._forward_entry(
            hidden_states,
@@ -573,6 +577,8 @@ class MoERunner(MoERunnerInterface):

        # Extract outputs from result
        shared_output, fused_output = _unpack(result)
+        if hidden_dim_was_padded:
+            fused_output = fused_output[..., :routed_hidden_dim]

        # If combine kernel already reduced fused, reduce shared to match.
        # See note above re: the two all-reduce points.