Revert "[Bugfix] Fix GPT-OSS AR+NORM fusion (#28841)" (#29483)

Signed-off-by: Huamin Li <3ericli@gmail.com>

Revert "[Bugfix] Fix GPT-OSS AR+NORM fusion (#28841)" (#29483)
Signed-off-by: Huamin Li <3ericli@gmail.com>
70d5953f · Huamin Li · GitHub · 3650a74e · 70d5953f · 70d5953f
Unverified commit 70d5953f, authored Nov 26, 2025 by Huamin Li; committed by GitHub on Nov 26, 2025
4 changed files
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -972,7 +972,6 @@ steps:
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - vllm/model_executor/layers/fused_moe/layer.py
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py

--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -111,17 +111,6 @@ if current_platform.is_cuda():
                async_tp=96,  # MLP is MoE, half the fusions of dense
            ),
        ),
-        ModelBackendTestCase(
-            model_name="openai/gpt-oss-20b",
-            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            backend=AttentionBackendEnum.FLASHINFER,
-            matches=Matches(
-                attention_fusion=0,
-                allreduce_fusion=49,
-                sequence_parallel=49,
-                async_tp=48,
-            ),
-        ),
    ]
 elif current_platform.is_rocm():

--- a/vllm/distributed/device_communicators/symm_mem.py
+++ b/vllm/distributed/device_communicators/symm_mem.py
@@ -131,7 +131,7 @@ class SymmMemCommunicator:
            return None
        if out is None:
            out = torch.empty_like(inp)
-        self.buffer[: inp.numel()].copy_(inp.reshape(-1))
+        self.buffer[: inp.numel()].copy_(inp.view(-1))
        # Determine which algorithm to use
        use_multimem = False

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1690,10 +1690,6 @@ class FusedMoE(CustomOp):
            )
        def reduce_output(states: torch.Tensor) -> torch.Tensor:
-            # Slice before all_reduce to enable possible fusion
-            if self.hidden_size != og_hidden_states:
-                states = states[..., :og_hidden_states]
            if (
                not self.is_sequence_parallel
                and not self.use_dp_chunking
@@ -1716,12 +1712,11 @@ class FusedMoE(CustomOp):
            if self.zero_expert_num is not None and self.zero_expert_num > 0:
                assert isinstance(fused_output, tuple)
                fused_output, zero_expert_result = fused_output
-                return (
-                    reduce_output(fused_output)
-                    + zero_expert_result[..., :og_hidden_states]
-                )
+                return (reduce_output(fused_output) + zero_expert_result)[
+                    ..., :og_hidden_states
+                ]
            else:
-                return reduce_output(fused_output)
+                return reduce_output(fused_output)[..., :og_hidden_states]
        else:
            if current_platform.is_tpu():
                # TODO: Once the OOM issue for the TPU backend is resolved, we
@@ -1734,8 +1729,8 @@ class FusedMoE(CustomOp):
                    hidden_states, router_logits, self.layer_name
                )
            return (
-                reduce_output(shared_output),
+                reduce_output(shared_output)[..., :og_hidden_states],
-                reduce_output(fused_output),
+                reduce_output(fused_output)[..., :og_hidden_states],
            )
    def forward_cuda(