[Bugfix] Fix precision corruption when shared_experts_stream=None (#28942)

Signed-off-by: zhyajie <yajizhan@amd.com>
Co-authored-by: zhyajie <yajizhan@amd.com>

[Bugfix] Fix precision corruption when shared_experts_stream=None (#28942)
Signed-off-by: zhyajie <yajizhan@amd.com>
Co-authored-by: zhyajie <yajizhan@amd.com>
9d2d5612 · 杰兮 · GitHub · fe69f331 · 9d2d5612 · 9d2d5612
Unverified Commit 9d2d5612 authored Nov 20, 2025 by 杰兮 Committed by GitHub Nov 19, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 8 additions and 6 deletions

vllm/model_executor/layers/fused_moe/layer.py vllm/model_executor/layers/fused_moe/layer.py +7 -4

vllm/utils/torch_utils.py vllm/utils/torch_utils.py +1 -2

No files found.
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -371,8 +371,8 @@ class FusedMoE(CustomOp):
            logger.info_once("Disabling MoE shared_experts cuda stream")
            self.shared_experts_stream = None
        else:
-            # TODO(rob): enable shared expert overlap with non-cuda.
+            # TODO(rob): enable shared expert overlap with non-cuda-alike.
-            # aux_stream() returns None on non-cuda platforms.
+            # aux_stream() returns None on non-cuda-alike platforms.
            self.shared_experts_stream = aux_stream()
            if self.shared_experts_stream is not None:
                logger.info_once("Enabled separate cuda stream for MoE shared_experts")
@@ -1865,6 +1865,11 @@ class FusedMoE(CustomOp):
                hidden_states_combined, router_logits = get_ep_group().dispatch(
                    hidden_states, router_logits, self.is_sequence_parallel
                )
+            # Run the shared experts before the matrix multiply,
+            # because the matrix multiply may modify hidden_states.
+            if has_separate_shared_experts and not use_shared_experts_stream:
+                assert self.shared_experts is not None
+                shared_output = self.shared_experts(hidden_states)
            # Matrix multiply.
            final_hidden_states = self.quant_method.apply(
@@ -1908,8 +1913,6 @@ class FusedMoE(CustomOp):
                        # conflict with the main stream
                        shared_output = self.shared_experts(hidden_states_clone)
                    current_stream().wait_stream(self.shared_experts_stream)
-                else:
-                    shared_output = self.shared_experts(hidden_states)
                final_hidden_states = (
                    shared_output,

--- a/vllm/utils/torch_utils.py
+++ b/vllm/utils/torch_utils.py
@@ -426,8 +426,7 @@ def aux_stream() -> torch.cuda.Stream | None:
    from vllm.platforms import current_platform
-    # TODO: validate this works properly on ROCm platform.
+    if _aux_stream is None and current_platform.is_cuda_alike():
-    if _aux_stream is None and current_platform.is_cuda():
        _aux_stream = torch.cuda.Stream()
    return _aux_stream