处理VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD导致的awq推理bug问题

beae085a · yangql · 06185134 · beae085a · beae085a
Commit beae085a authored Mar 24, 2026 by yangql
Showing with 11 additions and 14 deletions

vllm/model_executor/layers/fused_moe/layer.py +10 -14

vllm/model_executor/layers/quantization/moe_wna16.py +1 -0

No files found.
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -2073,27 +2073,23 @@ class FusedMoE(CustomOp):
                            else 1.0
                        ),
                    )
-                else:
+                elif envs.VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD and shared_output is not None:
                    final_hidden_states = self.quant_method.apply(
                        layer=self,
                        x=x,  # The type signture of this is wrong due to the hack.
                        topk_weights=topk_weights,
                        topk_ids=topk_ids,
                        use_nn_moe=self.use_nn_moe,
-                        shared_output=(
-                            shared_output
-                            if envs.VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD
-                            and shared_output is not None
-                            else None
-                        ),
-                        routed_scaling_factor=(
-                            routed_scaling_factor
-                            if envs.VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD
-                            and shared_output is not None
-                            else 1.0
-                        ),
+                        shared_output=shared_output,
+                        routed_scaling_factor=routed_scaling_factor,
                    )
-
+                else:
+                    final_hidden_states = self.quant_method.apply(
+                        layer=self,
+                        x=x,  # The type signture of this is wrong due to the hack.
+                        topk_weights=topk_weights,
+                        topk_ids=topk_ids,
+                        use_nn_moe=self.use_nn_moe,)
            if has_separate_shared_experts:
                assert self.shared_experts is not None


--- a/vllm/model_executor/layers/quantization/moe_wna16.py
+++ b/vllm/model_executor/layers/quantization/moe_wna16.py
@@ -381,6 +381,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
        topk_ids: torch.Tensor,
        use_nn_moe: bool | None = False,
        use_fused_gate: bool | None = False,
+        **_
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        from vllm.model_executor.layers.fused_moe import fused_experts