Add shared_output and routed_scaling_factor support to the w4a8 MoE method

b33ff2d6 · zhuwenwen · 49810c37 · b33ff2d6 · b33ff2d6
Commit b33ff2d6 authored Sep 25, 2025 by zhuwenwen
2 changed files
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1771,9 +1771,8 @@ def fused_experts_impl(
                                use_nn_moe=use_nn_moe)

        if envs.VLLM_USE_LIGHTOP and not dpsk_fp16_quick: 
-            if shared_output is not None:
-                op.moe_sum(intermediate_cache3.view(*intermediate_cache3.size()),
-                        out_hidden_states[begin_chunk_idx:end_chunk_idx], shared_output[begin_chunk_idx:end_chunk_idx], None, routed_scaling_factor)
+            op.moe_sum(intermediate_cache3.view(*intermediate_cache3.size()),
+                    out_hidden_states[begin_chunk_idx:end_chunk_idx], shared_output[begin_chunk_idx:end_chunk_idx], None, routed_scaling_factor)
        # else:
        #     ops.moe_sum(intermediate_cache3.view(*intermediate_cache3.size()),
        #                 out_hidden_states[begin_chunk_idx:end_chunk_idx]) 

--- a/vllm/model_executor/layers/quantization/slimquant_w4a8.py
+++ b/vllm/model_executor/layers/quantization/slimquant_w4a8.py
@@ -358,6 +358,7 @@ class SlimQuantW4A8Int8MoEMethod:
        use_nn_moe: Optional[bool] = False,
        routed_scaling_factor: Optional[float] = None,
        use_fused_gate: Optional[bool] = False,
+        shared_output: Optional[torch.Tensor] = None,
        **_  
    ) -> torch.Tensor:
        from vllm.model_executor.layers.fused_moe import fused_experts
@@ -398,4 +399,6 @@ class SlimQuantW4A8Int8MoEMethod:
            a1_scale=layer.w13_input_scale,
            a2_scale=layer.w2_input_scale,
            use_nn_moe=use_nn_moe,
+            shared_output=shared_output,
+            routed_scaling_factor=routed_scaling_factor,
        )