Unverified commit 89caf7a3, authored by Trevor Morris, committed by GitHub
Browse files

[bugfix] Apply routed scaling factor to cutlass_fused_experts_fp8 (#8688)

parent b27b1191
@@ -1039,7 +1039,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
         topk_weights, topk_ids, _ = topk_output
-        return cutlass_fused_experts_fp8(
+        output = cutlass_fused_experts_fp8(
             x,
             layer.w13_weight.transpose(1, 2),
             layer.w2_weight.transpose(1, 2),
@@ -1062,6 +1062,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             self.problem_sizes2,
             use_fp8_blockscale=True,
         )
+        # TODO: Fuse into select_experts
+        if routed_scaling_factor is not None:
+            output *= routed_scaling_factor
+        return output
         # Expert fusion with FP8 quantization
         return fused_experts(
             x,
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment