Unverified commit fcb9f879, authored by Ming Yang, committed by GitHub
Browse files

[Bugfix] Correct per_act_token in CompressedTensorsW8A8Fp8MoECutlassMethod (#20937)


Signed-off-by: Ming Yang <minos.future@gmail.com>
parent 3ed94f9d
...@@ -929,10 +929,8 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod): ...@@ -929,10 +929,8 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
scoring_func=scoring_func, scoring_func=scoring_func,
e_score_correction_bias=e_score_correction_bias) e_score_correction_bias=e_score_correction_bias)
a1_scale = layer.w13_input_scale per_act_token = (
a2_scale = layer.w2_input_scale self.input_quant.strategy == QuantizationStrategy.TOKEN)
per_act_token = a1_scale.numel() != 1 if a1_scale is not None else (
a2_scale.numel() != 1 if a2_scale is not None else False)
if self.fused_experts is None: if self.fused_experts is None:
# If no modular kernel is provided, use cutlass_moe_fp8 # If no modular kernel is provided, use cutlass_moe_fp8
...@@ -950,8 +948,8 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod): ...@@ -950,8 +948,8 @@ class CompressedTensorsW8A8Fp8MoECutlassMethod(CompressedTensorsMoEMethod):
expert_map=None if self.disable_expert_map else expert_map, expert_map=None if self.disable_expert_map else expert_map,
w1_scale=layer.w13_weight_scale, w1_scale=layer.w13_weight_scale,
w2_scale=layer.w2_weight_scale, w2_scale=layer.w2_weight_scale,
a1_scale=a1_scale, a1_scale=layer.w13_input_scale,
a2_scale=a2_scale, a2_scale=layer.w2_input_scale,
) )
else: else:
return self.fused_experts( return self.fused_experts(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment