Unverified commit 70bb066e, authored by Azure, committed by GitHub

Fix FP4 inference corruption issue in glm4.5-air model (#9346)

parent 2c4b4b78
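Both hunks below allocate buffers whose shapes are rounded up for alignment: ((m + 128 - 1) // 128) * 128 rounds m up to a multiple of 128, and ((scale_n + 4 - 1) // 4) * 4 rounds the scale columns up to a multiple of 4. A minimal sketch of that ceiling-division idiom (round_up is a hypothetical helper name, not one from this commit):

    def round_up(x: int, multiple: int) -> int:
        # Ceiling division, then scale back up to the nearest multiple.
        return ((x + multiple - 1) // multiple) * multiple

    assert round_up(129, 128) == 256   # one extra row forces a whole new 128-row tile
    assert round_up(128, 128) == 128   # already aligned: no padding added
    assert round_up(6, 4) == 8         # scale_n = 6 -> rounded_n = 8, a 2-column pad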
@@ -205,6 +205,12 @@ def scaled_fp4_quant(
     rounded_m = ((m + 128 - 1) // 128) * 128
     scale_n = n // block_size
     rounded_n = ((scale_n + 4 - 1) // 4) * 4
-    output_scale = torch.empty(
-        (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
-    )
+    # padded part should be zeroed out
+    if rounded_n > scale_n:
+        output_scale = torch.zeros(
+            (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
+        )
+    else:
+        output_scale = torch.empty(
+            (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
+        )
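Why the zeroing matters (a minimal sketch of the failure mode, not the actual sglang kernel path): torch.empty returns uninitialized memory, so when rounded_n > scale_n the padded scale columns that the quantization kernel never writes contain leftover garbage, and anything downstream that reads the full padded buffer treats those garbage values as real scales — presumably the corruption the commit message describes.

    import torch

    scale_n, rounded_n = 6, 8            # rounded_n > scale_n: 2 padded columns
    scales = torch.empty(4, rounded_n)   # columns scale_n..rounded_n-1 start as garbage
    scales[:, :scale_n] = 1.0            # the kernel writes only the real columns
    # scales[:, scale_n:] is undefined here -> reading it yields arbitrary values

    scales = torch.zeros(4, rounded_n)   # zero-filled allocation instead
    scales[:, :scale_n] = 1.0            # the padded tail is now a well-defined 0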
@@ -338,6 +344,15 @@ def scaled_fp4_experts_quant(
     output = torch.empty(
         m_numtopk, k // 2, device=input_tensor.device, dtype=torch.uint8
     )
-    output_scales = torch.empty(
-        MAX_TOKENS_PER_EXPERT * topk,
-        padded_k,
+    # padded part should be zeroed out
+    if padded_k > scales_k:
+        output_scales = torch.zeros(
+            MAX_TOKENS_PER_EXPERT * topk,
+            padded_k,
+            dtype=torch.int32,
+            device=input_tensor.device,
+        )
+    else:
+        output_scales = torch.empty(
+            MAX_TOKENS_PER_EXPERT * topk,
+            padded_k,
...
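Note the shared design choice in both hunks: the zero-fill is applied only when padding actually exists (rounded_n > scale_n in scaled_fp4_quant, padded_k > scales_k in scaled_fp4_experts_quant); when the sizes are already aligned, the code keeps the cheaper uninitialized torch.empty allocation and skips the extra memset.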