Fix quant kernel accuracy issue (#2865)

f3516c28 · Ke Bao · GitHub · 17de02f9 · f3516c28
Unverified commit f3516c28, authored Jan 13, 2025 by Ke Bao; committed by GitHub on Jan 13, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 2 additions and 1 deletion

python/sglang/srt/layers/quantization/int8_kernel.py +2 -1

No files found.
--- a/python/sglang/srt/layers/quantization/int8_kernel.py
+++ b/python/sglang/srt/layers/quantization/int8_kernel.py
@@ -22,7 +22,8 @@ def _per_token_quant_int8(
    x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask, other=0.0).to(tl.float32)
    absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10)
    scale_x = absmax / 127
-    x_q = tl.extra.cuda.libdevice.round(x / scale_x).to(tl.int8)
+    x_q = x * (127 / absmax)
+    x_q = tl.extra.cuda.libdevice.round(x_q).to(tl.int8)
    tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask)
    tl.store(scale_ptr + row_id, scale_x)