Unverified Commit 8723b4f1 authored by Elfie Guo, committed by GitHub
Browse files

Use FlashInfer's TRTLLM FP8 Blockscale GEMM (#8588)

parent 62f99e08
...@@ -161,16 +161,16 @@ def flashinfer_gemm_w8a8_block_fp8_linear( ...@@ -161,16 +161,16 @@ def flashinfer_gemm_w8a8_block_fp8_linear(
output_shape = [*input.shape[:-1], weight.shape[0]] output_shape = [*input.shape[:-1], weight.shape[0]]
q_input, x_scale = sglang_per_token_group_quant_fp8( q_input, x_scale = sglang_per_token_group_quant_fp8(
input_2d, block_size[1], column_major_scales=False input_2d, block_size[1], column_major_scales=True
) )
# TRTLLM requires column-major scaling factors
output = gemm_fp8_nt_groupwise( output = gemm_fp8_nt_groupwise(
q_input, q_input,
weight, weight,
x_scale, x_scale,
weight_scale, weight_scale,
scale_major_mode="K",
out_dtype=input_2d.dtype, out_dtype=input_2d.dtype,
backend="trtllm",
) )
if bias is not None: if bias is not None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment