Remove contiguous before Flashinfer groupwise fp8 gemm (#6804)

6a47b730 · Baizhou Zhang · GitHub · c429919d · 6a47b730
Unverified Commit 6a47b730 authored Jun 01, 2025 by Baizhou Zhang Committed by GitHub Jun 01, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 4 deletions

python/sglang/srt/layers/quantization/fp8_utils.py python/sglang/srt/layers/quantization/fp8_utils.py +6 -4

No files found.
--- a/python/sglang/srt/layers/quantization/fp8_utils.py
+++ b/python/sglang/srt/layers/quantization/fp8_utils.py
@@ -166,11 +166,13 @@ def flashinfer_gemm_w8a8_block_fp8_linear(
        input_2d, block_size[1], column_major_scales=False
    )

-    x_scale_input = x_scale.T.contiguous()
-    weight_scale_input = weight_scale.T.contiguous()
-
    output = gemm_fp8_nt_groupwise(
-        q_input, weight, x_scale_input, weight_scale_input, out_dtype=input_2d.dtype
+        q_input,
+        weight,
+        x_scale,
+        weight_scale,
+        scale_major_mode="K",
+        out_dtype=input_2d.dtype,
    )

    if bias is not None: