Tiny cleanup fp4 gemm calls (#11537)

065ce815 · fzyzcjy · GitHub · 8e51049f · 065ce815
Unverified commit 065ce815, authored Oct 14, 2025 by fzyzcjy; committed by GitHub Oct 13, 2025
Show whitespace changes
Inline Side-by-side

Showing with 9 additions and 19 deletions

python/sglang/srt/layers/quantization/modelopt_quant.py python/sglang/srt/layers/quantization/modelopt_quant.py +9 -19

No files found.
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -852,17 +852,6 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
        if enable_flashinfer_fp4_gemm:
            w = layer.weight.T
            w_scale_interleaved = layer.weight_scale_interleaved.T
-        if USE_CUTLASS_BACKEND_FOR_FP4_GEMM:
-            out = fp4_gemm(
-                x_fp4,
-                w,
-                x_scale_interleaved,
-                w_scale_interleaved,
-                layer.alpha,
-                output_dtype,
-                backend="cutlass",
-            )
-        else:
        out = fp4_gemm(
            x_fp4,
            w,
@@ -870,6 +859,7 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
            w_scale_interleaved,
            layer.alpha,
            output_dtype,
+            **(dict(backend="cutlass") if USE_CUTLASS_BACKEND_FOR_FP4_GEMM else dict()),
        )
        if bias is not None:
            out = out + bias