Unverified Commit f1d78923 authored by Yineng Zhang, committed by GitHub

[Auto Sync] Update modelopt_quant.py (20250920) (#10688)


Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
parent 7c876de7
@@ -77,6 +77,9 @@ logger = logging.getLogger(__name__)
 CUTEDSL_MOE_SCALAR_INPUT_SCALE = get_bool_env_var(
     "SGLANG_CUTEDSL_MOE_SCALAR_INPUT_SCALE", "true"
 )
+USE_CUTLASS_BACKEND_FOR_FP4_GEMM = get_bool_env_var(
+    "SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM"
+)
 # Supported activation schemes for the current configuration
 ACTIVATION_SCHEMES = ["static"]
@@ -844,6 +847,17 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         if enable_flashinfer_fp4_gemm:
             w = layer.weight.T
             w_scale_interleaved = layer.weight_scale_interleaved.T
+            if USE_CUTLASS_BACKEND_FOR_FP4_GEMM:
+                out = fp4_gemm(
+                    x_fp4,
+                    w,
+                    x_scale_interleaved,
+                    w_scale_interleaved,
+                    layer.alpha,
+                    output_dtype,
+                    backend="cutlass",
+                )
+            else:
                 out = fp4_gemm(
                     x_fp4,
                     w,
...
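
Because the flag is evaluated in a module-level assignment (first hunk), it must be set in the environment before sglang imports modelopt_quant. A hypothetical opt-in usage sketch:

import os

# Must be set before the sglang import below, since
# USE_CUTLASS_BACKEND_FOR_FP4_GEMM is read at module import time.
os.environ["SGLANG_USE_CUTLASS_BACKEND_FOR_FP4_GEMM"] = "true"

# ... then import/launch sglang as usual; per the hunk above,
# ModelOptFp4LinearMethod will now pass backend="cutlass" to fp4_gemm
# whenever flashinfer FP4 GEMM is enabled.
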