Use sgl fp4 quant kernel by default (#12482)

59cce594 · Qiaolin Yu · GitHub · 795e98f8 · 59cce594
Unverified Commit 59cce594 authored Oct 31, 2025 by Qiaolin Yu Committed by GitHub Oct 31, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 1 deletion

python/sglang/srt/layers/quantization/modelopt_quant.py python/sglang/srt/layers/quantization/modelopt_quant.py +5 -1

No files found.
--- a/python/sglang/srt/layers/quantization/modelopt_quant.py
+++ b/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional
 import torch
 from torch.nn.parameter import Parameter

+from sglang.srt.utils.common import is_sm120_supported
 from sglang.srt.distributed import get_tp_group
 from sglang.srt.layers.dp_attention import get_dp_global_num_tokens, get_local_dp_buffer
 from sglang.srt.layers.moe import (
@@ -51,7 +52,10 @@ if TYPE_CHECKING:
    from sglang.srt.single_batch_overlap import DownGemmOverlapArgs

 try:
-    from flashinfer import fp4_quantize
+    if is_sm120_supported():  # flashinfer's kernel is kept only on this path
+        from flashinfer import fp4_quantize
+    else:  # aliased to the same name; TODO confirm both accept the callers' args
+        from sgl_kernel import scaled_fp4_quant as fp4_quantize
except ImportError:
    fp4_quantize = None