Unverified commit 59cce594, authored by Qiaolin Yu, committed by GitHub
Browse files

Use sgl fp4 quant kernel by default (#12482)

parent 795e98f8
......@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional
import torch
from torch.nn.parameter import Parameter
from python.sglang.srt.utils.common import is_sm120_supported
from sglang.srt.distributed import get_tp_group
from sglang.srt.layers.dp_attention import get_dp_global_num_tokens, get_local_dp_buffer
from sglang.srt.layers.moe import (
......@@ -51,7 +52,10 @@ if TYPE_CHECKING:
from sglang.srt.single_batch_overlap import DownGemmOverlapArgs
# Select the FP4 quantization kernel once, at import time.
#
# - On SM120-capable devices (per is_sm120_supported()) use FlashInfer's
#   `fp4_quantize`.
# - Everywhere else default to sgl_kernel's `scaled_fp4_quant`, bound under
#   the same local name `fp4_quantize`.
# - If the selected package cannot be imported, `fp4_quantize` is set to
#   None; callers must check for None before using it.
#
# FlashInfer is deliberately NOT imported unconditionally ahead of the
# branch: doing so would raise ImportError on machines without FlashInfer
# and leave `fp4_quantize = None` even though sgl_kernel is available.
#
# NOTE(review): presumably both kernels are call-compatible at every call
# site that uses `fp4_quantize` -- confirm against the callers.
# NOTE(review): `is_sm120_supported` is imported at the top of this file from
# `python.sglang.srt.utils.common`, while sibling imports use the `sglang.`
# root. That import sits outside this try block, so if the `python.` prefix
# does not resolve in an installed package the module fails before reaching
# this point -- verify and switch to `sglang.srt.utils.common` if so.
try:
    if is_sm120_supported():
        from flashinfer import fp4_quantize
    else:
        from sgl_kernel import scaled_fp4_quant as fp4_quantize
except ImportError:
    # Optional dependency missing: leave a sentinel rather than failing the
    # whole module import.
    fp4_quantize = None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment