"examples/profiling/gt_bench.sh" did not exist on "0d6504434befdf609d34709891eecf85f27e0934"
Unverified commit 59cce594, authored by Qiaolin Yu, committed by GitHub
Browse files

Use sgl fp4 quant kernel by default (#12482)

parent 795e98f8
...@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional ...@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional
import torch import torch
from torch.nn.parameter import Parameter from torch.nn.parameter import Parameter
from python.sglang.srt.utils.common import is_sm120_supported
from sglang.srt.distributed import get_tp_group from sglang.srt.distributed import get_tp_group
from sglang.srt.layers.dp_attention import get_dp_global_num_tokens, get_local_dp_buffer from sglang.srt.layers.dp_attention import get_dp_global_num_tokens, get_local_dp_buffer
from sglang.srt.layers.moe import ( from sglang.srt.layers.moe import (
...@@ -51,7 +52,10 @@ if TYPE_CHECKING: ...@@ -51,7 +52,10 @@ if TYPE_CHECKING:
from sglang.srt.single_batch_overlap import DownGemmOverlapArgs from sglang.srt.single_batch_overlap import DownGemmOverlapArgs
try: try:
from flashinfer import fp4_quantize if is_sm120_supported():
from flashinfer import fp4_quantize
else:
from sgl_kernel import scaled_fp4_quant as fp4_quantize
except ImportError: except ImportError:
fp4_quantize = None fp4_quantize = None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment