Unverified commit 6716b417 authored by Baizhou Zhang, committed by GitHub

Update default settings for blackwell (#7023)

parent 1c8b42c8
@@ -28,7 +28,7 @@ RUN git clone --depth=1 https://github.com/sgl-project/sglang.git \
 RUN pip3 install nvidia-nccl-cu12==2.26.2.post1 --force-reinstall --no-deps --break-system-packages
-RUN pip3 install https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl --break-system-packages
+RUN pip3 install flashinfer_python==0.2.6.post1 --break-system-packages
 ENV DEBIAN_FRONTEND=interactive
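The hunk above drops the pinned cu128 wheel URL in favor of the FlashInfer 0.2.6.post1 release on PyPI. A minimal, standard-library-only sanity check that the intended version actually landed in the image, assuming the distribution name matches what pip installed above:

# Verify the FlashInfer version baked into the image; this uses only the
# standard library, so it makes no assumptions about FlashInfer's internals.
from importlib.metadata import version

print(version("flashinfer_python"))  # expected: 0.2.6.post1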
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 4
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 4
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"32": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 64,
"num_warps": 4,
"num_stages": 3
},
"256": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"512": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 4,
"num_stages": 3
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 4
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 16,
"num_warps": 8,
"num_stages": 4
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 4
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 32,
"num_warps": 8,
"num_stages": 4
}
}
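The new file above is a Triton fused-MoE tuning table: each top-level key is a token count M, and each value holds the Triton launch parameters (tile sizes, group size, warp count, pipeline stages) that benchmarked fastest at that M. A minimal sketch of how such a table is typically consumed — picking the benchmarked key nearest to the runtime token count; the helper and file names here are illustrative, not sglang's actual loader API:

import json

def load_moe_config(path: str) -> dict[int, dict]:
    # Keys are serialized as strings in JSON; convert them back to ints.
    with open(path) as f:
        return {int(k): v for k, v in json.load(f).items()}

def pick_config(configs: dict[int, dict], m: int) -> dict:
    # Use the tuning entry whose benchmarked token count is closest to m.
    return configs[min(configs, key=lambda k: abs(k - m))]

configs = load_moe_config("moe_tuning.json")  # hypothetical file name
kernel_args = pick_config(configs, m=42)      # 42 is closest to key 48
print(kernel_args["BLOCK_SIZE_N"], kernel_args["num_stages"])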
@@ -52,6 +52,7 @@ from sglang.srt.layers.quantization.deep_gemm import (
 )
 from sglang.srt.layers.sampler import Sampler
 from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.lora.lora_manager import LoRAManager
 from sglang.srt.managers.eplb_manager import EPLBManager
 from sglang.srt.managers.expert_distribution import (
@@ -314,7 +315,8 @@ class ModelRunner:
             1.2 In other cases, we will use flashinfer if available, otherwise use triton.
         2. Models with MLA Architecture and using FA3
             2.1 We will use FA3 backend on Hopper.
-            2.2 Otherwise, we will use triton backend.
+            2.2 We will use FlashInfer backend on Blackwell.
+            2.3 Otherwise, we will use triton backend.
         """
         if not self.use_mla_backend:
@@ -335,6 +337,8 @@ class ModelRunner:
         # MLA architecture
         if is_hopper_with_cuda_12_3():
             server_args.attention_backend = "fa3"
+        elif is_sm100_supported():
+            server_args.attention_backend = "flashinfer"
         elif _is_hip:
             head_num = self.model_config.get_num_kv_heads(self.tp_size)
             # TODO: current aiter only supports head numbers of 16 or 128
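The new branch above slots Blackwell in between the existing Hopper and HIP paths: FA3 stays the default for MLA models on Hopper with CUDA 12.3+, SM 10.0 devices now route to FlashInfer, and everything else falls through to the Triton (or aiter) backends. A hedged sketch of what a capability gate like is_sm100_supported boils down to; the real helper in sglang.srt.layers.utils may additionally check the installed CUDA toolkit version:

import torch

def sm100_available() -> bool:
    # Hypothetical stand-in for is_sm100_supported: Blackwell data-center
    # parts report compute capability 10.x through torch.
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    return major == 10

def pick_mla_attention_backend(on_hopper_with_cuda_12_3: bool) -> str:
    # Mirrors the dispatch order in the diff (HIP/aiter branch omitted).
    if on_hopper_with_cuda_12_3:
        return "fa3"
    if sm100_available():
        return "flashinfer"
    return "triton"

print(pick_mla_attention_backend(on_hopper_with_cuda_12_3=False))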