Unverified Commit 9a6ad891 authored by Yineng Zhang, committed by GitHub

chore: upgrade sgl-kernel 0.1.1 (#5933)

parent d353d08b
@@ -30,7 +30,7 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
-          pip install "vllm>=0.6.4.post1,<=0.7.2"
+          pip install "vllm==0.8.4"
           pip install "bitsandbytes>=0.44.0"
       - name: Run VLLM dependency tests
...
@@ -47,7 +47,7 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.1.0",
+    "sgl-kernel==0.1.1",
     "flashinfer_python==0.2.5",
     "torch==2.6.0",
     "torchvision==0.21.0",
...
@@ -461,7 +461,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.0",
+            "0.1.1",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
...
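For context, the server setup path above gates startup on the installed sgl-kernel version via assert_pkg_version. A minimal sketch of what such a version gate does, assuming importlib.metadata and packaging are available; the helper name require_pkg is illustrative, not sglang's actual implementation:

# Illustrative sketch only (assumed helper, not sglang's assert_pkg_version):
# refuse to start when an installed package is older than the pinned version.
from importlib.metadata import PackageNotFoundError, version

from packaging.version import Version


def require_pkg(pkg: str, min_version: str, hint: str) -> None:
    try:
        installed = Version(version(pkg))
    except PackageNotFoundError as exc:
        raise RuntimeError(f"{pkg} is not installed. {hint}") from exc
    if installed < Version(min_version):
        raise RuntimeError(f"{pkg}=={installed} is older than {min_version}. {hint}")


# Example mirroring the call in the diff:
# require_pkg("sgl-kernel", "0.1.1",
#             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`")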
@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "Pleaes install vllm by `pip install vllm==0.7.2`"
+            "Please install vllm with `pip install vllm==0.8.4`"
         )
     return QUANTIZATION_METHODS[quantization]
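The hunk above only changes the vllm pin suggested in the error message; the surrounding logic is a guarded registry lookup. A self-contained sketch of that pattern, with a placeholder method ("foo") and stand-in classes that are not the real registry contents:

# Rough, self-contained sketch of the guard-then-lookup pattern shown above;
# the registry contents and the VLLM_AVAILABLE flag here are assumptions.
from typing import Dict, Type


class QuantizationConfig:  # stand-in base class
    pass


class FooQuantConfig(QuantizationConfig):  # hypothetical method
    pass


QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {"foo": FooQuantConfig}
VLLM_QUANTIZATION_METHODS = {"foo"}  # methods whose kernels come from vllm
VLLM_AVAILABLE = False


def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
    if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
        raise ValueError(
            f"{quantization} quantization requires some operators from vllm. "
            "Please install vllm with `pip install vllm==0.8.4`"
        )
    return QUANTIZATION_METHODS[quantization]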
@@ -310,7 +310,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         if correction_bias is not None:
             if not has_correction_bias:
                 raise ValueError(
-                    "Please increase the version of your vllm. Try `pip install vllm==0.7.2`"
+                    "Please upgrade your vllm version. Try `pip install vllm==0.8.4`"
                 )
             kwargs["e_score_correction_bias"] = correction_bias
         return original_apply(**kwargs)
...
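monkey_patch_moe_apply forwards the extra e_score_correction_bias keyword only when the installed vllm apparently accepts it (the error path suggests has_correction_bias tracks that). A generic sketch of this wrap-and-forward pattern, with invented class and argument names, shown only to illustrate the technique:

# Generic sketch of wrapping a method to conditionally forward a new kwarg;
# LegacyMoEMethod and patch_apply are hypothetical, only the pattern mirrors the diff.
import inspect


class LegacyMoEMethod:
    def apply(self, x):
        return x * 2


def patch_apply(cls, correction_bias=None):
    original_apply = cls.apply
    has_correction_bias = (
        "e_score_correction_bias" in inspect.signature(original_apply).parameters
    )

    def new_apply(self, x, **kwargs):
        if correction_bias is not None:
            if not has_correction_bias:
                raise ValueError(
                    "Please upgrade your vllm version. Try `pip install vllm==0.8.4`"
                )
            kwargs["e_score_correction_bias"] = correction_bias
        return original_apply(self, x, **kwargs)

    cls.apply = new_apply


patch_apply(LegacyMoEMethod)       # old-style signature: bias must stay None
print(LegacyMoEMethod().apply(3))  # -> 6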
@@ -79,6 +79,7 @@ from sglang.srt.utils import (
     get_available_gpu_memory,
     get_bool_env_var,
     init_custom_process_group,
+    is_ampere_with_cuda_12_3,
     is_cuda,
     is_fa3_default_architecture,
     is_flashinfer_available,
@@ -246,7 +247,7 @@ class ModelRunner:
         if not self.use_mla_backend:
             # MHA architecture
             if (
-                is_hopper_with_cuda_12_3()
+                (is_ampere_with_cuda_12_3() or is_hopper_with_cuda_12_3())
                 and is_no_spec_infer_or_topk_one(server_args)
                 and is_fa3_default_architecture(self.model_config.hf_config)
             ):
...
@@ -927,8 +928,10 @@ class ModelRunner:
             self.attn_backend = FlashMLABackend(self)
         elif self.server_args.attention_backend == "fa3":
-            assert torch.cuda.get_device_capability()[0] >= 9, (
-                "FlashAttention v3 Backend requires SM>=90. "
+            assert (
+                torch.cuda.get_device_capability()[0] == 8 and not self.use_mla_backend
+            ) or torch.cuda.get_device_capability()[0] == 9, (
+                "FlashAttention v3 Backend requires SM>=80 and SM<=90. "
                 "Please use `--attention-backend flashinfer`."
             )
             from sglang.srt.layers.attention.flashattention_backend import (
...
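The relaxed assert above now admits SM 8.x (Ampere, non-MLA models) in addition to SM 9.x (Hopper). torch.cuda.get_device_capability() returns a (major, minor) tuple, so the gate can be exercised standalone as in this hedged sketch, where use_mla_backend is faked as a plain boolean:

# Sketch of the capability gate above; requires a CUDA build of PyTorch.
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()  # e.g. (8, 0) on A100, (9, 0) on H100
    use_mla_backend = False  # stand-in for self.use_mla_backend
    fa3_supported = (major == 8 and not use_mla_backend) or major == 9
    print(f"SM{major}{minor}, fa3 supported: {fa3_supported}")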
@@ -1905,13 +1905,16 @@ def fast_topk(values, topk, dim):
         return torch.topk(values, topk, dim=dim)


-def is_hopper_with_cuda_12_3():
+def _check(cc_major):
     if not is_cuda():
         return False
-    is_hopper = torch.cuda.get_device_capability()[0] == 9
-    cuda_version = torch.version.cuda.split(".")
-    is_cuda_compatible = int(cuda_version[0]) == 12 and int(cuda_version[1]) >= 3
-    return is_hopper and is_cuda_compatible
+    return torch.cuda.get_device_capability()[0] == cc_major and tuple(
+        map(int, torch.version.cuda.split(".")[:2])
+    ) >= (12, 3)
+
+
+is_ampere_with_cuda_12_3 = lambda: _check(8)
+is_hopper_with_cuda_12_3 = lambda: _check(9)


 def get_free_port():
...
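One detail of the refactor above: the CUDA version test is now a tuple comparison, so any (major, minor) at or above (12, 3) passes, e.g. 12.4 or a future 13.0, whereas the old `int(major) == 12 and int(minor) >= 3` form only accepted 12.x releases. A small standalone illustration of that comparison (pure Python, no GPU needed):

# Standalone illustration of the tuple-based version check used by _check above.
def cuda_at_least_12_3(version_str: str) -> bool:
    return tuple(map(int, version_str.split(".")[:2])) >= (12, 3)


for v in ["12.1", "12.3", "12.4", "13.0"]:
    print(v, cuda_at_least_12_3(v))
# 12.1 False / 12.3 True / 12.4 True / 13.0 True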
@@ -16,7 +16,7 @@ rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*
 pip install --upgrade pip

 # Install sgl-kernel
-pip install sgl-kernel==0.1.0 --no-cache-dir
+pip install sgl-kernel==0.1.1 --no-cache-dir

 # Install the main package
 pip install -e "python[all]"
...