Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
9a6ad891
Unverified
Commit
9a6ad891
authored
Apr 30, 2025
by
Yineng Zhang
Committed by
GitHub
Apr 30, 2025
Browse files
chore: upgrade sgl-kernel 0.1.1 (#5933)
parent
d353d08b
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
20 additions
and
14 deletions
+20
-14
.github/workflows/vllm-dependency-test.yml
.github/workflows/vllm-dependency-test.yml
+1
-1
python/pyproject.toml
python/pyproject.toml
+1
-1
python/sglang/srt/entrypoints/engine.py
python/sglang/srt/entrypoints/engine.py
+1
-1
python/sglang/srt/layers/quantization/__init__.py
python/sglang/srt/layers/quantization/__init__.py
+2
-2
python/sglang/srt/model_executor/model_runner.py
python/sglang/srt/model_executor/model_runner.py
+6
-3
python/sglang/srt/utils.py
python/sglang/srt/utils.py
+8
-5
scripts/ci_install_dependency.sh
scripts/ci_install_dependency.sh
+1
-1
No files found.
.github/workflows/vllm-dependency-test.yml
View file @
9a6ad891
...
...
@@ -30,7 +30,7 @@ jobs:
-
name
:
Install dependencies
run
:
|
bash scripts/ci_install_dependency.sh
pip install "vllm
>=0.6.4.post1,<=0.7.2
"
pip install "vllm
==0.8.4
"
pip install "bitsandbytes>=0.44.0"
-
name
:
Run VLLM dependency tests
...
...
python/pyproject.toml
View file @
9a6ad891
...
...
@@ -47,7 +47,7 @@ runtime_common = [
srt
=
[
"sglang[runtime_common]"
,
"sgl-kernel==0.1.
0
"
,
"sgl-kernel==0.1.
1
"
,
"flashinfer_python==0.2.5"
,
"torch==2.6.0"
,
"torchvision==0.21.0"
,
...
...
python/sglang/srt/entrypoints/engine.py
View file @
9a6ad891
...
...
@@ -461,7 +461,7 @@ def _set_envs_and_config(server_args: ServerArgs):
if
_is_cuda
:
assert_pkg_version
(
"sgl-kernel"
,
"0.1.
0
"
,
"0.1.
1
"
,
"Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`"
,
)
...
...
python/sglang/srt/layers/quantization/__init__.py
View file @
9a6ad891
...
...
@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
if
quantization
in
VLLM_QUANTIZATION_METHODS
and
not
VLLM_AVAILABLE
:
raise
ValueError
(
f
"
{
quantization
}
quantization requires some operators from vllm. "
"Please install vllm by `pip install vllm==0.
7.2
`"
"Please install vllm by `pip install vllm==0.
8.4
`"
)
return
QUANTIZATION_METHODS
[
quantization
]
...
...
@@ -310,7 +310,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
if
correction_bias
is
not
None
:
if
not
has_correction_bias
:
raise
ValueError
(
"Please increase the version of your vllm. Try `pip install vllm==0.
7.2
`"
"Please increase the version of your vllm. Try `pip install vllm==0.
8.4
`"
)
kwargs
[
"e_score_correction_bias"
]
=
correction_bias
return
original_apply
(
**
kwargs
)
...
...
python/sglang/srt/model_executor/model_runner.py
View file @
9a6ad891
...
...
@@ -79,6 +79,7 @@ from sglang.srt.utils import (
get_available_gpu_memory
,
get_bool_env_var
,
init_custom_process_group
,
is_ampere_with_cuda_12_3
,
is_cuda
,
is_fa3_default_architecture
,
is_flashinfer_available
,
...
...
@@ -246,7 +247,7 @@ class ModelRunner:
if
not
self
.
use_mla_backend
:
# MHA architecture
if
(
is_hopper_with_cuda_12_3
()
(
is_ampere_with_cuda_12_3
()
or
is_hopper_with_cuda_12_3
()
)
and
is_no_spec_infer_or_topk_one
(
server_args
)
and
is_fa3_default_architecture
(
self
.
model_config
.
hf_config
)
):
...
...
@@ -927,8 +928,10 @@ class ModelRunner:
self
.
attn_backend
=
FlashMLABackend
(
self
)
elif
self
.
server_args
.
attention_backend
==
"fa3"
:
assert
torch
.
cuda
.
get_device_capability
()[
0
]
>=
9
,
(
"FlashAttention v3 Backend requires SM>=90. "
assert
(
torch
.
cuda
.
get_device_capability
()[
0
]
==
8
and
not
self
.
use_mla_backend
)
or
torch
.
cuda
.
get_device_capability
()[
0
]
==
9
,
(
"FlashAttention v3 Backend requires SM>=80 and SM<=90. "
"Please use `--attention-backend flashinfer`."
)
from
sglang.srt.layers.attention.flashattention_backend
import
(
...
...
python/sglang/srt/utils.py
View file @
9a6ad891
...
...
@@ -1905,13 +1905,16 @@ def fast_topk(values, topk, dim):
return
torch
.
topk
(
values
,
topk
,
dim
=
dim
)
def
is_hopper_with_cuda_12_3
(
):
def
_check
(
cc_major
):
if
not
is_cuda
():
return
False
is_hopper
=
torch
.
cuda
.
get_device_capability
()[
0
]
==
9
cuda_version
=
torch
.
version
.
cuda
.
split
(
"."
)
is_cuda_compatible
=
int
(
cuda_version
[
0
])
==
12
and
int
(
cuda_version
[
1
])
>=
3
return
is_hopper
and
is_cuda_compatible
return
torch
.
cuda
.
get_device_capability
()[
0
]
==
cc_major
and
tuple
(
map
(
int
,
torch
.
version
.
cuda
.
split
(
"."
)[:
2
])
)
>=
(
12
,
3
)
is_ampere_with_cuda_12_3
=
lambda
:
_check
(
8
)
is_hopper_with_cuda_12_3
=
lambda
:
_check
(
9
)
def
get_free_port
():
...
...
scripts/ci_install_dependency.sh
View file @
9a6ad891
...
...
@@ -16,7 +16,7 @@ rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*
pip
install
--upgrade
pip
# Install sgl-kernel
pip
install
sgl-kernel
==
0.1.
0
--no-cache-dir
pip
install
sgl-kernel
==
0.1.
1
--no-cache-dir
# Install the main package
pip
install
-e
"python[all]"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment