chore: upgrade sgl-kernel 0.1.1 (#5933)

Commit 9a6ad891 (unverified), authored Apr 30, 2025 by Yineng Zhang, committed via GitHub on Apr 30, 2025.
Parent: d353d08b
Showing 7 changed files with 20 additions and 14 deletions.
- .github/workflows/vllm-dependency-test.yml (+1, -1)
- python/pyproject.toml (+1, -1)
- python/sglang/srt/entrypoints/engine.py (+1, -1)
- python/sglang/srt/layers/quantization/__init__.py (+2, -2)
- python/sglang/srt/model_executor/model_runner.py (+6, -3)
- python/sglang/srt/utils.py (+8, -5)
- scripts/ci_install_dependency.sh (+1, -1)
.github/workflows/vllm-dependency-test.yml

@@ -30,7 +30,7 @@ jobs:
       - name: Install dependencies
         run: |
           bash scripts/ci_install_dependency.sh
-          pip install "vllm>=0.6.4.post1,<=0.7.2"
+          pip install "vllm==0.8.4"
           pip install "bitsandbytes>=0.44.0"
       - name: Run VLLM dependency tests
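The workflow moves from a tolerated range (`vllm>=0.6.4.post1,<=0.7.2`) to an exact pin. If the job should fail fast whenever dependency resolution drifts from that pin, a pre-flight check along these lines would do it; the check and the `EXPECTED_VLLM` constant are illustrative, not part of this commit:

    # Hypothetical pre-flight check for the dependency-test job (not part of
    # this commit). Fails fast if pip resolved a different vllm than the pin.
    from importlib.metadata import version

    EXPECTED_VLLM = "0.8.4"  # must track the pin in vllm-dependency-test.yml

    installed = version("vllm")
    assert installed == EXPECTED_VLLM, (
        f"vllm {installed} is installed, expected {EXPECTED_VLLM}; "
        "keep this check and the workflow pin in sync."
    )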
python/pyproject.toml

@@ -47,7 +47,7 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.1.0",
+    "sgl-kernel==0.1.1",
     "flashinfer_python==0.2.5",
     "torch==2.6.0",
     "torchvision==0.21.0",
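This pin has to agree with the runtime assertion in engine.py below. As a sanity check, the pinned version can be read back out of the file; this sketch assumes Python 3.11+ (for stdlib tomllib) and the `[project.optional-dependencies]` layout that the hunk suggests:

    # Sketch: read the sgl-kernel pin out of python/pyproject.toml so it can
    # be cross-checked against the version asserted at runtime. Assumes a
    # [project.optional-dependencies] table holding the srt extra.
    import tomllib

    with open("python/pyproject.toml", "rb") as f:
        config = tomllib.load(f)

    srt_deps = config["project"]["optional-dependencies"]["srt"]
    pin = next(dep for dep in srt_deps if dep.startswith("sgl-kernel"))
    print(pin)  # expected: sgl-kernel==0.1.1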
python/sglang/srt/entrypoints/engine.py

@@ -461,7 +461,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.0",
+            "0.1.1",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
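`assert_pkg_version` is imported from sglang.srt.utils; its implementation is not shown in this diff. A minimal sketch of what such a guard plausibly does, using stdlib importlib.metadata plus the third-party packaging library (the body is an assumption, not the project's code):

    # Minimal sketch of a guard like assert_pkg_version; the real helper
    # lives in sglang.srt.utils and may differ in details.
    from importlib.metadata import PackageNotFoundError, version
    from packaging.version import Version

    def assert_pkg_version_sketch(pkg: str, min_ver: str, hint: str) -> None:
        """Raise if `pkg` is missing or older than `min_ver`."""
        try:
            installed = version(pkg)
        except PackageNotFoundError:
            raise RuntimeError(f"{pkg} is not installed. {hint}")
        if Version(installed) < Version(min_ver):
            raise RuntimeError(f"{pkg}=={installed} < {min_ver}. {hint}")

    assert_pkg_version_sketch(
        "sgl-kernel",
        "0.1.1",
        "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
    )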
python/sglang/srt/layers/quantization/__init__.py

@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "Pleaes install vllm by `pip install vllm==0.7.2`"
+            "Pleaes install vllm by `pip install vllm==0.8.4`"
         )
     return QUANTIZATION_METHODS[quantization]

@@ -310,7 +310,7 @@ def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
         if correction_bias is not None:
             if not has_correction_bias:
                 raise ValueError(
-                    "Please increase the version of your vllm. Try `pip install vllm==0.7.2`"
+                    "Please increase the version of your vllm. Try `pip install vllm==0.8.4`"
                 )
             kwargs["e_score_correction_bias"] = correction_bias
         return original_apply(**kwargs)
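Both error messages now point users at the vllm version the rest of this commit targets. The first guard hinges on a VLLM_AVAILABLE flag whose definition is outside this hunk; a common way to derive such a flag is an import probe, sketched below as an assumption about this module, not a quote from it:

    # Sketch of how a flag like VLLM_AVAILABLE is commonly computed; the
    # actual definition in sglang.srt.layers.quantization may differ.
    import importlib.util

    VLLM_AVAILABLE = importlib.util.find_spec("vllm") is not None

With a probe like this, the ValueError above fires only for quantization methods that genuinely need vllm operators while the package is absent.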
python/sglang/srt/model_executor/model_runner.py

@@ -79,6 +79,7 @@ from sglang.srt.utils import (
     get_available_gpu_memory,
     get_bool_env_var,
     init_custom_process_group,
+    is_ampere_with_cuda_12_3,
     is_cuda,
     is_fa3_default_architecture,
     is_flashinfer_available,

@@ -246,7 +247,7 @@ class ModelRunner:
         if not self.use_mla_backend:
             # MHA architecture
             if (
-                is_hopper_with_cuda_12_3()
+                (is_ampere_with_cuda_12_3() or is_hopper_with_cuda_12_3())
                 and is_no_spec_infer_or_topk_one(server_args)
                 and is_fa3_default_architecture(self.model_config.hf_config)
             ):

@@ -927,8 +928,10 @@ class ModelRunner:
             self.attn_backend = FlashMLABackend(self)
         elif self.server_args.attention_backend == "fa3":
-            assert torch.cuda.get_device_capability()[0] >= 9, (
-                "FlashAttention v3 Backend requires SM>=90. "
+            assert (
+                torch.cuda.get_device_capability()[0] == 8 and not self.use_mla_backend
+            ) or torch.cuda.get_device_capability()[0] == 9, (
+                "FlashAttention v3 Backend requires SM>=80 and SM<=90. "
                 "Please use `--attention-backend flashinfer`."
             )
             from sglang.srt.layers.attention.flashattention_backend import (
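`torch.cuda.get_device_capability()` returns a `(major, minor)` tuple: A100-class Ampere parts report `(8, 0)` and H100-class Hopper parts report `(9, 0)`. The relaxed assertion therefore admits FlashAttention v3 on Ampere, but only for MHA models (`not self.use_mla_backend`), while Hopper keeps unconditional support. Restated as a standalone predicate for readability:

    # The FA3 gate from the hunk above, restated as one predicate.
    # SM major 8 is Ampere (A100 reports (8, 0)); major 9 is Hopper ((9, 0)).
    import torch

    def fa3_supported(use_mla_backend: bool) -> bool:
        major = torch.cuda.get_device_capability()[0]
        # Hopper: always allowed. Ampere: only for MHA (non-MLA) models.
        return major == 9 or (major == 8 and not use_mla_backend)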
python/sglang/srt/utils.py

@@ -1905,13 +1905,16 @@ def fast_topk(values, topk, dim):
     return torch.topk(values, topk, dim=dim)


-def is_hopper_with_cuda_12_3():
+def _check(cc_major):
     if not is_cuda():
         return False
-    is_hopper = torch.cuda.get_device_capability()[0] == 9
-    cuda_version = torch.version.cuda.split(".")
-    is_cuda_compatible = int(cuda_version[0]) == 12 and int(cuda_version[1]) >= 3
-    return is_hopper and is_cuda_compatible
+    return torch.cuda.get_device_capability()[0] == cc_major and tuple(
+        map(int, torch.version.cuda.split(".")[:2])
+    ) >= (12, 3)
+
+
+is_ampere_with_cuda_12_3 = lambda: _check(8)
+is_hopper_with_cuda_12_3 = lambda: _check(9)


 def get_free_port():
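The refactor folds the Hopper-only check into a parameterized `_check(cc_major)` and swaps per-field integer comparisons for a single tuple comparison. Tuples compare element-wise as integers, which stays correct for two-digit minor versions where naive string comparison goes wrong. Note the semantics also widen slightly: the old check required major == 12 exactly, while the tuple form accepts any later major (e.g. a hypothetical CUDA 13.0).

    # Why the tuple comparison works: it compares element-wise as integers,
    # so CUDA "12.10" correctly clears the (12, 3) floor, while a plain
    # string comparison would rank it below "12.3".
    assert tuple(map(int, "12.10".split(".")[:2])) >= (12, 3)  # (12, 10) >= (12, 3)
    assert not ("12.10" >= "12.3")  # lexicographic: "1" < "3" at index 3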
scripts/ci_install_dependency.sh

@@ -16,7 +16,7 @@ rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*
 pip install --upgrade pip

 # Install sgl-kernel
-pip install sgl-kernel==0.1.0 --no-cache-dir
+pip install sgl-kernel==0.1.1 --no-cache-dir

 # Install the main package
 pip install -e "python[all]"