sglang · Commits · 55e03b10

Unverified commit 55e03b10, authored Jun 23, 2025 by Lianmin Zheng, committed via GitHub on Jun 23, 2025. Parent: 8aa68ed5.

Fix a bug in BatchTokenIDOut & Misc style and dependency updates (#7457)
Showing 9 changed files with 37 additions and 32 deletions (+37, -32):

- .github/workflows/pr-test.yml (+5, -1)
- python/pyproject.toml (+6, -8)
- python/sglang/srt/managers/schedule_batch.py (+1, -0)
- python/sglang/srt/managers/scheduler.py (+8, -1)
- python/sglang/srt/managers/tokenizer_manager.py (+1, -1)
- python/sglang/srt/server_args.py (+2, -3)
- python/sglang/srt/utils.py (+3, -7)
- sgl-kernel/CMakeLists.txt (+10, -10)
- sgl-kernel/python/sgl_kernel/sampling.py (+1, -1)
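In brief: the headline fix is in TokenizerManager, which now snapshots output_ids with .copy() before placing them in the response dict, so later in-place extends can no longer mutate token IDs already handed to the caller. The remaining changes are housekeeping: the per-commit 8-GPU CI job is split across an auto-partitioned two-job matrix, the outlines dependency is pinned once in runtime_common instead of per backend, an aborted Req now disables return_logprob, the scheduler's req_to_token_pool leak check accounts for pre-allocated slots in disaggregation decode mode, rank0_log is folded into rank0_print in utils.py, the DeepGEMM repo selection in sgl-kernel's CMake moves next to its FetchContent_Declare, and an unused get_cuda_stream import is removed from sampling.py.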
.github/workflows/pr-test.yml (view file @ 55e03b10)

@@ -113,6 +113,10 @@ jobs:
       github.event.pull_request.draft == false
     needs: [unit-test-frontend, unit-test-backend-2-gpu]
     runs-on: 8-gpu-runner
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0, 1]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -125,7 +129,7 @@ jobs:
         timeout-minutes: 20
         run: |
           cd test/srt
-          python3 run_suite.py --suite per-commit-8-gpu
+          python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2

   performance-test-1-gpu-part-1:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
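The 8-GPU suite now runs as a two-job matrix (part: [0, 1]), with each job taking half of the suite via --auto-partition-id and --auto-partition-size. The diff does not show how run_suite.py implements the split; the sketch below assumes a simple deterministic round-robin, which is one plausible reading of the flags.

```python
# Hypothetical sketch of --auto-partition-id / --auto-partition-size;
# run_suite.py's real partitioning logic is not shown in this diff.
def auto_partition(tests: list, part_id: int, part_size: int) -> list:
    # Every job sorts the same list, then takes every part_size-th entry,
    # so the union of all jobs covers the suite exactly once.
    return [t for i, t in enumerate(sorted(tests)) if i % part_size == part_id]

suite = ["test_a.py", "test_b.py", "test_c.py", "test_d.py"]
print(auto_partition(suite, 0, 2))  # ['test_a.py', 'test_c.py']
print(auto_partition(suite, 1, 2))  # ['test_b.py', 'test_d.py']
```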
python/pyproject.toml (view file @ 55e03b10)

@@ -29,6 +29,7 @@ runtime_common = [
     "msgspec",
     "ninja",
     "orjson",
+    "outlines==0.1.11",
     "packaging",
     "partial_json_parser",
     "pillow",
@@ -50,13 +51,12 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
     "sgl-kernel==0.1.9",
+    "flashinfer_python==0.2.6.post1",
     "torch==2.7.1",
     "torchaudio==2.7.1",
     "torchvision==0.22.1",
     "cuda-python",
-    "outlines>=0.0.44,<=0.1.11",
     "einops",
-    "flashinfer_python==0.2.6.post1",
 ]

 blackwell = [
@@ -66,7 +66,6 @@ blackwell = [
     "torchaudio==2.7.1",
     "torchvision==0.22.1",
     "cuda-python",
-    "outlines>=0.0.44,<=0.1.11",
     "einops",
     "flashinfer_python==0.2.6.post1",
 ]
@@ -77,23 +76,22 @@ srt_hip = [
     "sglang[runtime_common]",
     "torch",
     "vllm==0.6.7.dev2",
-    "outlines==0.1.11"
 ]

 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
-srt_xpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+srt_xpu = ["sglang[runtime_common]"]
 # For Intel Gaudi(device : hpu) follow the installation guide
 # https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
-srt_hpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+srt_hpu = ["sglang[runtime_common]"]
 # CPU: currently, there are no pre-built vllm wheels for CPU.
 # To install vllm for CPU, please follow the instruction here:
 # https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html
-srt_cpu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11", "einops"]
+srt_cpu = ["sglang[runtime_common]", "einops"]
 # https://vllm-ascend.readthedocs.io/en/latest/installation.html
-srt_npu = ["sglang[runtime_common]", "outlines>=0.0.44,<=0.1.11"]
+srt_npu = ["sglang[runtime_common]"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
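The net effect on dependencies: outlines is now pinned exactly once, as outlines==0.1.11 in runtime_common, replacing the looser >=0.0.44,<=0.1.11 ranges that were repeated in each backend extra. A quick check of what the tightened specifier accepts, using the packaging library (not part of this diff):

```python
from packaging.specifiers import SpecifierSet

old_range = SpecifierSet(">=0.0.44,<=0.1.11")  # former per-backend constraint
new_pin = SpecifierSet("==0.1.11")             # new runtime_common pin

print("0.1.11" in old_range, "0.1.11" in new_pin)  # True True
print("0.1.10" in old_range, "0.1.10" in new_pin)  # True False
```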
python/sglang/srt/managers/schedule_batch.py (view file @ 55e03b10)

@@ -788,6 +788,7 @@ class Req:
         self.multimodal_inputs = None
         self.grammar = None
         self.origin_input_ids = [0]  # set it to one token to skip the long prefill
+        self.return_logprob = False
         self.finished_reason = FINISH_ABORT(
             error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
         )
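Context for the one added line: this branch of Req handles aborting a bad request, and it already replaces the prompt with a single dummy token to skip the long prefill. Setting return_logprob = False alongside that keeps downstream code from trying to report logprobs for the placeholder input. A toy illustration (FakeReq is a hypothetical stand-in, not sglang's Req):

```python
from http import HTTPStatus

class FakeReq:
    """Hypothetical stand-in for sglang's Req showing the abort pattern."""

    def abort(self, error_msg: str) -> None:
        self.origin_input_ids = [0]   # one dummy token: skip the long prefill
        self.return_logprob = False   # no meaningful logprobs for a dummy prompt
        self.finished_reason = (HTTPStatus.BAD_REQUEST, "BadRequestError", error_msg)

req = FakeReq()
req.abort("input exceeds the model's context length")
print(req.return_logprob)  # False
```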
python/sglang/srt/managers/scheduler.py (view file @ 55e03b10)

@@ -1374,7 +1374,14 @@ class Scheduler(
             )
             raise ValueError(msg)

-        if len(self.req_to_token_pool.free_slots) != self.req_to_token_pool.size:
+        if self.disaggregation_mode == DisaggregationMode.DECODE:
+            req_total_size = (
+                self.req_to_token_pool.size + self.req_to_token_pool.pre_alloc_size
+            )
+        else:
+            req_total_size = self.req_to_token_pool.size
+
+        if len(self.req_to_token_pool.free_slots) != req_total_size:
             msg = (
                 "req_to_token_pool memory leak detected!"
                 f"available_size={len(self.req_to_token_pool.free_slots)}, "
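Before this change the check compared free slots against req_to_token_pool.size alone, which can fire spuriously in prefill/decode disaggregation: a decode worker pre-allocates slots for requests arriving from the prefill worker, so its expected idle total is size + pre_alloc_size. A minimal sketch of the corrected invariant (FakePool is a stand-in, not sglang's pool class):

```python
class FakePool:
    """Stand-in for req_to_token_pool, with decode-side pre-allocation."""

    def __init__(self, size: int, pre_alloc_size: int = 0):
        self.size = size
        self.pre_alloc_size = pre_alloc_size
        # When fully idle, every slot (including pre-allocated ones) is free.
        self.free_slots = list(range(size + pre_alloc_size))

def assert_no_leak(pool: FakePool, decode_mode: bool) -> None:
    expected = pool.size + (pool.pre_alloc_size if decode_mode else 0)
    if len(pool.free_slots) != expected:
        raise ValueError("req_to_token_pool memory leak detected!")

assert_no_leak(FakePool(8), decode_mode=False)                   # ok
assert_no_leak(FakePool(8, pre_alloc_size=2), decode_mode=True)  # ok, no false alarm
```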
python/sglang/srt/managers/tokenizer_manager.py (view file @ 55e03b10)

@@ -1226,7 +1226,7 @@ class TokenizerManager:
                 state.last_output_offset = len(state.output_ids)
             else:
                 state.output_ids.extend(recv_obj.output_ids[i])
-                output_token_ids = state.output_ids
+                output_token_ids = state.output_ids.copy()

             out_dict = {
                 "output_ids": output_token_ids,
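This is the BatchTokenIDOut bug named in the commit title. Without .copy(), the "output_ids" entry in out_dict aliases the live state.output_ids list, so when the next batch arrives and extend() runs again, token IDs already returned to the caller change underneath it. The aliasing is easy to reproduce with plain lists:

```python
# Reproducing the aliasing bug with plain lists (no sglang needed).
state_output_ids = [101, 102]

aliased = state_output_ids          # old behavior: shares the same list object
snapshot = state_output_ids.copy()  # fixed behavior: independent snapshot

state_output_ids.extend([103, 104])  # the next batch of tokens arrives

print(aliased)   # [101, 102, 103, 104]  <- earlier output silently mutated
print(snapshot)  # [101, 102]            <- stable, as the caller expects
```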
python/sglang/srt/server_args.py (view file @ 55e03b10)

@@ -1723,9 +1723,8 @@ class PortArgs:
         dist_init_host, dist_init_port = dist_init_addr
         port_base = int(dist_init_port) + 1
         if dp_rank is None:
-            scheduler_input_port = (
-                port_base + 3
-            )  # TokenizerManager to DataParallelController
+            # TokenizerManager to DataParallelController
+            scheduler_input_port = port_base + 3
         else:
             scheduler_input_port = port_base + 3 + 1 + dp_rank
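The change here is purely stylistic (the comment moves above the assignment), but the port arithmetic it documents is worth restating: the scheduler input port sits four above dist_init_port, and with data parallelism each dp_rank is shifted one further. A sketch of the layout implied by the diff:

```python
from typing import Optional

def scheduler_input_port(dist_init_port: int, dp_rank: Optional[int] = None) -> int:
    port_base = int(dist_init_port) + 1
    if dp_rank is None:
        # TokenizerManager to DataParallelController
        return port_base + 3
    return port_base + 3 + 1 + dp_rank

print(scheduler_input_port(5000))             # 5004
print(scheduler_input_port(5000, dp_rank=2))  # 5007
```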
python/sglang/srt/utils.py (view file @ 55e03b10)

@@ -1917,13 +1917,6 @@ def configure_ipv6(dist_init_addr):
     return port, host


-def rank0_log(msg: str):
-    from sglang.srt.distributed import get_tensor_model_parallel_rank
-
-    if get_tensor_model_parallel_rank() == 0:
-        logger.info(msg)
-
-
 def rank0_print(msg: str):
     from sglang.srt.distributed import get_tensor_model_parallel_rank

@@ -1931,6 +1924,9 @@ def rank0_print(msg: str):
         print(msg, flush=True)


+rank0_log = rank0_print
+
+
 def get_cuda_version():
     if torch.version.cuda:
         return tuple(map(int, torch.version.cuda.split(".")))
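rank0_log and rank0_print were duplicate helpers; the commit deletes one body and keeps the old name as an alias, so existing imports of rank0_log continue to work. The pattern in isolation (a sketch; the real function gates on get_tensor_model_parallel_rank() == 0 via a deferred import):

```python
def rank0_print(msg: str) -> None:
    # In sglang this prints only when get_tensor_model_parallel_rank() == 0.
    print(msg, flush=True)

# Alias rather than a second copy: old call sites stay valid.
rank0_log = rank0_print

print(rank0_log is rank0_print)  # True
rank0_log("hello from rank 0")
```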
sgl-kernel/CMakeLists.txt (view file @ 55e03b10)

@@ -39,14 +39,6 @@ find_package(Torch REQUIRED)
 # clean Torch Flag
 clear_cuda_arches(CMAKE_FLAG)

-if ("${CUDA_VERSION}" VERSION_EQUAL "12.8")
-  set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
-  set(DeepGEMM_TAG "blackwell")
-else()
-  set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
-  set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
-endif()
-
 include(FetchContent)

 # cutlass
@@ -57,7 +49,16 @@ FetchContent_Declare(
   GIT_SHALLOW OFF
 )
 FetchContent_Populate(repo-cutlass)

 # DeepGEMM
+if ("${CUDA_VERSION}" VERSION_EQUAL "12.8")
+  set(DeepGEMM_REPO "https://github.com/sgl-project/DeepGEMM")
+  set(DeepGEMM_TAG "blackwell")
+else()
+  set(DeepGEMM_REPO "https://github.com/deepseek-ai/DeepGEMM")
+  set(DeepGEMM_TAG "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")
+endif()
 FetchContent_Declare(
   repo-deepgemm
   GIT_REPOSITORY ${DeepGEMM_REPO}
@@ -107,7 +108,6 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
     message(STATUS "For aarch64, disable gencode below SM90 by default")
 endif()
-
 include_directories(
   ${PROJECT_SOURCE_DIR}/include
   ${PROJECT_SOURCE_DIR}/csrc
@@ -247,8 +247,8 @@ set(SOURCES
   "csrc/moe/ep_moe_reorder_kernel.cu"
   "csrc/moe/ep_moe_silu_and_mul_kernel.cu"
   "csrc/speculative/eagle_utils.cu"
-  "csrc/speculative/speculative_sampling.cu"
   "csrc/speculative/packbit.cu"
+  "csrc/speculative/speculative_sampling.cu"
   "csrc/grammar/apply_token_bitmask_inplace_cuda.cu"
   "csrc/common_extension.cc"
   "${repo-flashinfer_SOURCE_DIR}/csrc/norm.cu"
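Both CMake edits are moves rather than behavior changes: the CUDA-version branch that chooses the DeepGEMM repository and tag now sits directly above the FetchContent_Declare that consumes DeepGEMM_REPO and DeepGEMM_TAG, and the speculative source files are re-sorted alphabetically. For clarity, the selection logic restated as a Python sketch (simplified: CMake's VERSION_EQUAL does true version comparison, not string equality):

```python
def pick_deepgemm(cuda_version: str) -> tuple:
    """Mirror of the CMake branch: CUDA 12.8 builds use the Blackwell fork."""
    if cuda_version == "12.8":  # simplification of VERSION_EQUAL "12.8"
        return ("https://github.com/sgl-project/DeepGEMM", "blackwell")
    return ("https://github.com/deepseek-ai/DeepGEMM",
            "8dfa3298274bfe6b242f6f8a3e6f3eff2707dd9f")

print(pick_deepgemm("12.8")[1])      # blackwell
print(pick_deepgemm("12.4")[1][:8])  # 8dfa3298
```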
sgl-kernel/python/sgl_kernel/sampling.py (view file @ 55e03b10)

 from typing import Optional, Union

 import torch

-from sgl_kernel.utils import _to_tensor_scalar_tuple, get_cuda_stream
+from sgl_kernel.utils import _to_tensor_scalar_tuple


 def _top_k_renorm_probs_internal(
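get_cuda_stream is evidently no longer referenced in sampling.py, so the import keeps only the used helper, _to_tensor_scalar_tuple.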