Commit e35a93fa (unverified)
Authored Mar 12, 2025 by Lianmin Zheng, committed by GitHub on Mar 12, 2025

Move output processing logic from scheduler.py into a separate file (#4354)

Parent: 2c3656f2
Showing 6 changed files with 634 additions and 609 deletions.
- python/sglang/srt/layers/sampler.py (+1, −1)
- python/sglang/srt/managers/schedule_batch.py (+0, −22)
- python/sglang/srt/managers/scheduler.py (+4, −576)
- python/sglang/srt/managers/scheduler_output_processor_mixin.py (+602, −0)
- python/sglang/srt/model_executor/model_runner.py (+15, −4)
- python/sglang/srt/server_args.py (+12, −6)
python/sglang/srt/layers/sampler.py (view file @ e35a93fa)

 import logging
-from typing import List, Optional
+from typing import List
 
 import torch
 import torch.distributed as dist
 ...
python/sglang/srt/managers/schedule_batch.py (view file @ e35a93fa)

 ...
@@ -441,28 +441,6 @@ class Req:
         all_ids = self.origin_input_ids_unpadded + self.output_ids
         return all_ids[self.surr_offset :], self.read_offset - self.surr_offset
 
-    def get_next_inc_detokenization(self):
-        if self.tokenizer is None:
-            return False, ""
-        read_ids, read_offset = self.init_incremental_detokenize()
-        surr_ids = read_ids[:read_offset]
-
-        surr_text = self.tokenizer.decode(
-            surr_ids,
-            skip_special_tokens=self.sampling_params.skip_special_tokens,
-            spaces_between_special_tokens=self.sampling_params.spaces_between_special_tokens,
-        )
-        new_text = self.tokenizer.decode(
-            read_ids,
-            skip_special_tokens=self.sampling_params.skip_special_tokens,
-            spaces_between_special_tokens=self.sampling_params.spaces_between_special_tokens,
-        )
-
-        if len(new_text) > len(surr_text) and not new_text.endswith("�"):
-            return True, new_text[len(surr_text) :]
-
-        return False, ""
-
     def check_finished(self):
         if self.finished():
             return
 ...
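The removed get_next_inc_detokenization above is the incremental-detokenization helper: it decodes a small window of "surrounding" tokens plus the newest ones and only emits the new suffix once the decoded text no longer ends with the Unicode replacement character "�" (i.e. no longer stops in the middle of a multi-byte character). Below is a self-contained sketch of that same guard, using a toy byte-level decoder instead of sglang's tokenizer; the tokenizer, offsets, and sampling parameters from the diff are not reproduced here.

# Standalone sketch of the incremental-detokenization guard used by the removed
# get_next_inc_detokenization: decode a window that includes the previous
# "surrounding" tokens, and only emit new text once it no longer ends in the
# Unicode replacement character. The toy byte-level "tokenizer" below is an
# assumption for illustration; it is not sglang's tokenizer.


def toy_decode(token_bytes: list[bytes]) -> str:
    """Decode a list of byte-level 'tokens'; incomplete UTF-8 tails become U+FFFD."""
    return b"".join(token_bytes).decode("utf-8", errors="replace")


def next_incremental_text(read_ids: list[bytes], read_offset: int) -> tuple[bool, str]:
    surr_text = toy_decode(read_ids[:read_offset])
    new_text = toy_decode(read_ids)
    if len(new_text) > len(surr_text) and not new_text.endswith("\ufffd"):
        return True, new_text[len(surr_text):]
    return False, ""


# "é" is two UTF-8 bytes; after the first byte the decode ends in "�", so
# nothing is emitted until the second byte arrives.
tokens = [b"caf", b"\xc3"]
print(next_incremental_text(tokens, 1))  # (False, '')
tokens.append(b"\xa9")
print(next_incremental_text(tokens, 1))  # (True, 'é')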
python/sglang/srt/managers/scheduler.py (view file @ e35a93fa)

This diff is collapsed (+4, −576).
python/sglang/srt/managers/scheduler_output_processor_mixin.py (new file, mode 100644; view file @ e35a93fa)

This diff is collapsed (+602, −0).
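Both the scheduler.py diff and the new file above are collapsed, so here is a minimal sketch of the mixin pattern the commit title describes: the output-processing methods move into a separate class, and Scheduler picks them up by inheritance. All class internals, method names, and bodies below are illustrative assumptions, not the contents of the collapsed diffs.

# Illustrative sketch only: the collapsed diffs are not expanded here, so the
# method names, signatures, and bodies below are assumptions about the general
# pattern, not the actual sglang code. The technique itself is a plain Python
# mixin: output-processing methods live in one class, and the scheduler inherits
# them instead of defining several hundred lines of them inline.


class SchedulerOutputProcessorMixin:
    """Holds the output-processing logic moved out of scheduler.py."""

    def process_batch_result(self, batch, result):
        # Hypothetical: per-request decoding / streaming / finish bookkeeping
        # would live here rather than inside Scheduler itself.
        return [f"processed {req} -> {result}" for req in batch]


class Scheduler(SchedulerOutputProcessorMixin):
    """Keeps the scheduling loop; output handling comes from the mixin."""

    def run_one_step(self):
        # Hypothetical stand-in for running a forward pass on a batch.
        batch = ["req-0", "req-1"]
        result = "logits"
        # Inherited from SchedulerOutputProcessorMixin:
        return self.process_batch_result(batch, result)


if __name__ == "__main__":
    print(Scheduler().run_one_step())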
python/sglang/srt/model_executor/model_runner.py (view file @ e35a93fa)

 ...
@@ -82,7 +82,6 @@ from sglang.srt.utils import (
 logger = logging.getLogger(__name__)
 
 SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
 UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300
 ...
@@ -119,6 +118,7 @@ class ModelRunner:
         self.spec_algorithm = SpeculativeAlgorithm.from_string(
             server_args.speculative_algorithm
         )
+        self.page_size = server_args.page_size
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
 
 ...
@@ -161,6 +161,11 @@ class ModelRunner:
         # Get memory before model loading
         min_per_gpu_memory = self.init_torch_distributed()
 
+        # If it is a draft model tp_group can be different.
+        self.initialize(min_per_gpu_memory)
+
+    def initialize(self, min_per_gpu_memory: float):
+        server_args = self.server_args
         self.memory_saver_adapter = TorchMemorySaverAdapter.create(
             enable=self.server_args.enable_memory_saver
         )
 ...
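The last hunk above splits ModelRunner's start-up into two phases: init_torch_distributed() returns the measured per-GPU memory, and a new initialize(min_per_gpu_memory) does the rest, with the in-diff comment suggesting this is so a draft model (whose tp_group can be different) can drive the second phase itself. A minimal sketch of that two-phase pattern follows; apart from the names initialize and init_torch_distributed, everything here (including the skip_distributed_init flag and the dummy values) is an illustrative assumption, not sglang's real API.

# Minimal sketch of the two-phase initialization introduced above, assuming the
# intent stated in the diff comment ("If it is a draft model tp_group can be
# different."). Names other than initialize / init_torch_distributed are
# illustrative, not sglang's real API.


class TinyModelRunner:
    def __init__(self, skip_distributed_init: bool = False):
        if skip_distributed_init:
            # A draft-model runner could set up its own (different) tp_group
            # elsewhere and then call initialize() directly.
            return
        min_per_gpu_memory = self.init_torch_distributed()
        self.initialize(min_per_gpu_memory)

    def init_torch_distributed(self) -> float:
        # Phase 1: process groups, NCCL, etc. Here just a dummy memory value (GB).
        return 40.0

    def initialize(self, min_per_gpu_memory: float) -> None:
        # Phase 2: memory-saver adapter, model loading, KV-cache sizing, ...
        self.min_per_gpu_memory = min_per_gpu_memory


runner = TinyModelRunner(skip_distributed_init=True)
runner.initialize(min_per_gpu_memory=24.0)  # reuse phase 2 with a custom value
print(runner.min_per_gpu_memory)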
@@ -300,15 +305,16 @@ class ModelRunner:
         min_per_gpu_memory = get_available_gpu_memory(
             self.device, self.gpu_id, distributed=self.tp_size > 1
         )
-        local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
         self.tp_group = get_tp_group()
         self.attention_tp_group = get_attention_tp_group()
 
         # Check memory for tensor parallelism
+        local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
         if self.tp_size > 1:
             if min_per_gpu_memory < local_gpu_memory * 0.9:
                 raise ValueError(
-                    "The memory capacity is unbalanced. Some GPUs may be occupied by other processes."
+                    "The memory capacity is unbalanced. Some GPUs may be occupied by other processes. "
+                    f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}"
                 )
 
         logger.info(
 ...
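The hunk above keeps the existing balance check (only run when tp_size > 1) and extends the error message with the measured values. The check itself is simple: the minimum available memory across tensor-parallel ranks must be at least 90% of this rank's locally available memory. The same arithmetic, pulled out below with made-up numbers so the threshold is visible.

# The same 90% balance check as in the diff, isolated for illustration.
# The memory numbers are made up (units: GB).

def check_memory_balance(min_per_gpu_memory: float, local_gpu_memory: float) -> None:
    if min_per_gpu_memory < local_gpu_memory * 0.9:
        raise ValueError(
            "The memory capacity is unbalanced. Some GPUs may be occupied by other processes. "
            f"{min_per_gpu_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}"
        )

check_memory_balance(72.0, 78.0)      # 72 >= 70.2 -> ok
try:
    check_memory_balance(60.0, 78.0)  # 60 < 70.2 -> raises
except ValueError as e:
    print(e)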
@@ -698,6 +704,12 @@ class ModelRunner:
             )
             self.max_total_num_tokens = min(self.max_total_num_tokens, max_total_tokens)
 
+        self.max_total_num_tokens = (
+            self.max_total_num_tokens
+            // self.server_args.page_size
+            * self.server_args.page_size
+        )
+
         if self.max_total_num_tokens <= 0:
             raise RuntimeError(
                 "Not enough memory. Please try to increase --mem-fraction-static."
 ...
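The added block rounds max_total_num_tokens down to a multiple of page_size before the final sanity check; floor division followed by multiplication is the standard way to floor to a multiple. A worked example with made-up numbers:

# Worked example of the floor-to-multiple rounding added above.
# Numbers are made up; page_size defaults to 1, in which case nothing changes.

def round_down_to_page(max_total_num_tokens: int, page_size: int) -> int:
    return max_total_num_tokens // page_size * page_size

print(round_down_to_page(130_057, 16))  # 130048 (8128 full pages of 16 tokens)
print(round_down_to_page(130_057, 1))   # 130057 (page_size=1 is a no-op)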
@@ -783,7 +795,6 @@ class ModelRunner:
                 # Init streams
                 if self.server_args.speculative_algorithm == "EAGLE":
                     self.plan_stream_for_flashinfer = torch.cuda.Stream()
                 self.attn_backend = FlashInferAttnBackend(self)
         elif self.server_args.attention_backend == "triton":
             assert self.sliding_window_size is None, (
 ...
python/sglang/srt/server_args.py (view file @ e35a93fa)

 ...
@@ -20,14 +20,13 @@ import random
 import tempfile
 from typing import List, Optional
 
 import torch
 
 from sglang.srt.hf_transformers_utils import check_gguf_file
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     get_amdgpu_memory_capacity,
     get_hpu_memory_capacity,
     get_nvgpu_memory_capacity,
     is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
 ...
@@ -71,6 +70,7 @@ class ServerArgs:
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
+    page_size: int = 1
 
     # Other runtime options
     tp_size: int = 1
 ...
@@ -190,10 +190,10 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
-        if is_hip():
-            gpu_mem = get_amdgpu_memory_capacity()
-        elif torch.cuda.is_available():
+        if is_cuda():
             gpu_mem = get_nvgpu_memory_capacity()
+        elif is_hip():
+            gpu_mem = get_amdgpu_memory_capacity()
         elif self.device == "hpu":
             gpu_mem = get_hpu_memory_capacity()
         else:
 ...
@@ -258,7 +258,7 @@ class ServerArgs:
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
 
-        # Others
+        # Data parallelism attention
         if self.enable_dp_attention:
             self.dp_size = self.tp_size
             assert self.tp_size % self.dp_size == 0
 ...
@@ -507,6 +507,12 @@ class ServerArgs:
             default=ServerArgs.cpu_offload_gb,
             help="How many GBs of RAM to reserve for CPU offloading.",
         )
+        parser.add_argument(
+            "--page-size",
+            type=int,
+            default=ServerArgs.page_size,
+            help="The number of tokens in a page.",
+        )
 
         # Other runtime options
         parser.add_argument(
 ...
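The server_args.py changes add a page_size field with default 1 and a matching --page-size command-line flag ("The number of tokens in a page."). A standalone sketch of how that flag parses, mirroring the add_argument call above with a toy parser rather than sglang's real CLI entry point:

# Toy parser mirroring the --page-size option added in the diff. The default of
# 1 is taken from ServerArgs.page_size; everything else about how sglang wires
# its CLI is not reproduced here.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--page-size",
    type=int,
    default=1,  # ServerArgs.page_size
    help="The number of tokens in a page.",
)

print(parser.parse_args([]).page_size)                     # 1
print(parser.parse_args(["--page-size", "16"]).page_size)  # 16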