Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
286e6540
Unverified
Commit
286e6540
authored
Mar 05, 2025
by
Lianmin Zheng
Committed by
GitHub
Mar 05, 2025
Browse files
Remove prefill-only-one-req (#4117)
parent
718c391f
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
4 additions
and
17 deletions
+4
-17
python/sglang/srt/managers/scheduler.py
python/sglang/srt/managers/scheduler.py
+0
-4
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+3
-12
test/srt/test_bench_serving.py
test/srt/test_bench_serving.py
+1
-1
No files found.
python/sglang/srt/managers/scheduler.py
View file @
286e6540
...
...
@@ -274,10 +274,8 @@ class Scheduler:
target_worker
=
self
.
tp_worker
,
dp_rank
=
dp_rank
,
)
self
.
prefill_only_one_req
=
True
else
:
self
.
draft_worker
=
None
self
.
prefill_only_one_req
=
False
# Get token and memory info from the model worker
(
...
...
@@ -1077,8 +1075,6 @@ class Scheduler:
else
:
self
.
batch_is_full
=
True
break
if
self
.
prefill_only_one_req
:
break
# Update waiting queue
can_run_list
:
List
[
Req
]
=
adder
.
can_run_list
...
...
python/sglang/srt/server_args.py
View file @
286e6540
...
...
@@ -71,7 +71,6 @@ class ServerArgs:
schedule_policy
:
str
=
"fcfs"
schedule_conservativeness
:
float
=
1.0
cpu_offload_gb
:
int
=
0
prefill_only_one_req
:
bool
=
False
# Other runtime options
tp_size
:
int
=
1
...
...
@@ -277,19 +276,17 @@ class ServerArgs:
self
.
speculative_algorithm
=
"EAGLE"
if
self
.
speculative_algorithm
==
"EAGLE"
:
self
.
disable_overlap_schedule
=
True
self
.
prefill_only_one_req
=
True
self
.
disable_cuda_graph_padding
=
True
if
self
.
max_running_requests
is
None
:
self
.
max_running_requests
=
32
self
.
disable_overlap_schedule
=
True
self
.
disable_cuda_graph_padding
=
True
logger
.
info
(
"Overlap scheduler are disabled because of using "
"eagle speculative decoding."
"Max running request set to 32 because of using eagle speculative decoding."
)
# The token generated from the verify step is counted.
# If speculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
assert
self
.
speculative_num_steps
<
self
.
speculative_num_draft_tokens
# assert self.speculative_num_steps < self.speculative_num_draft_tokens
# GGUF
if
(
...
...
@@ -509,12 +506,6 @@ class ServerArgs:
default
=
ServerArgs
.
cpu_offload_gb
,
help
=
"How many GBs of RAM to reserve for CPU offloading"
,
)
parser
.
add_argument
(
"--prefill-only-one-req"
,
type
=
bool
,
help
=
"If true, we only prefill one request at one prefill batch"
,
default
=
ServerArgs
.
prefill_only_one_req
,
)
# Other runtime options
parser
.
add_argument
(
...
...
test/srt/test_bench_serving.py
View file @
286e6540
...
...
@@ -166,7 +166,7 @@ class TestBenchServing(unittest.TestCase):
f
'accept_length :
{
res
[
"accept_length"
]:.
2
f
}
\n
'
)
self
.
assertLess
(
res
[
"median_e2e_latency_ms"
],
1100
)
self
.
assertGreater
(
res
[
"accept_length"
],
3.0
)
self
.
assertGreater
(
res
[
"accept_length"
],
2.99
)
def
test_moe_offline_throughput_default
(
self
):
res
=
run_bench_serving
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment