Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
f7102fbd
Unverified
Commit
f7102fbd
authored
Oct 30, 2024
by
Lianmin Zheng
Committed by
GitHub
Oct 30, 2024
Browse files
Fix mixed chunked prefill (#1850)
parent
a7a0a688
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
80 additions
and
23 deletions
+80
-23
python/sglang/srt/managers/scheduler.py
python/sglang/srt/managers/scheduler.py
+5
-3
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+68
-20
test/srt/test_chunked_prefill.py
test/srt/test_chunked_prefill.py
+7
-0
No files found.
python/sglang/srt/managers/scheduler.py
View file @
f7102fbd
...
...
@@ -720,9 +720,11 @@ class Scheduler:
# Mixed-style chunked prefill
if
self
.
is_mixed_chunk
and
self
.
running_batch
is
not
None
:
self
.
running_batch
.
prepare_for_decode
(
self
.
enable_overlap
)
new_batch
.
mix_with_running
(
self
.
running_batch
)
new_batch
.
decoding_reqs
=
self
.
running_batch
.
reqs
self
.
running_batch
.
filter_batch
()
if
not
self
.
running_batch
.
is_empty
():
self
.
running_batch
.
prepare_for_decode
(
self
.
enable_overlap
)
new_batch
.
mix_with_running
(
self
.
running_batch
)
new_batch
.
decoding_reqs
=
self
.
running_batch
.
reqs
self
.
running_batch
=
None
else
:
new_batch
.
decoding_reqs
=
None
...
...
python/sglang/test/test_utils.py
View file @
f7102fbd
...
...
@@ -7,6 +7,7 @@ import random
import
subprocess
import
threading
import
time
from
concurrent.futures
import
ThreadPoolExecutor
from
functools
import
partial
from
types
import
SimpleNamespace
from
typing
import
Callable
,
List
,
Optional
...
...
@@ -656,11 +657,12 @@ def read_output(output_lines):
time
.
sleep
(
0.1
)
def
run_mmlu_test
(
def
run_and_check_memory_leak
(
workload_func
,
disable_radix_cache
,
enable_mixed_chunk
=
False
,
enable_overlap
=
False
,
chunked_prefill_size
=
32
,
enable_mixed_chunk
,
enable_overlap
,
chunked_prefill_size
,
):
other_args
=
[
"--chunked-prefill-size"
,
str
(
chunked_prefill_size
)]
if
disable_radix_cache
:
...
...
@@ -690,21 +692,8 @@ def run_mmlu_test(
t
=
threading
.
Thread
(
target
=
read_output
,
args
=
(
output_lines
,))
t
.
start
()
# Run the eval
args
=
SimpleNamespace
(
base_url
=
base_url
,
model
=
model
,
eval_name
=
"mmlu"
,
num_examples
=
128
,
num_threads
=
128
,
)
try
:
metrics
=
run_eval
(
args
)
print
(
f
"
{
metrics
=
}
"
)
assert
metrics
[
"score"
]
>=
0.65
finally
:
pass
# Run the workload
workload_func
(
base_url
,
model
)
# Clean up everything
kill_child_process
(
process
.
pid
,
include_self
=
True
)
...
...
@@ -727,4 +716,63 @@ def run_mmlu_test(
has_leak
=
True
assert
has_new_server
# assert not has_leak
assert
not
has_leak
def
run_mmlu_test
(
disable_radix_cache
=
False
,
enable_mixed_chunk
=
False
,
enable_overlap
=
False
,
chunked_prefill_size
=
32
,
):
def
workload_func
(
base_url
,
model
):
# Run the eval
args
=
SimpleNamespace
(
base_url
=
base_url
,
model
=
model
,
eval_name
=
"mmlu"
,
num_examples
=
128
,
num_threads
=
128
,
)
try
:
metrics
=
run_eval
(
args
)
print
(
f
"
{
metrics
=
}
"
)
assert
metrics
[
"score"
]
>=
0.65
finally
:
pass
run_and_check_memory_leak
(
workload_func
,
disable_radix_cache
,
enable_mixed_chunk
,
enable_overlap
,
chunked_prefill_size
)
def
run_mulit_request_test
(
disable_radix_cache
=
False
,
enable_mixed_chunk
=
False
,
enable_overlap
=
False
,
chunked_prefill_size
=
32
,
):
def
workload_func
(
base_url
,
model
):
def
run_one
(
_
):
prompt
=
"""
System: You are a helpful assistant.
User: What is the capital of France?
Assistant: The capital of France is
"""
response
=
requests
.
post
(
f
"
{
base_url
}
/generate"
,
json
=
{
"text"
:
prompt
,
"sampling_params"
:
{
"temperature"
:
0
,
"max_new_tokens"
:
8
,
},
},
)
ret
=
response
.
json
()
with
ThreadPoolExecutor
(
2
)
as
executor
:
list
(
executor
.
map
(
run_one
,
list
(
range
(
4
))))
run_and_check_memory_leak
(
workload_func
,
disable_radix_cache
,
enable_mixed_chunk
,
enable_overlap
,
chunked_prefill_size
)
test/srt/test_chunked_prefill.py
View file @
f7102fbd
...
...
@@ -8,6 +8,7 @@ from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST
,
run_bench_serving
,
run_mmlu_test
,
run_mulit_request_test
,
)
...
...
@@ -39,6 +40,12 @@ class TestChunkedPrefill(unittest.TestCase):
assert
res
[
"completed"
]
==
10
def
test_mixed_chunked_prefill_multi_requests
(
self
):
run_mulit_request_test
(
enable_mixed_chunk
=
True
,
chunked_prefill_size
=
2048
,
)
if
__name__
==
"__main__"
:
unittest
.
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment