Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cf2f084d
Unverified
Commit
cf2f084d
authored
Mar 22, 2024
by
Thomas Parnell
Committed by
GitHub
Mar 22, 2024
Browse files
Dynamic scheduler delay to improve ITL performance (#3279)
Co-authored-by:
Jan van Lunteren
<
jvl@zurich.ibm.com
>
parent
f721096d
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
72 additions
and
2 deletions
+72
-2
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+34
-0
vllm/config.py
vllm/config.py
+4
-0
vllm/core/scheduler.py
vllm/core/scheduler.py
+25
-1
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+9
-1
No files found.
tests/core/test_scheduler.py
View file @
cf2f084d
from
typing
import
List
import
pytest
# noqa
import
time
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.core.scheduler
import
Scheduler
...
...
@@ -168,3 +169,36 @@ def test_scheduler_max_seqs():
# and one is prompting.
_
,
out
=
scheduler
.
schedule
()
assert
set
(
out
.
scheduled_seq_groups
)
==
set
([
all_seq_groups
[
1
]])
def
test_scheduler_delay_factor
():
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
16
,
delay_factor
=
0.5
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
# schedule first prompt
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
seq_group_meta
,
out
=
scheduler
.
schedule
()
assert
out
.
prompt_run
assert
seq_group_meta
[
0
].
request_id
==
'0'
# wait for a second before scheduling next prompt
time
.
sleep
(
1
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
# second prompt should *not* be scheduled
seq_group_meta
,
out
=
scheduler
.
schedule
()
assert
not
out
.
prompt_run
assert
seq_group_meta
[
0
].
request_id
==
'0'
# wait for more than 0.5 second and try again
time
.
sleep
(
0.6
)
seq_group_meta
,
out
=
scheduler
.
schedule
()
assert
out
.
prompt_run
assert
seq_group_meta
[
0
].
request_id
==
'1'
vllm/config.py
View file @
cf2f084d
...
...
@@ -517,6 +517,8 @@ class SchedulerConfig:
iteration.
max_model_len: Maximum length of a sequence (including prompt
and generated text).
delay_factor: Apply a delay (of delay factor multiplied by previous
prompt latency) before scheduling next prompt.
"""
def
__init__
(
...
...
@@ -524,6 +526,7 @@ class SchedulerConfig:
max_num_batched_tokens
:
Optional
[
int
],
max_num_seqs
:
int
,
max_model_len
:
int
,
delay_factor
:
float
=
0.0
,
)
->
None
:
if
max_num_batched_tokens
is
not
None
:
self
.
max_num_batched_tokens
=
max_num_batched_tokens
...
...
@@ -533,6 +536,7 @@ class SchedulerConfig:
self
.
max_num_batched_tokens
=
max
(
max_model_len
,
2048
)
self
.
max_num_seqs
=
max_num_seqs
self
.
max_model_len
=
max_model_len
self
.
delay_factor
=
delay_factor
self
.
_verify_args
()
def
_verify_args
(
self
)
->
None
:
...
...
vllm/core/scheduler.py
View file @
cf2f084d
...
...
@@ -103,6 +103,13 @@ class Scheduler:
# Sequence groups in the SWAPPED state.
self
.
swapped
:
Deque
[
SequenceGroup
]
=
deque
()
# Time at previous scheduling step
self
.
prev_time
=
0.0
# Did we schedule a prompt at previous step?
self
.
prev_prompt
=
False
# Latency of the last prompt step
self
.
last_prompt_latency
=
0.0
@
property
def
lora_enabled
(
self
)
->
bool
:
return
bool
(
self
.
lora_config
)
...
...
@@ -179,7 +186,7 @@ class Scheduler:
# are added to the back.
leftover_waiting_sequences
=
deque
()
num_batched_tokens
=
0
while
self
.
waiting
:
while
self
.
_passed_delay
(
now
)
and
self
.
waiting
:
seq_group
=
self
.
waiting
[
0
]
waiting_seqs
=
seq_group
.
get_seqs
(
status
=
SequenceStatus
.
WAITING
)
...
...
@@ -246,6 +253,7 @@ class Scheduler:
self
.
waiting
.
extendleft
(
leftover_waiting_sequences
)
if
scheduled
or
ignored_seq_groups
:
self
.
prev_prompt
=
True
scheduler_outputs
=
SchedulerOutputs
(
scheduled_seq_groups
=
scheduled
,
prompt_run
=
True
,
...
...
@@ -491,3 +499,19 @@ class Scheduler:
def
mark_blocks_as_computed
(
self
,
seq_group
:
SequenceGroup
):
self
.
block_manager
.
mark_blocks_as_computed
(
seq_group
)
def
_passed_delay
(
self
,
now
:
float
)
->
bool
:
if
self
.
prev_prompt
:
self
.
last_prompt_latency
=
now
-
self
.
prev_time
self
.
prev_time
,
self
.
prev_prompt
=
now
,
False
# Delay scheduling prompts to let waiting queue fill up
if
self
.
scheduler_config
.
delay_factor
>
0
and
self
.
waiting
:
earliest_arrival_time
=
min
(
[
e
.
metrics
.
arrival_time
for
e
in
self
.
waiting
])
passed_delay
=
(
(
now
-
earliest_arrival_time
)
>
(
self
.
scheduler_config
.
delay_factor
*
self
.
last_prompt_latency
)
or
not
self
.
running
)
else
:
passed_delay
=
True
return
passed_delay
vllm/engine/arg_utils.py
View file @
cf2f084d
...
...
@@ -51,6 +51,7 @@ class EngineArgs:
max_cpu_loras
:
Optional
[
int
]
=
None
device
:
str
=
'auto'
ray_workers_use_nsight
:
bool
=
False
scheduler_delay_factor
:
float
=
0.0
def
__post_init__
(
self
):
if
self
.
tokenizer
is
None
:
...
...
@@ -305,6 +306,12 @@ class EngineArgs:
default
=
EngineArgs
.
device
,
choices
=
[
"auto"
,
"cuda"
,
"neuron"
],
help
=
'Device type for vLLM execution.'
)
parser
.
add_argument
(
'--scheduler-delay-factor'
,
type
=
float
,
default
=
EngineArgs
.
scheduler_delay_factor
,
help
=
'Apply a delay (of delay factor multiplied by previous'
'prompt latency) before scheduling next prompt.'
)
return
parser
@
classmethod
...
...
@@ -342,7 +349,8 @@ class EngineArgs:
),
self
.
ray_workers_use_nsight
)
scheduler_config
=
SchedulerConfig
(
self
.
max_num_batched_tokens
,
self
.
max_num_seqs
,
model_config
.
max_model_len
)
model_config
.
max_model_len
,
self
.
scheduler_delay_factor
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
self
.
max_lora_rank
,
max_loras
=
self
.
max_loras
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment