Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
050f285f
Unverified
Commit
050f285f
authored
Apr 23, 2024
by
SangBin Cho
Committed by
GitHub
Apr 23, 2024
Browse files
[Core] Scheduling optimization 2 (#4280)
parent
8f2ea22b
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
15 additions
and
3 deletions
+15
-3
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+2
-1
vllm/core/scheduler.py
vllm/core/scheduler.py
+8
-2
vllm/sequence.py
vllm/sequence.py
+5
-0
No files found.
tests/core/test_scheduler.py
View file @
050f285f
...
@@ -563,7 +563,8 @@ def test_decode_schedule_preempted():
...
@@ -563,7 +563,8 @@ def test_decode_schedule_preempted():
assert
len
(
output
.
preempted
)
==
2
assert
len
(
output
.
preempted
)
==
2
# Verify budgets are updated.
# Verify budgets are updated.
assert
budget
.
num_batched_tokens
==
1
assert
budget
.
num_batched_tokens
==
1
assert
budget
.
num_curr_seqs
==
1
# NOTE: When enable_chunking is False, the num_seqs budget is not updated.
# assert budget.num_curr_seqs == 1
# Both should be preempted, not swapped.
# Both should be preempted, not swapped.
assert
output
.
blocks_to_swap_out
==
{}
assert
output
.
blocks_to_swap_out
==
{}
# Nothing is copied.
# Nothing is copied.
...
...
vllm/core/scheduler.py
View file @
050f285f
...
@@ -395,12 +395,12 @@ class Scheduler:
...
@@ -395,12 +395,12 @@ class Scheduler:
# We can have up to 1 running prefill at any given time in running
# We can have up to 1 running prefill at any given time in running
# queue, which means we can guarantee chunk size is at least 1.
# queue, which means we can guarantee chunk size is at least 1.
assert
num_running_tokens
!=
0
assert
num_running_tokens
!=
0
num_running_seqs
=
seq_group
.
get_max_num_running_seqs
()
running_queue
.
popleft
()
running_queue
.
popleft
()
while
not
self
.
_can_append_slots
(
seq_group
):
while
not
self
.
_can_append_slots
(
seq_group
):
budget
.
subtract_num_batched_tokens
(
seq_group
.
request_id
,
budget
.
subtract_num_batched_tokens
(
seq_group
.
request_id
,
num_running_tokens
)
num_running_tokens
)
num_running_seqs
=
seq_group
.
get_max_num_running_seqs
()
budget
.
subtract_num_seqs
(
seq_group
.
request_id
,
budget
.
subtract_num_seqs
(
seq_group
.
request_id
,
num_running_seqs
)
num_running_seqs
)
if
curr_loras
is
not
None
and
seq_group
.
lora_int_id
>
0
:
if
curr_loras
is
not
None
and
seq_group
.
lora_int_id
>
0
:
...
@@ -439,6 +439,12 @@ class Scheduler:
...
@@ -439,6 +439,12 @@ class Scheduler:
token_chunk_size
=
1
))
token_chunk_size
=
1
))
budget
.
add_num_batched_tokens
(
seq_group
.
request_id
,
budget
.
add_num_batched_tokens
(
seq_group
.
request_id
,
num_running_tokens
)
num_running_tokens
)
# OPTIMIZATION: Note that get_max_num_running_seqs is
# expensive. For the default scheduling case where
# enable_chunking is False, num_seqs is updated before running
# this method, so we don't have to update it again here.
if
enable_chunking
:
num_running_seqs
=
seq_group
.
get_max_num_running_seqs
()
budget
.
add_num_seqs
(
seq_group
.
request_id
,
num_running_seqs
)
budget
.
add_num_seqs
(
seq_group
.
request_id
,
num_running_seqs
)
if
curr_loras
is
not
None
and
seq_group
.
lora_int_id
>
0
:
if
curr_loras
is
not
None
and
seq_group
.
lora_int_id
>
0
:
curr_loras
.
add
(
seq_group
.
lora_int_id
)
curr_loras
.
add
(
seq_group
.
lora_int_id
)
...
...
vllm/sequence.py
View file @
050f285f
...
@@ -508,6 +508,11 @@ class SequenceGroup:
...
@@ -508,6 +508,11 @@ class SequenceGroup:
return
num_uncomputed_tokens
return
num_uncomputed_tokens
def
num_seqs
(
self
,
status
:
Optional
[
SequenceStatus
]
=
None
)
->
int
:
def
num_seqs
(
self
,
status
:
Optional
[
SequenceStatus
]
=
None
)
->
int
:
# Optimization. We don't need to call get_seqs if we don't need to
# filter by states.
if
status
is
None
:
return
len
(
self
.
seqs_dict
)
return
len
(
self
.
get_seqs
(
status
))
return
len
(
self
.
get_seqs
(
status
))
def
num_unfinished_seqs
(
self
)
->
int
:
def
num_unfinished_seqs
(
self
)
->
int
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment