Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
88577ac9
Unverified
Commit
88577ac9
authored
Sep 23, 2024
by
sroy745
Committed by
GitHub
Sep 24, 2024
Browse files
Fix tests in test_scheduler.py that fail with BlockManager V2 (#8728)
parent
530821d0
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
260 additions
and
89 deletions
+260
-89
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+260
-89
No files found.
tests/core/test_scheduler.py
View file @
88577ac9
...
@@ -3,7 +3,8 @@ from collections import deque
...
@@ -3,7 +3,8 @@ from collections import deque
from
typing
import
List
,
Set
,
Tuple
from
typing
import
List
,
Set
,
Tuple
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
import
pytest
# noqa
import
pytest
from
torch
import
Use
# noqa
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
from
vllm.core.interfaces
import
AllocStatus
from
vllm.core.interfaces
import
AllocStatus
...
@@ -16,9 +17,11 @@ from .utils import (append_new_token, append_new_token_seq_group,
...
@@ -16,9 +17,11 @@ from .utils import (append_new_token, append_new_token_seq_group,
schedule_and_update_computed_tokens
)
schedule_and_update_computed_tokens
)
def
test_scheduler_add_seq_group
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_add_seq_group
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
)
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
cache_dtype
=
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
cache_dtype
=
"auto"
)
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
...
@@ -27,14 +30,18 @@ def test_scheduler_add_seq_group():
...
@@ -27,14 +30,18 @@ def test_scheduler_add_seq_group():
# Add seq group to scheduler.
# Add seq group to scheduler.
num_seq_group
=
4
num_seq_group
=
4
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
i
+
1
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
i
+
1
def
test_scheduler_abort_seq_group
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_abort_seq_group
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
)
scheduler_config
=
SchedulerConfig
(
100
,
64
,
1
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_cpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
cache_config
.
num_gpu_blocks
=
4
...
@@ -54,11 +61,16 @@ def test_scheduler_abort_seq_group():
...
@@ -54,11 +61,16 @@ def test_scheduler_abort_seq_group():
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
0
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
0
def
test_scheduler_schedule_simple
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_schedule_simple
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
num_seq_group
=
4
num_seq_group
=
4
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
num_seq_group
,
max_model_len
)
scheduler_config
=
SchedulerConfig
(
64
,
num_seq_group
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -67,7 +79,9 @@ def test_scheduler_schedule_simple():
...
@@ -67,7 +79,9 @@ def test_scheduler_schedule_simple():
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
running
.
append
(
seq_group
)
running
.
append
(
seq_group
)
...
@@ -91,20 +105,24 @@ def test_scheduler_schedule_simple():
...
@@ -91,20 +105,24 @@ def test_scheduler_schedule_simple():
append_new_token
(
out
,
1
)
append_new_token
(
out
,
1
)
def
test_scheduler_prefill_prioritized
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_prefill_prioritized
(
use_v2_block_manager
:
bool
):
"""Verify running batched tokens are not applied to prefill requests."""
"""Verify running batched tokens are not applied to prefill requests."""
block_size
=
4
block_size
=
4
max_model_len
=
30
max_model_len
=
30
max_batched_num_tokens
=
30
max_batched_num_tokens
=
30
scheduler_config
=
SchedulerConfig
(
max_batched_num_tokens
,
2
,
scheduler_config
=
SchedulerConfig
(
max_model_len
)
max_batched_num_tokens
,
2
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
2
cache_config
.
num_cpu_blocks
=
16
cache_config
.
num_gpu_blocks
=
2
cache_config
.
num_gpu_blocks
=
16
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
# Add seq groups to scheduler.
# Add seq groups to scheduler.
_
,
seq_group_a
=
create_dummy_prompt
(
"1"
,
1
)
_
,
seq_group_a
=
create_dummy_prompt
(
"1"
,
1
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group_a
)
scheduler
.
add_seq_group
(
seq_group_a
)
# Schedule seq groups prompts.
# Schedule seq groups prompts.
...
@@ -112,7 +130,7 @@ def test_scheduler_prefill_prioritized():
...
@@ -112,7 +130,7 @@ def test_scheduler_prefill_prioritized():
assert
get_sequence_groups
(
out
)
==
[
seq_group_a
]
assert
get_sequence_groups
(
out
)
==
[
seq_group_a
]
# Add a new prefill request B.
# Add a new prefill request B.
_
,
seq_group_b
=
create_dummy_prompt
(
"2"
,
30
)
_
,
seq_group_b
=
create_dummy_prompt
(
"2"
,
30
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group_b
)
scheduler
.
add_seq_group
(
seq_group_b
)
# Verify prefill requests are prioritized. Since max_batched_num_tokens
# Verify prefill requests are prioritized. Since max_batched_num_tokens
...
@@ -121,18 +139,24 @@ def test_scheduler_prefill_prioritized():
...
@@ -121,18 +139,24 @@ def test_scheduler_prefill_prioritized():
assert
get_sequence_groups
(
out
)
==
[
seq_group_b
]
assert
get_sequence_groups
(
out
)
==
[
seq_group_b
]
def
test_scheduler_schedule_preempt_abort
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_schedule_preempt_abort
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
2
,
max_model_len
)
scheduler_config
=
SchedulerConfig
(
64
,
2
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
2
cache_config
.
num_cpu_blocks
=
2
cache_config
.
num_gpu_blocks
=
2
cache_config
.
num_gpu_blocks
=
2
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
None
)
# Add seq groups to scheduler.
# Add seq groups to scheduler.
seq_a
,
seq_group_a
=
create_dummy_prompt
(
"1"
,
block_size
)
seq_a
,
seq_group_a
=
create_dummy_prompt
(
"1"
,
seq_b
,
seq_group_b
=
create_dummy_prompt
(
"2"
,
block_size
)
block_size
,
block_size
=
block_size
)
seq_b
,
seq_group_b
=
create_dummy_prompt
(
"2"
,
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group_a
)
scheduler
.
add_seq_group
(
seq_group_a
)
scheduler
.
add_seq_group
(
seq_group_b
)
scheduler
.
add_seq_group
(
seq_group_b
)
...
@@ -170,12 +194,17 @@ def test_scheduler_schedule_preempt_abort():
...
@@ -170,12 +194,17 @@ def test_scheduler_schedule_preempt_abort():
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
1
assert
scheduler
.
get_num_unfinished_seq_groups
()
==
1
def
test_scheduler_max_seqs
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_max_seqs
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
num_seq_group
=
4
num_seq_group
=
4
max_seq_group
=
2
max_seq_group
=
2
max_model_len
=
16
max_model_len
=
16
scheduler_config
=
SchedulerConfig
(
64
,
max_seq_group
,
max_model_len
)
scheduler_config
=
SchedulerConfig
(
64
,
max_seq_group
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -184,7 +213,9 @@ def test_scheduler_max_seqs():
...
@@ -184,7 +213,9 @@ def test_scheduler_max_seqs():
all_seq_groups
:
List
[
SequenceGroup
]
=
[]
all_seq_groups
:
List
[
SequenceGroup
]
=
[]
# Add seq groups to scheduler.
# Add seq groups to scheduler.
for
i
in
range
(
num_seq_group
):
for
i
in
range
(
num_seq_group
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
block_size
,
block_size
=
block_size
)
all_seq_groups
.
append
(
seq_group
)
all_seq_groups
.
append
(
seq_group
)
# Append 1 seq group
# Append 1 seq group
...
@@ -211,9 +242,15 @@ def test_scheduler_max_seqs():
...
@@ -211,9 +242,15 @@ def test_scheduler_max_seqs():
assert
set
(
get_sequence_groups
(
out
))
==
set
([
all_seq_groups
[
1
]])
assert
set
(
get_sequence_groups
(
out
))
==
set
([
all_seq_groups
[
1
]])
def
test_scheduler_delay_factor
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_scheduler_delay_factor
(
use_v2_block_manager
:
bool
):
block_size
=
4
block_size
=
4
scheduler_config
=
SchedulerConfig
(
100
,
64
,
16
,
delay_factor
=
0.5
)
scheduler_config
=
SchedulerConfig
(
100
,
64
,
16
,
delay_factor
=
0.5
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
8
...
@@ -221,7 +258,8 @@ def test_scheduler_delay_factor():
...
@@ -221,7 +258,8 @@ def test_scheduler_delay_factor():
# schedule first prompt
# schedule first prompt
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"0"
,
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
block_size
)
prompt_length
=
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
out
.
num_prefill_groups
>
0
assert
out
.
num_prefill_groups
>
0
...
@@ -231,7 +269,8 @@ def test_scheduler_delay_factor():
...
@@ -231,7 +269,8 @@ def test_scheduler_delay_factor():
# wait for a second before scheduling next prompt
# wait for a second before scheduling next prompt
time
.
sleep
(
1
)
time
.
sleep
(
1
)
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"1"
,
seq_group_meta
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
block_size
)
prompt_length
=
block_size
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
# second prompt should *not* be scheduled
# second prompt should *not* be scheduled
...
@@ -248,11 +287,20 @@ def test_scheduler_delay_factor():
...
@@ -248,11 +287,20 @@ def test_scheduler_delay_factor():
append_new_token
(
out
,
1
)
append_new_token
(
out
,
1
)
def
test_swapped_out_prioritized
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
(
max_num_seqs
=
6
)
def
test_swapped_out_prioritized
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
max_num_seqs
=
6
,
block_size
=
block_size
,
use_v2_block_manager
=
use_v2_block_manager
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
# best_of=2 * 3 == 6 sequences.
# best_of=2 * 3 == 6 sequences.
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
# prefill scheduled now.
# prefill scheduled now.
...
@@ -276,7 +324,10 @@ def test_swapped_out_prioritized():
...
@@ -276,7 +324,10 @@ def test_swapped_out_prioritized():
append_new_token
(
out
,
1
)
append_new_token
(
out
,
1
)
# Add 1 more task. Swap should be prioritized over prefill.
# Add 1 more task. Swap should be prioritized over prefill.
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
seq_group_meta
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
append_new_token
(
out
,
1
)
append_new_token
(
out
,
1
)
...
@@ -287,17 +338,26 @@ def test_swapped_out_prioritized():
...
@@ -287,17 +338,26 @@ def test_swapped_out_prioritized():
assert
out
.
blocks_to_swap_out
==
[]
assert
out
.
blocks_to_swap_out
==
[]
def
initialize_scheduler
(
*
,
def
initialize_scheduler
(
*
,
max_num_seqs
=
1000
,
max_num_seqs
=
1000
,
max_token_budget
=
1000
,
max_token_budget
=
1000
,
max_model_len
=
1000
,
max_model_len
=
1000
,
lora_config
=
None
):
lora_config
=
None
,
block_size
=
4
use_v2_block_manager
=
False
,
scheduler_config
=
SchedulerConfig
(
max_token_budget
,
max_num_seqs
,
block_size
=
4
,
max_model_len
)
num_cpu_blocks
=
8
,
num_gpu_blocks
=
8
,
):
block_size
=
block_size
scheduler_config
=
SchedulerConfig
(
max_token_budget
,
max_num_seqs
,
max_model_len
,
use_v2_block_manager
=
use_v2_block_manager
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
=
CacheConfig
(
block_size
,
1.0
,
1
,
"auto"
)
cache_config
.
num_cpu_blocks
=
8
cache_config
.
num_cpu_blocks
=
num_cpu_blocks
cache_config
.
num_gpu_blocks
=
8
cache_config
.
num_gpu_blocks
=
num_gpu_blocks
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
lora_config
)
scheduler
=
Scheduler
(
scheduler_config
,
cache_config
,
lora_config
)
return
scheduler
return
scheduler
...
@@ -319,12 +379,18 @@ def add_token_budget(budget: SchedulingBudget,
...
@@ -319,12 +379,18 @@ def add_token_budget(budget: SchedulingBudget,
budget
.
add_num_seqs
(
mock_seq_group
.
request_id
,
num_curr_seqs
)
budget
.
add_num_seqs
(
mock_seq_group
.
request_id
,
num_curr_seqs
)
def
test_prefill_schedule_max_prompt_len
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_max_prompt_len
(
use_v2_block_manager
:
bool
):
"""
"""
Test prompt longer than max_prompt_len is aborted.
Test prompt longer than max_prompt_len is aborted.
"""
"""
scheduler
=
initialize_scheduler
(
max_model_len
=
30
)
block_size
=
4
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
60
)
scheduler
=
initialize_scheduler
(
max_model_len
=
30
,
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
)
_
,
seq_group
=
create_dummy_prompt
(
"0"
,
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
budget
=
create_token_budget
()
budget
=
create_token_budget
()
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
...
@@ -336,14 +402,21 @@ def test_prefill_schedule_max_prompt_len():
...
@@ -336,14 +402,21 @@ def test_prefill_schedule_max_prompt_len():
assert
len
(
remaining_waiting
)
==
0
assert
len
(
remaining_waiting
)
==
0
def
test_prefill_schedule_token_budget
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_token_budget
(
use_v2_block_manager
:
bool
):
"""
"""
Test token budget respected.
Test token budget respected.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
token_budget
=
0
)
budget
=
create_token_budget
(
token_budget
=
0
)
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
# 0 token budget == nothing is scheduled.
# 0 token budget == nothing is scheduled.
...
@@ -366,10 +439,15 @@ def test_prefill_schedule_token_budget():
...
@@ -366,10 +439,15 @@ def test_prefill_schedule_token_budget():
assert
len
(
remaining_waiting
)
==
1
assert
len
(
remaining_waiting
)
==
1
# Test when current_batched_tokens respected.
# Test when current_batched_tokens respected.
scheduler
=
initialize_scheduler
()
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
16
,
num_gpu_blocks
=
16
)
budget
=
create_token_budget
(
token_budget
=
60
)
budget
=
create_token_budget
(
token_budget
=
60
)
add_token_budget
(
budget
,
30
,
0
)
add_token_budget
(
budget
,
30
,
0
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
# Cannot schedule a prompt that doesn't fit the budget.
# Cannot schedule a prompt that doesn't fit the budget.
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
...
@@ -389,14 +467,21 @@ def test_prefill_schedule_token_budget():
...
@@ -389,14 +467,21 @@ def test_prefill_schedule_token_budget():
assert
len
(
remaining_waiting
)
==
0
assert
len
(
remaining_waiting
)
==
0
def
test_prefill_schedule_max_seqs
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_max_seqs
(
use_v2_block_manager
:
bool
):
"""
"""
Test max seq respected.
Test max seq respected.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
max_num_seqs
=
2
)
budget
=
create_token_budget
(
max_num_seqs
=
2
)
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
remaining_waiting
=
scheduler
.
waiting
remaining_waiting
=
scheduler
.
waiting
...
@@ -410,7 +495,9 @@ def test_prefill_schedule_max_seqs():
...
@@ -410,7 +495,9 @@ def test_prefill_schedule_max_seqs():
scheduler
.
waiting
=
deque
()
scheduler
.
waiting
=
deque
()
budget
=
create_token_budget
(
max_num_seqs
=
2
)
budget
=
create_token_budget
(
max_num_seqs
=
2
)
add_token_budget
(
budget
,
0
,
2
)
add_token_budget
(
budget
,
0
,
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
None
)
remaining_waiting
=
scheduler
.
waiting
remaining_waiting
=
scheduler
.
waiting
...
@@ -421,17 +508,24 @@ def test_prefill_schedule_max_seqs():
...
@@ -421,17 +508,24 @@ def test_prefill_schedule_max_seqs():
assert
len
(
remaining_waiting
)
==
1
assert
len
(
remaining_waiting
)
==
1
def
test_prefill_schedule_max_lora
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_max_lora
(
use_v2_block_manager
:
bool
):
"""
"""
Test max lora is respected and prioritized.
Test max lora is respected and prioritized.
"""
"""
block_size
=
4
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
budget
=
create_token_budget
(
token_budget
=
120
)
budget
=
create_token_budget
(
token_budget
=
120
)
curr_loras
:
Set
[
int
]
=
set
()
curr_loras
:
Set
[
int
]
=
set
()
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
prompt_length
=
60
,
block_size
=
block_size
,
lora_request
=
LoRARequest
(
lora_request
=
LoRARequest
(
lora_name
=
str
(
i
),
lora_name
=
str
(
i
),
lora_int_id
=
i
+
1
,
lora_int_id
=
i
+
1
,
...
@@ -443,7 +537,9 @@ def test_prefill_schedule_max_lora():
...
@@ -443,7 +537,9 @@ def test_prefill_schedule_max_lora():
# If a request is not scheduled because it hits max lora, it is
# If a request is not scheduled because it hits max lora, it is
# prioritized. Verify that.
# prioritized. Verify that.
for
i
in
range
(
2
,
4
):
for
i
in
range
(
2
,
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
# Schedule 2 requests (0 and 2)
# Schedule 2 requests (0 and 2)
output
=
scheduler
.
_schedule_prefills
(
budget
,
curr_loras
)
output
=
scheduler
.
_schedule_prefills
(
budget
,
curr_loras
)
...
@@ -467,14 +563,21 @@ def test_prefill_schedule_max_lora():
...
@@ -467,14 +563,21 @@ def test_prefill_schedule_max_lora():
assert
budget
.
num_batched_tokens
==
60
assert
budget
.
num_batched_tokens
==
60
def
test_prefill_schedule_no_block_manager_capacity
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_prefill_schedule_no_block_manager_capacity
(
use_v2_block_manager
):
"""
"""
Test sequence cannot be scheduled due to block manager has no capacity.
Test sequence cannot be scheduled due to block manager has no capacity.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_gpu_blocks
=
128
,
num_cpu_blocks
=
128
)
budget
=
create_token_budget
()
budget
=
create_token_budget
()
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
block_manager
.
can_allocate
=
MagicMock
()
scheduler
.
block_manager
.
can_allocate
=
MagicMock
()
scheduler
.
block_manager
.
can_allocate
.
return_value
=
AllocStatus
.
LATER
scheduler
.
block_manager
.
can_allocate
.
return_value
=
AllocStatus
.
LATER
...
@@ -489,7 +592,9 @@ def test_prefill_schedule_no_block_manager_capacity():
...
@@ -489,7 +592,9 @@ def test_prefill_schedule_no_block_manager_capacity():
scheduler
=
initialize_scheduler
()
scheduler
=
initialize_scheduler
()
budget
=
create_token_budget
()
budget
=
create_token_budget
()
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
add_seq_group
(
seq_group
)
scheduler
.
block_manager
.
can_allocate
=
MagicMock
()
scheduler
.
block_manager
.
can_allocate
=
MagicMock
()
scheduler
.
block_manager
.
can_allocate
.
return_value
=
AllocStatus
.
NEVER
scheduler
.
block_manager
.
can_allocate
.
return_value
=
AllocStatus
.
NEVER
...
@@ -502,14 +607,21 @@ def test_prefill_schedule_no_block_manager_capacity():
...
@@ -502,14 +607,21 @@ def test_prefill_schedule_no_block_manager_capacity():
assert
len
(
remaining_waiting
)
==
0
assert
len
(
remaining_waiting
)
==
0
def
test_decode_schedule_preempted
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_decode_schedule_preempted
(
use_v2_block_manager
:
bool
):
"""
"""
Test decodes cannot be scheduled and preempted.
Test decodes cannot be scheduled and preempted.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
curr_loras
=
None
curr_loras
=
None
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_add_seq_group_to_running
(
seq_group
)
scheduler
.
_add_seq_group_to_running
(
seq_group
)
...
@@ -541,15 +653,23 @@ def test_decode_schedule_preempted():
...
@@ -541,15 +653,23 @@ def test_decode_schedule_preempted():
assert
output
.
blocks_to_copy
==
[]
assert
output
.
blocks_to_copy
==
[]
def
test_decode_swap_beam_search
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_decode_swap_beam_search
(
use_v2_block_manager
:
bool
):
"""
"""
Test best_of > 1 swap out blocks
Test best_of > 1 swap out blocks
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_gpu_blocks
=
64
,
num_cpu_blocks
=
64
)
curr_loras
=
None
curr_loras
=
None
budget
=
create_token_budget
()
budget
=
create_token_budget
()
for
i
in
range
(
3
):
for
i
in
range
(
3
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_add_seq_group_to_running
(
seq_group
)
scheduler
.
_add_seq_group_to_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
...
@@ -589,12 +709,20 @@ def test_decode_swap_beam_search():
...
@@ -589,12 +709,20 @@ def test_decode_swap_beam_search():
assert
output
.
blocks_to_copy
==
[]
assert
output
.
blocks_to_copy
==
[]
def
test_schedule_decode_blocks_to_copy_update
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_schedule_decode_blocks_to_copy_update
(
use_v2_block_manager
:
bool
):
"""
"""
Verify blocks_to_copy is updated.
Verify blocks_to_copy is updated.
"""
"""
scheduler
=
initialize_scheduler
()
block_size
=
4
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
4
,
num_cpu_blocks
=
16
,
num_gpu_blocks
=
16
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
curr_loras
=
None
curr_loras
=
None
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
...
@@ -644,12 +772,17 @@ def test_schedule_swapped_simple():
...
@@ -644,12 +772,17 @@ def test_schedule_swapped_simple():
assert
blocks_to_swap_out
==
blocks_to_swap_in_reverse
assert
blocks_to_swap_out
==
blocks_to_swap_in_reverse
def
test_schedule_swapped_max_token_budget
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
()
def
test_schedule_swapped_max_token_budget
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
_
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
)
,
prompt_length
=
60
,
best_of
=
2
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
@@ -676,12 +809,19 @@ def test_schedule_swapped_max_token_budget():
...
@@ -676,12 +809,19 @@ def test_schedule_swapped_max_token_budget():
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
def
test_schedule_swapped_max_seqs
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
()
def
test_schedule_swapped_max_seqs
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
64
,
num_gpu_blocks
=
64
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
i
in
range
(
4
):
for
i
in
range
(
4
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
block_size
=
4
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
@@ -706,14 +846,21 @@ def test_schedule_swapped_max_seqs():
...
@@ -706,14 +846,21 @@ def test_schedule_swapped_max_seqs():
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
def
test_schedule_swapped_max_loras
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
def
test_schedule_swapped_max_loras
(
use_v2_block_manager
:
bool
):
block_size
=
4
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_loras
=
1
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
)
scheduler
=
initialize_scheduler
(
lora_config
=
lora_config
,
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
:
Set
[
int
]
=
set
()
curr_loras
:
Set
[
int
]
=
set
()
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
i
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
prompt_length
=
60
,
block_size
=
block_size
,
lora_request
=
LoRARequest
(
lora_request
=
LoRARequest
(
lora_name
=
str
(
i
),
lora_name
=
str
(
i
),
lora_int_id
=
i
+
1
,
lora_int_id
=
i
+
1
,
...
@@ -734,12 +881,20 @@ def test_schedule_swapped_max_loras():
...
@@ -734,12 +881,20 @@ def test_schedule_swapped_max_loras():
assert
len
(
curr_loras
)
==
1
assert
len
(
curr_loras
)
==
1
def
test_schedule_swapped_cannot_swap_in
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
()
def
test_schedule_swapped_cannot_swap_in
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
_
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
@@ -759,12 +914,20 @@ def test_schedule_swapped_cannot_swap_in():
...
@@ -759,12 +914,20 @@ def test_schedule_swapped_cannot_swap_in():
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
def
test_infeasible_swap
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
()
def
test_infeasible_swap
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
for
_
in
range
(
2
):
for
i
in
range
(
2
):
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
str
(
i
),
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
scheduler
.
_swap_out
(
seq_group
,
blocks_to_swap_out
)
...
@@ -785,10 +948,18 @@ def test_infeasible_swap():
...
@@ -785,10 +948,18 @@ def test_infeasible_swap():
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
def
test_schedule_swapped_blocks_to_copy
():
@
pytest
.
mark
.
parametrize
(
'use_v2_block_manager'
,
[
True
,
False
])
scheduler
=
initialize_scheduler
()
def
test_schedule_swapped_blocks_to_copy
(
use_v2_block_manager
:
bool
):
block_size
=
4
scheduler
=
initialize_scheduler
(
use_v2_block_manager
=
use_v2_block_manager
,
block_size
=
block_size
,
num_cpu_blocks
=
32
,
num_gpu_blocks
=
32
)
curr_loras
=
None
curr_loras
=
None
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
)
_
,
seq_group
=
create_dummy_prompt
(
"1"
,
prompt_length
=
60
,
best_of
=
2
,
block_size
=
block_size
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
scheduler
.
_allocate_and_set_running
(
seq_group
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
append_new_token_seq_group
(
60
,
seq_group
,
1
)
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
blocks_to_swap_out
:
List
[
Tuple
[
int
,
int
]]
=
[]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment