Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0f8a9140
Unverified
Commit
0f8a9140
authored
May 03, 2024
by
SangBin Cho
Committed by
GitHub
May 02, 2024
Browse files
[Core] Ignore infeasible swap requests. (#4557)
parent
9b5c9f94
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
187 additions
and
42 deletions
+187
-42
tests/basic_correctness/test_preemption.py
tests/basic_correctness/test_preemption.py
+85
-0
tests/core/test_block_manager.py
tests/core/test_block_manager.py
+1
-1
tests/core/test_chunked_prefill_scheduler.py
tests/core/test_chunked_prefill_scheduler.py
+3
-2
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+29
-1
vllm/core/block/cpu_gpu_block_allocator.py
vllm/core/block/cpu_gpu_block_allocator.py
+8
-11
vllm/core/block/interfaces.py
vllm/core/block/interfaces.py
+13
-8
vllm/core/block/naive_block.py
vllm/core/block/naive_block.py
+4
-2
vllm/core/block/prefix_caching_block.py
vllm/core/block/prefix_caching_block.py
+3
-0
vllm/core/block_manager_v1.py
vllm/core/block_manager_v1.py
+17
-2
vllm/core/block_manager_v2.py
vllm/core/block_manager_v2.py
+2
-2
vllm/core/interfaces.py
vllm/core/interfaces.py
+1
-1
vllm/core/scheduler.py
vllm/core/scheduler.py
+21
-12
No files found.
tests/basic_correctness/test_preemption.py
View file @
0f8a9140
...
@@ -7,6 +7,7 @@ pytest tests/basic_correctness/test_preemption.py`.
...
@@ -7,6 +7,7 @@ pytest tests/basic_correctness/test_preemption.py`.
"""
"""
import
pytest
import
pytest
from
vllm
import
SamplingParams
from
vllm.core.scheduler
import
(
ARTIFICIAL_PREEMPTION_MAX_CNT
,
from
vllm.core.scheduler
import
(
ARTIFICIAL_PREEMPTION_MAX_CNT
,
ENABLE_ARTIFICIAL_PREEMPT
)
ENABLE_ARTIFICIAL_PREEMPT
)
...
@@ -136,3 +137,87 @@ def test_swap(
...
@@ -136,3 +137,87 @@ def test_swap(
assert
hf_output_ids
[
j
]
==
vllm_output_ids
[
j
],
(
assert
hf_output_ids
[
j
]
==
vllm_output_ids
[
j
],
(
f
"Test
{
i
}
output
{
j
}
:
\n
HF:
{
hf_output_ids
}
\n
"
f
"Test
{
i
}
output
{
j
}
:
\n
HF:
{
hf_output_ids
}
\n
"
f
"vLLM:
{
vllm_output_ids
}
"
)
f
"vLLM:
{
vllm_output_ids
}
"
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
@pytest.mark.parametrize("beam_width", [4])
def test_swap_infeasible(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    beam_width: int,
) -> None:
    """Check that a swap request that can never be satisfied is ignored.

    The engine is given only ``prefill_blocks + decode_blocks`` GPU blocks.
    Beam search runs more than one sequence, so that many blocks are not
    enough to finish the request. The test passes when generation returns
    (instead of hanging) and the first output finishes with reason
    ``"length"``.
    """
    tokens_per_block = 16
    prefill_blocks = 2
    decode_blocks = max_tokens // tokens_per_block
    # Used both as the GPU block override and, in tokens, as the model length.
    total_blocks = prefill_blocks + decode_blocks

    # Only the first prompt is exercised.
    first_prompt_only = example_prompts[:1]

    vllm_model = vllm_runner(
        model,
        dtype=dtype,
        swap_space=10,
        block_size=tokens_per_block,
        # Beam search has more than one sequence, so prefill + decode
        # blocks are not enough to finish.
        num_gpu_blocks_override=total_blocks,
        max_model_len=total_blocks * tokens_per_block,
    )

    beam_search_params = SamplingParams(
        n=beam_width,
        use_beam_search=True,
        temperature=0.0,
        max_tokens=max_tokens,
        ignore_eos=True,
    )
    req_outputs = vllm_model.model.generate(
        first_prompt_only,
        sampling_params=beam_search_params,
    )

    # NOTE(review): presumably confirms the artificial-preemption budget was
    # not exhausted; confirm against the scheduler implementation.
    preempt_cnt = (
        vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt)
    assert preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT

    del vllm_model

    # The request must be ignored rather than hang.
    first_completion = req_outputs[0].outputs[0]
    assert first_completion.finish_reason == "length"
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [96])
def test_preemption_infeasible(
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Check that a preemption request that can never be satisfied is ignored.

    The engine is given ``prefill_blocks + decode_blocks // 2`` GPU blocks,
    which is not enough to complete a single sequence. Every request must
    come back with exactly one output whose finish reason is ``"length"``
    instead of the engine hanging forever.
    """
    tokens_per_block = 16
    prefill_blocks = 2
    decode_blocks = max_tokens // tokens_per_block
    # Only half of the decode blocks are provisioned on purpose.
    total_blocks = prefill_blocks + decode_blocks // 2

    vllm_model = vllm_runner(
        model,
        dtype=dtype,
        block_size=tokens_per_block,
        # Not enough GPU blocks to complete a single sequence. Preemption
        # should happen, and the sequence should be ignored instead of
        # hanging forever.
        num_gpu_blocks_override=total_blocks,
        max_model_len=(total_blocks * tokens_per_block),
    )

    length_bound_params = SamplingParams(max_tokens=max_tokens,
                                         ignore_eos=True)
    req_outputs = vllm_model.model.generate(
        example_prompts,
        sampling_params=length_bound_params,
    )

    # NOTE(review): presumably confirms the artificial-preemption budget was
    # not exhausted; confirm against the scheduler implementation.
    preempt_cnt = (
        vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt)
    assert preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT

    del vllm_model

    # Each request must be ignored rather than hang.
    for request_output in req_outputs:
        completions = request_output.outputs
        assert len(completions) == 1
        assert completions[0].finish_reason == "length"
tests/core/test_block_manager.py
View file @
0f8a9140
...
@@ -224,7 +224,7 @@ def test_swap():
...
@@ -224,7 +224,7 @@ def test_swap():
# Swap seq group from CPU -> GPU.
# Swap seq group from CPU -> GPU.
cpu_blocks
=
block_manager
.
get_block_table
(
prompt
)
cpu_blocks
=
block_manager
.
get_block_table
(
prompt
)
assert
block_manager
.
can_swap_in
(
seq_group
)
assert
block_manager
.
can_swap_in
(
seq_group
)
==
AllocStatus
.
OK
before_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
before_cpu_blocks
=
block_manager
.
get_num_free_cpu_blocks
()
before_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
before_gpu_blocks
=
block_manager
.
get_num_free_gpu_blocks
()
mapping
=
block_manager
.
swap_in
(
seq_group
)
mapping
=
block_manager
.
swap_in
(
seq_group
)
...
...
tests/core/test_chunked_prefill_scheduler.py
View file @
0f8a9140
...
@@ -4,6 +4,7 @@ from unittest.mock import MagicMock
...
@@ -4,6 +4,7 @@ from unittest.mock import MagicMock
import
pytest
# noqa
import
pytest
# noqa
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.core.interfaces
import
AllocStatus
from
vllm.core.scheduler
import
Scheduler
from
vllm.core.scheduler
import
Scheduler
from
vllm.sequence
import
Logprob
,
SequenceGroup
from
vllm.sequence
import
Logprob
,
SequenceGroup
...
@@ -410,7 +411,7 @@ def test_running_prefill_prioritized_over_swap():
...
@@ -410,7 +411,7 @@ def test_running_prefill_prioritized_over_swap():
# Add 1 more task. Swap is not possible, so prefill is running.
# Add 1 more task. Swap is not possible, so prefill is running.
scheduler
.
block_manager
.
can_swap_in
=
MagicMock
()
scheduler
.
block_manager
.
can_swap_in
=
MagicMock
()
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
False
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
AllocStatus
.
LATER
_
,
seq_group2
=
create_dummy_prompt
(
"2"
,
prompt_length
=
60
)
_
,
seq_group2
=
create_dummy_prompt
(
"2"
,
prompt_length
=
60
)
scheduler
.
add_seq_group
(
seq_group2
)
scheduler
.
add_seq_group
(
seq_group2
)
...
@@ -423,7 +424,7 @@ def test_running_prefill_prioritized_over_swap():
...
@@ -423,7 +424,7 @@ def test_running_prefill_prioritized_over_swap():
assert
out
.
scheduled_seq_groups
[
0
].
seq_group
==
seq_group2
assert
out
.
scheduled_seq_groups
[
0
].
seq_group
==
seq_group2
# Now although swap is possible, running prefill is prioritized.
# Now although swap is possible, running prefill is prioritized.
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
True
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
AllocStatus
.
OK
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
_
,
out
=
schedule_and_update_computed_tokens
(
scheduler
)
assert
len
(
out
.
scheduled_seq_groups
)
==
1
assert
len
(
out
.
scheduled_seq_groups
)
==
1
# 3 decodes. It is swapped in.
# 3 decodes. It is swapped in.
...
...
tests/core/test_scheduler.py
View file @
0f8a9140
...
@@ -791,7 +791,7 @@ def test_schedule_swapped_cannot_swap_in():
...
@@ -791,7 +791,7 @@ def test_schedule_swapped_cannot_swap_in():
# The last request should be swapped out.
# The last request should be swapped out.
scheduler
.
block_manager
.
can_swap_in
=
MagicMock
()
scheduler
.
block_manager
.
can_swap_in
=
MagicMock
()
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
False
scheduler
.
block_manager
.
can_swap_in
.
return_value
=
AllocStatus
.
LATER
# Since we cannot swap in, none of the requests are swapped in.
# Since we cannot swap in, none of the requests are swapped in.
budget
=
create_token_budget
()
budget
=
create_token_budget
()
remaining_swapped
,
output
=
scheduler
.
_schedule_swapped
(
remaining_swapped
,
output
=
scheduler
.
_schedule_swapped
(
...
@@ -803,6 +803,34 @@ def test_schedule_swapped_cannot_swap_in():
...
@@ -803,6 +803,34 @@ def test_schedule_swapped_cannot_swap_in():
assert
len
(
output
.
prefill_seq_groups
)
==
0
assert
len
(
output
.
prefill_seq_groups
)
==
0
def test_infeasible_swap():
    """Swapped groups whose swap-in status is NEVER are reported infeasible.

    Two sequence groups are allocated, swapped out, and queued. With
    ``can_swap_in`` mocked to return ``AllocStatus.NEVER``,
    ``_schedule_swapped`` must leave nothing in the swapped queue, report
    both groups in ``infeasible_seq_groups``, schedule nothing, and consume
    no budget.
    """
    scheduler = initialize_scheduler()
    swapped = deque()
    policy = PolicyFactory.get_policy(policy_name="fcfs")
    curr_loras = None
    blocks_to_swap_out = {}

    # Build two swapped-out sequence groups (prompt length 60, best_of=2).
    for _iteration in range(2):
        _seq, seq_group = create_dummy_prompt("1",
                                              prompt_length=60,
                                              best_of=2)
        scheduler._allocate_and_set_running(seq_group)
        append_new_token_seq_group(60, seq_group, 1)
        scheduler._swap_out(seq_group, blocks_to_swap_out)
        swapped.append(seq_group)

    # Make the block manager report that swap-in can never succeed.
    scheduler.block_manager.can_swap_in = MagicMock(
        return_value=AllocStatus.NEVER)

    budget = create_token_budget()
    remaining_swapped, output = scheduler._schedule_swapped(
        swapped, budget, curr_loras, policy)

    # Both groups leave the swapped queue and are flagged as infeasible.
    assert len(remaining_swapped) == 0
    assert len(output.infeasible_seq_groups) == 2
    # No budget is consumed and nothing is scheduled.
    assert budget.num_batched_tokens == 0
    assert budget.num_curr_seqs == 0
    assert len(output.decode_seq_groups) == 0
    assert len(output.prefill_seq_groups) == 0
def
test_schedule_swapped_blocks_to_copy
():
def
test_schedule_swapped_blocks_to_copy
():
scheduler
=
initialize_scheduler
()
scheduler
=
initialize_scheduler
()
swapped
=
deque
()
swapped
=
deque
()
...
...
vllm/core/block/cpu_gpu_block_allocator.py
View file @
0f8a9140
...
@@ -110,9 +110,8 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
...
@@ -110,9 +110,8 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
for
block_id
in
allocator
.
all_block_ids
:
for
block_id
in
allocator
.
all_block_ids
:
self
.
_block_ids_to_allocator
[
block_id
]
=
allocator
self
.
_block_ids_to_allocator
[
block_id
]
=
allocator
def
allocate_mutable
(
self
,
def
allocate_mutable
(
self
,
prev_block
:
Optional
[
Block
],
prev_block
:
Optional
[
Block
],
device
:
Device
)
->
Block
:
device
:
Optional
[
Device
]
=
None
)
->
Block
:
"""Allocates a new mutable block on the specified device.
"""Allocates a new mutable block on the specified device.
Args:
Args:
...
@@ -123,13 +122,10 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
...
@@ -123,13 +122,10 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
Returns:
Returns:
Block: The newly allocated mutable block.
Block: The newly allocated mutable block.
"""
"""
assert
device
is
not
None
return
self
.
_allocators
[
device
].
allocate_mutable
(
prev_block
)
return
self
.
_allocators
[
device
].
allocate_mutable
(
prev_block
)
def
allocate_immutable
(
self
,
def
allocate_immutable
(
self
,
prev_block
:
Optional
[
Block
],
prev_block
:
Optional
[
Block
],
token_ids
:
List
[
int
],
device
:
Device
)
->
Block
:
token_ids
:
List
[
int
],
device
:
Optional
[
Device
]
=
None
)
->
Block
:
"""Allocates a new immutable block with the provided token IDs on the
"""Allocates a new immutable block with the provided token IDs on the
specified device.
specified device.
...
@@ -144,7 +140,6 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
...
@@ -144,7 +140,6 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
Block: The newly allocated immutable block containing the provided
Block: The newly allocated immutable block containing the provided
token IDs.
token IDs.
"""
"""
assert
device
is
not
None
return
self
.
_allocators
[
device
].
allocate_immutable
(
return
self
.
_allocators
[
device
].
allocate_immutable
(
prev_block
,
token_ids
)
prev_block
,
token_ids
)
...
@@ -175,7 +170,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
...
@@ -175,7 +170,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
allocator
=
self
.
_block_ids_to_allocator
[
block_id
]
allocator
=
self
.
_block_ids_to_allocator
[
block_id
]
return
allocator
.
fork
(
last_block
)
return
allocator
.
fork
(
last_block
)
def
get_num_free_blocks
(
self
,
device
:
Optional
[
Device
]
=
Non
e
)
->
int
:
def
get_num_free_blocks
(
self
,
device
:
Devic
e
)
->
int
:
"""Returns the number of free blocks available on the specified device.
"""Returns the number of free blocks available on the specified device.
Args:
Args:
...
@@ -185,9 +180,11 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
...
@@ -185,9 +180,11 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
Returns:
Returns:
int: The number of free blocks available on the specified device.
int: The number of free blocks available on the specified device.
"""
"""
assert
device
is
not
None
return
self
.
_allocators
[
device
].
get_num_free_blocks
()
return
self
.
_allocators
[
device
].
get_num_free_blocks
()
def
get_num_total_blocks
(
self
,
device
:
Device
)
->
int
:
return
self
.
_allocators
[
device
].
get_num_total_blocks
()
def
clear_copy_on_writes
(
self
)
->
Dict
[
int
,
List
[
int
]]:
def
clear_copy_on_writes
(
self
)
->
Dict
[
int
,
List
[
int
]]:
"""Clears the copy-on-write (CoW) state and returns the mapping of
"""Clears the copy-on-write (CoW) state and returns the mapping of
source to destination block IDs.
source to destination block IDs.
...
...
vllm/core/block/interfaces.py
View file @
0f8a9140
...
@@ -108,6 +108,10 @@ class BlockAllocator(ABC):
...
@@ -108,6 +108,10 @@ class BlockAllocator(ABC):
def
fork
(
self
,
last_block
:
Block
)
->
List
[
Block
]:
def
fork
(
self
,
last_block
:
Block
)
->
List
[
Block
]:
pass
pass
@
abstractmethod
def
get_num_total_blocks
(
self
)
->
int
:
pass
@
abstractmethod
@
abstractmethod
def
get_num_free_blocks
(
self
)
->
int
:
def
get_num_free_blocks
(
self
)
->
int
:
pass
pass
...
@@ -152,20 +156,21 @@ class BlockAllocator(ABC):
...
@@ -152,20 +156,21 @@ class BlockAllocator(ABC):
class
DeviceAwareBlockAllocator
(
ABC
):
class
DeviceAwareBlockAllocator
(
ABC
):
@
abstractmethod
@
abstractmethod
def
allocate_mutable
(
self
,
def
allocate_mutable
(
self
,
prev_block
:
Optional
[
Block
],
prev_block
:
Optional
[
Block
],
device
:
Device
)
->
Block
:
device
:
Optional
[
Device
]
=
None
)
->
Block
:
pass
@
abstractmethod
def
allocate_immutable
(
self
,
prev_block
:
Optional
[
Block
],
token_ids
:
List
[
int
],
device
:
Device
)
->
Block
:
pass
pass
@
abstractmethod
@
abstractmethod
def
allocate_immutable
(
self
,
def
get_num_free_blocks
(
self
,
device
:
Device
)
->
int
:
prev_block
:
Optional
[
Block
],
token_ids
:
List
[
int
],
device
:
Optional
[
Device
]
=
None
)
->
Block
:
pass
pass
@
abstractmethod
@
abstractmethod
def
get_num_
free
_blocks
(
self
,
device
:
Optional
[
Device
]
=
Non
e
)
->
int
:
def
get_num_
total
_blocks
(
self
,
device
:
Devic
e
)
->
int
:
pass
pass
@
abstractmethod
@
abstractmethod
...
...
vllm/core/block/naive_block.py
View file @
0f8a9140
...
@@ -133,10 +133,12 @@ class NaiveBlockAllocator(BlockAllocator):
...
@@ -133,10 +133,12 @@ class NaiveBlockAllocator(BlockAllocator):
return
forked_blocks
return
forked_blocks
def
get_num_free_blocks
(
self
,
device
:
Optional
[
Device
]
=
None
)
->
int
:
def
get_num_free_blocks
(
self
)
->
int
:
assert
device
is
None
return
len
(
self
.
_free_block_indices
)
return
len
(
self
.
_free_block_indices
)
def
get_num_total_blocks
(
self
)
->
int
:
return
len
(
self
.
_all_block_indices
)
def
_allocate_new_block_id
(
self
)
->
BlockId
:
def
_allocate_new_block_id
(
self
)
->
BlockId
:
if
not
self
.
_free_block_indices
:
if
not
self
.
_free_block_indices
:
raise
BlockAllocator
.
NoFreeBlocksError
()
raise
BlockAllocator
.
NoFreeBlocksError
()
...
...
vllm/core/block/prefix_caching_block.py
View file @
0f8a9140
...
@@ -285,6 +285,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
...
@@ -285,6 +285,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
return
self
.
_hashless_allocator
.
get_num_free_blocks
(
return
self
.
_hashless_allocator
.
get_num_free_blocks
(
)
+
self
.
evictor
.
num_blocks
)
+
self
.
evictor
.
num_blocks
def
get_num_total_blocks
(
self
)
->
int
:
return
self
.
_hashless_allocator
.
get_num_total_blocks
()
@
property
@
property
def
all_block_ids
(
self
)
->
FrozenSet
[
int
]:
def
all_block_ids
(
self
)
->
FrozenSet
[
int
]:
return
self
.
_hashless_allocator
.
all_block_ids
return
self
.
_hashless_allocator
.
all_block_ids
...
...
vllm/core/block_manager_v1.py
View file @
0f8a9140
...
@@ -47,6 +47,10 @@ class BlockAllocatorBase(ABC):
...
@@ -47,6 +47,10 @@ class BlockAllocatorBase(ABC):
def
get_num_free_blocks
(
self
)
->
int
:
def
get_num_free_blocks
(
self
)
->
int
:
pass
pass
@
abstractmethod
def
get_num_total_blocks
(
self
)
->
int
:
pass
@
abstractmethod
@
abstractmethod
def
contains_block
(
self
,
block_hash
:
int
)
->
bool
:
def
contains_block
(
self
,
block_hash
:
int
)
->
bool
:
pass
pass
...
@@ -131,6 +135,9 @@ class CachedBlockAllocator(BlockAllocatorBase):
...
@@ -131,6 +135,9 @@ class CachedBlockAllocator(BlockAllocatorBase):
return
(
self
.
num_blocks
-
self
.
current_num_blocks
+
return
(
self
.
num_blocks
-
self
.
current_num_blocks
+
self
.
evictor
.
num_blocks
)
self
.
evictor
.
num_blocks
)
def
get_num_total_blocks
(
self
)
->
int
:
return
self
.
num_blocks
def
contains_block
(
self
,
block_hash
:
int
)
->
bool
:
def
contains_block
(
self
,
block_hash
:
int
)
->
bool
:
return
block_hash
in
self
.
cached_blocks
or
block_hash
in
self
.
evictor
return
block_hash
in
self
.
cached_blocks
or
block_hash
in
self
.
evictor
...
@@ -190,6 +197,9 @@ class UncachedBlockAllocator(BlockAllocatorBase):
...
@@ -190,6 +197,9 @@ class UncachedBlockAllocator(BlockAllocatorBase):
def
get_num_free_blocks
(
self
)
->
int
:
def
get_num_free_blocks
(
self
)
->
int
:
return
len
(
self
.
free_blocks
)
return
len
(
self
.
free_blocks
)
def
get_num_total_blocks
(
self
)
->
int
:
return
self
.
num_blocks
def
contains_block
(
self
,
block_hash
:
int
)
->
bool
:
def
contains_block
(
self
,
block_hash
:
int
)
->
bool
:
raise
NotImplementedError
(
raise
NotImplementedError
(
"Invalid codepath for uncached block allocator."
)
"Invalid codepath for uncached block allocator."
)
...
@@ -444,7 +454,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
...
@@ -444,7 +454,7 @@ class BlockSpaceManagerV1(BlockSpaceManager):
def
can_swap_in
(
self
,
def
can_swap_in
(
self
,
seq_group
:
SequenceGroup
,
seq_group
:
SequenceGroup
,
num_lookahead_slots
:
int
=
0
)
->
bool
:
num_lookahead_slots
:
int
=
0
)
->
AllocStatus
:
assert
(
num_lookahead_slots
==
0
assert
(
num_lookahead_slots
==
0
),
"BlockSpaceManagerV1 does not support lookahead allocation"
),
"BlockSpaceManagerV1 does not support lookahead allocation"
blocks
=
self
.
_get_physical_blocks
(
seq_group
)
blocks
=
self
.
_get_physical_blocks
(
seq_group
)
...
@@ -454,7 +464,12 @@ class BlockSpaceManagerV1(BlockSpaceManager):
...
@@ -454,7 +464,12 @@ class BlockSpaceManagerV1(BlockSpaceManager):
# at least one free block right after the swap-in.
# at least one free block right after the swap-in.
# NOTE: This should match the logic in can_append_slot().
# NOTE: This should match the logic in can_append_slot().
num_required_blocks
=
len
(
blocks
)
+
num_swapped_seqs
num_required_blocks
=
len
(
blocks
)
+
num_swapped_seqs
return
num_free_blocks
-
num_required_blocks
>=
self
.
watermark_blocks
if
self
.
gpu_allocator
.
get_num_total_blocks
()
<
num_required_blocks
:
return
AllocStatus
.
NEVER
elif
num_free_blocks
-
num_required_blocks
>=
self
.
watermark_blocks
:
return
AllocStatus
.
OK
else
:
return
AllocStatus
.
LATER
def
swap_in
(
self
,
def
swap_in
(
self
,
seq_group
:
SequenceGroup
,
seq_group
:
SequenceGroup
,
...
...
vllm/core/block_manager_v2.py
View file @
0f8a9140
...
@@ -238,8 +238,8 @@ class BlockSpaceManagerV2(BlockSpaceManager):
...
@@ -238,8 +238,8 @@ class BlockSpaceManagerV2(BlockSpaceManager):
self
.
block_tables
[
child_seq
.
seq_id
]
=
src_block_table
.
fork
()
self
.
block_tables
[
child_seq
.
seq_id
]
=
src_block_table
.
fork
()
def
can_swap_in
(
self
,
seq_group
:
SequenceGroup
,
def
can_swap_in
(
self
,
seq_group
:
SequenceGroup
,
num_lookahead_slots
:
int
)
->
bool
:
num_lookahead_slots
:
int
)
->
AllocStatus
:
return
False
return
AllocStatus
.
LATER
def
swap_in
(
self
,
seq_group
:
SequenceGroup
,
def
swap_in
(
self
,
seq_group
:
SequenceGroup
,
num_lookahead_slots
:
int
)
->
Dict
[
int
,
int
]:
num_lookahead_slots
:
int
)
->
Dict
[
int
,
int
]:
...
...
vllm/core/interfaces.py
View file @
0f8a9140
...
@@ -63,7 +63,7 @@ class BlockSpaceManager(ABC):
...
@@ -63,7 +63,7 @@ class BlockSpaceManager(ABC):
@
abstractmethod
@
abstractmethod
def
can_swap_in
(
self
,
seq_group
:
SequenceGroup
,
def
can_swap_in
(
self
,
seq_group
:
SequenceGroup
,
num_lookahead_slots
:
int
)
->
bool
:
num_lookahead_slots
:
int
)
->
AllocStatus
:
pass
pass
@
abstractmethod
@
abstractmethod
...
...
vllm/core/scheduler.py
View file @
0f8a9140
...
@@ -210,6 +210,8 @@ class SchedulerSwappedInOutputs:
...
@@ -210,6 +210,8 @@ class SchedulerSwappedInOutputs:
blocks_to_copy
:
Dict
[
int
,
List
[
int
]]
blocks_to_copy
:
Dict
[
int
,
List
[
int
]]
# The number of slots for lookahead decoding.
# The number of slots for lookahead decoding.
num_lookahead_slots
:
int
num_lookahead_slots
:
int
# Infeasible sequence groups.
infeasible_seq_groups
:
List
[
SequenceGroup
]
@
classmethod
@
classmethod
def
create_empty
(
cls
)
->
"SchedulerSwappedInOutputs"
:
def
create_empty
(
cls
)
->
"SchedulerSwappedInOutputs"
:
...
@@ -219,6 +221,7 @@ class SchedulerSwappedInOutputs:
...
@@ -219,6 +221,7 @@ class SchedulerSwappedInOutputs:
blocks_to_swap_in
=
{},
blocks_to_swap_in
=
{},
blocks_to_copy
=
{},
blocks_to_copy
=
{},
num_lookahead_slots
=
0
,
num_lookahead_slots
=
0
,
infeasible_seq_groups
=
[],
)
)
...
@@ -511,14 +514,26 @@ class Scheduler:
...
@@ -511,14 +514,26 @@ class Scheduler:
prefill_seq_groups
:
List
[
ScheduledSequenceGroup
]
=
[]
prefill_seq_groups
:
List
[
ScheduledSequenceGroup
]
=
[]
now
=
time
.
time
()
now
=
time
.
time
()
swapped_queue
=
policy
.
sort_by_priority
(
now
,
swapped_queue
)
swapped_queue
=
policy
.
sort_by_priority
(
now
,
swapped_queue
)
infeasible_seq_groups
:
List
[
SequenceGroup
]
=
[]
leftover_swapped
:
Deque
[
SequenceGroup
]
=
deque
()
leftover_swapped
:
Deque
[
SequenceGroup
]
=
deque
()
while
swapped_queue
:
while
swapped_queue
:
seq_group
=
swapped_queue
[
0
]
seq_group
=
swapped_queue
[
0
]
# If the sequence group cannot be swapped in, stop.
# If the sequence group cannot be swapped in, stop.
if
not
self
.
block_manager
.
can_swap_in
(
seq_group
):
alloc_status
=
self
.
block_manager
.
can_swap_in
(
seq_group
)
if
alloc_status
==
AllocStatus
.
LATER
:
break
break
elif
alloc_status
==
AllocStatus
.
NEVER
:
logger
.
warning
(
"Failing the request %s because there's not enough kv "
"cache blocks to run the entire sequence."
,
seq_group
.
request_id
)
for
seq
in
seq_group
.
get_seqs
():
seq
.
status
=
SequenceStatus
.
FINISHED_IGNORED
infeasible_seq_groups
.
append
(
seq_group
)
swapped_queue
.
popleft
()
continue
lora_int_id
=
0
lora_int_id
=
0
if
self
.
lora_enabled
:
if
self
.
lora_enabled
:
...
@@ -569,7 +584,9 @@ class Scheduler:
...
@@ -569,7 +584,9 @@ class Scheduler:
blocks_to_swap_in
=
blocks_to_swap_in
,
blocks_to_swap_in
=
blocks_to_swap_in
,
blocks_to_copy
=
blocks_to_copy
,
blocks_to_copy
=
blocks_to_copy
,
num_lookahead_slots
=
self
.
_get_num_lookahead_slots
(
num_lookahead_slots
=
self
.
_get_num_lookahead_slots
(
is_prefill
=
False
))
is_prefill
=
False
),
infeasible_seq_groups
=
infeasible_seq_groups
,
)
def
_schedule_prefills
(
def
_schedule_prefills
(
self
,
self
,
...
@@ -777,7 +794,8 @@ class Scheduler:
...
@@ -777,7 +794,8 @@ class Scheduler:
blocks_to_swap_out
=
running_scheduled
.
blocks_to_swap_out
,
blocks_to_swap_out
=
running_scheduled
.
blocks_to_swap_out
,
blocks_to_copy
=
merge_dicts
(
running_scheduled
.
blocks_to_copy
,
blocks_to_copy
=
merge_dicts
(
running_scheduled
.
blocks_to_copy
,
swapped_in
.
blocks_to_copy
),
swapped_in
.
blocks_to_copy
),
ignored_seq_groups
=
prefills
.
ignored_seq_groups
,
ignored_seq_groups
=
prefills
.
ignored_seq_groups
+
swapped_in
.
infeasible_seq_groups
,
num_lookahead_slots
=
running_scheduled
.
num_lookahead_slots
,
num_lookahead_slots
=
running_scheduled
.
num_lookahead_slots
,
)
)
...
@@ -893,15 +911,6 @@ class Scheduler:
...
@@ -893,15 +911,6 @@ class Scheduler:
num_lookahead_slots
=
self
.
_get_num_lookahead_slots
(
is_prefill
),
num_lookahead_slots
=
self
.
_get_num_lookahead_slots
(
is_prefill
),
)
)
def
_can_swap_in
(
self
,
seq_group
:
SequenceGroup
)
->
bool
:
# Swapping in is considered decode.
is_prefill
=
False
return
self
.
block_manager
.
can_swap_in
(
seq_group
=
seq_group
,
num_lookahead_slots
=
self
.
_get_num_lookahead_slots
(
is_prefill
),
)
def
schedule
(
self
)
->
Tuple
[
List
[
SequenceGroupMetadata
],
SchedulerOutputs
]:
def
schedule
(
self
)
->
Tuple
[
List
[
SequenceGroupMetadata
],
SchedulerOutputs
]:
# Schedule sequence groups.
# Schedule sequence groups.
# This function call changes the internal states of the scheduler
# This function call changes the internal states of the scheduler
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment