xdb4_94051 / vllm · Commit 3d4ceb29

Fix hanging in the scheduler caused by long prompts (#1534)

Authored Nov 21, 2023 by 陈序; committed by GitHub Nov 20, 2023 (unverified signature).
Parent: f5a37c6c
Showing 2 changed files with 36 additions and 5 deletions (+36 −5)
vllm/core/block_manager.py  +24 −3
vllm/core/scheduler.py      +12 −2
vllm/core/block_manager.py (view file @ 3d4ceb29)

```diff
 """A block manager that manages token blocks."""
+import enum
 from typing import Dict, List, Optional, Set, Tuple
 
 from vllm.block import PhysicalTokenBlock
...
@@ -54,6 +55,20 @@ class BlockAllocator:
 BlockTable = List[PhysicalTokenBlock]
 
 
+class AllocStatus(enum.Enum):
+    """Result for BlockSpaceManager.can_allocate
+
+    1. Ok: seq_group can be allocated now.
+    2. Later: seq_group cannot be allocated now;
+       the allocator's capacity is larger than what the seq_group requires.
+    3. Never: seq_group can never be allocated;
+       the seq_group is too large to be allocated on the GPU.
+    """
+    OK = enum.auto()
+    LATER = enum.auto()
+    NEVER = enum.auto()
+
+
 class BlockSpaceManager:
     """Manages the mapping between logical and physical token blocks."""
...
@@ -86,7 +101,7 @@ class BlockSpaceManager:
         # Mapping: seq_id -> BlockTable.
         self.block_tables: Dict[int, BlockTable] = {}
 
-    def can_allocate(self, seq_group: SequenceGroup) -> bool:
+    def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         # FIXME(woosuk): Here we assume that all sequences in the group share
         # the same prompt. This may not be true for preempted sequences.
         seq = seq_group.get_seqs()[0]
...
@@ -95,9 +110,15 @@ class BlockSpaceManager:
             num_required_blocks = min(num_required_blocks,
                                       self.block_sliding_window)
         num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
         # Use watermark to avoid frequent cache eviction.
-        return (num_free_gpu_blocks - num_required_blocks >=
-                self.watermark_blocks)
+        if (self.num_total_gpu_blocks - num_required_blocks <
+                self.watermark_blocks):
+            return AllocStatus.NEVER
+        if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
+            return AllocStatus.OK
+        else:
+            return AllocStatus.LATER
 
     def allocate(self, seq_group: SequenceGroup) -> None:
         # NOTE: Here we assume that all sequences in the group have the same
...
```
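To make the new three-way result concrete, here is a minimal, self-contained sketch of the watermark classification. It mirrors the patched can_allocate() logic, but the block counts and watermark below are illustrative assumptions, not values from this commit:

```python
import enum


class AllocStatus(enum.Enum):
    OK = enum.auto()
    LATER = enum.auto()
    NEVER = enum.auto()


def classify(num_required_blocks: int, num_free_gpu_blocks: int,
             num_total_gpu_blocks: int, watermark_blocks: int) -> AllocStatus:
    # A prompt that cannot fit even into an *empty* cache is NEVER;
    # otherwise it is OK when enough blocks are free above the watermark,
    # and LATER when it merely has to wait for blocks to be freed.
    if num_total_gpu_blocks - num_required_blocks < watermark_blocks:
        return AllocStatus.NEVER
    if num_free_gpu_blocks - num_required_blocks >= watermark_blocks:
        return AllocStatus.OK
    return AllocStatus.LATER


# Hypothetical cache: 512 GPU blocks of 16 tokens each, watermark of 5 blocks.
# A 10,000-token prompt needs 625 blocks and can never fit:
print(classify(625, 300, 512, 5))  # AllocStatus.NEVER
print(classify(200, 100, 512, 5))  # AllocStatus.LATER: wait for blocks to free up
print(classify(200, 400, 512, 5))  # AllocStatus.OK
```

The key change is the first check against total capacity rather than currently free blocks: under the old boolean return, an impossible request was indistinguishable from one that merely had to wait, so the scheduler kept retrying it.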
vllm/core/scheduler.py (view file @ 3d4ceb29)

```diff
...
@@ -3,7 +3,7 @@ import time
 from typing import Dict, Iterable, List, Optional, Tuple, Union
 
 from vllm.config import CacheConfig, SchedulerConfig
-from vllm.core.block_manager import BlockSpaceManager
+from vllm.core.block_manager import AllocStatus, BlockSpaceManager
 from vllm.core.policy import PolicyFactory
 from vllm.logger import init_logger
 from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
...
@@ -154,8 +154,18 @@ class Scheduler:
                 continue
 
             # If the sequence group cannot be allocated, stop.
-            if not self.block_manager.can_allocate(seq_group):
+            can_allocate = self.block_manager.can_allocate(seq_group)
+            if can_allocate == AllocStatus.LATER:
                 break
+            elif can_allocate == AllocStatus.NEVER:
+                logger.warning(
+                    f"Input prompt ({num_prompt_tokens} tokens) is too long"
+                    f" and exceeds the capacity of block_manager")
+                for seq in seq_group.get_seqs():
+                    seq.status = SequenceStatus.FINISHED_IGNORED
+                ignored_seq_groups.append(seq_group)
+                self.waiting.pop(0)
+                continue
 
             # If the number of batched tokens exceeds the limit, stop.
             new_seq_lens = seq_lens + [num_prompt_tokens]
...
```
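To show why that split unblocks the queue, here is a hedged, self-contained sketch of the scheduler-side handling. FakeSeqGroup, the plain-string statuses, and all numbers are hypothetical stand-ins for vLLM's SequenceGroup, SequenceStatus, and block-manager state:

```python
import enum
from collections import deque
from dataclasses import dataclass


class AllocStatus(enum.Enum):
    OK = enum.auto()
    LATER = enum.auto()
    NEVER = enum.auto()


@dataclass
class FakeSeqGroup:               # stand-in for vllm.sequence.SequenceGroup
    num_required_blocks: int
    status: str = "WAITING"


NUM_TOTAL_GPU_BLOCKS = 512        # illustrative capacity
WATERMARK_BLOCKS = 5
NUM_FREE_GPU_BLOCKS = 128         # pretend most blocks are already in use


def can_allocate(group: FakeSeqGroup) -> AllocStatus:
    if NUM_TOTAL_GPU_BLOCKS - group.num_required_blocks < WATERMARK_BLOCKS:
        return AllocStatus.NEVER
    if NUM_FREE_GPU_BLOCKS - group.num_required_blocks >= WATERMARK_BLOCKS:
        return AllocStatus.OK
    return AllocStatus.LATER


waiting = deque([FakeSeqGroup(625), FakeSeqGroup(32)])  # oversized prompt first
scheduled, ignored = [], []

while waiting:
    group = waiting[0]
    status = can_allocate(group)
    if status == AllocStatus.LATER:
        break                                  # retry on a future scheduling step
    if status == AllocStatus.NEVER:
        group.status = "FINISHED_IGNORED"      # stand-in for SequenceStatus.FINISHED_IGNORED
        ignored.append(waiting.popleft())      # drop it instead of blocking the queue
        continue
    scheduled.append(waiting.popleft())

print([g.num_required_blocks for g in scheduled])  # [32]
```

LATER still breaks out of the loop so the group is retried on a later scheduling step, while NEVER pops the group, marks it FINISHED_IGNORED, and continues. Before this commit, an oversized prompt at the head of the waiting queue returned a permanent False and pinned the queue forever, which is the hang the commit fixes.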