Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7206ce4c
Unverified
Commit
7206ce4c
authored
Jan 22, 2025
by
Cody Yu
Committed by
GitHub
Jan 22, 2025
Browse files
[Core] Support `reset_prefix_cache` (#12284)
parent
96f6a759
Changes
27
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
228 additions
and
16 deletions
+228
-16
tests/core/block/test_prefix_caching_block.py
tests/core/block/test_prefix_caching_block.py
+38
-0
tests/v1/core/test_prefix_caching.py
tests/v1/core/test_prefix_caching.py
+39
-0
vllm/core/block/cpu_gpu_block_allocator.py
vllm/core/block/cpu_gpu_block_allocator.py
+7
-0
vllm/core/block/interfaces.py
vllm/core/block/interfaces.py
+10
-0
vllm/core/block/naive_block.py
vllm/core/block/naive_block.py
+14
-5
vllm/core/block/prefix_caching_block.py
vllm/core/block/prefix_caching_block.py
+43
-1
vllm/core/block_manager.py
vllm/core/block_manager.py
+3
-0
vllm/core/interfaces.py
vllm/core/interfaces.py
+5
-0
vllm/core/placeholder_block_space_manager.py
vllm/core/placeholder_block_space_manager.py
+3
-0
vllm/core/scheduler.py
vllm/core/scheduler.py
+3
-0
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+3
-0
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+8
-0
vllm/engine/multiprocessing/__init__.py
vllm/engine/multiprocessing/__init__.py
+6
-1
vllm/engine/multiprocessing/client.py
vllm/engine/multiprocessing/client.py
+10
-2
vllm/engine/multiprocessing/engine.py
vllm/engine/multiprocessing/engine.py
+8
-2
vllm/engine/protocol.py
vllm/engine/protocol.py
+5
-0
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+4
-0
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+12
-0
vllm/envs.py
vllm/envs.py
+7
-0
vllm/executor/executor_base.py
vllm/executor/executor_base.py
+0
-5
No files found.
tests/core/block/test_prefix_caching_block.py
View file @
7206ce4c
...
@@ -796,6 +796,44 @@ class TestPrefixCachingBlockAllocator:
...
@@ -796,6 +796,44 @@ class TestPrefixCachingBlockAllocator:
block_hashes
=
block_hashes_seq1
)
block_hashes
=
block_hashes_seq1
)
assert
len
(
cached_blocks
)
==
len
(
blocks_seq1
)
-
num_evicted_blocks
assert
len
(
cached_blocks
)
==
len
(
blocks_seq1
)
-
num_evicted_blocks
# Test reset prefix cache
@staticmethod
@pytest.mark.parametrize("num_blocks", [10])
@pytest.mark.parametrize("block_size", [16])
def test_reset_prefix_cache(num_blocks: int, block_size: int):
    """Check that the prefix cache resets only once every block is freed.

    Two immutable chains are built from the same ``3 * block_size`` token
    ids on one allocator. The reset must be refused while the second chain
    is still held, and must succeed (hit rate back to 0.0) once both
    chains have been freed.
    """
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    token_ids = list(range(3 * block_size))

    # Build the same chain twice, one after the other, on the allocator.
    chains = []
    for _ in range(2):
        chains.append(
            TestPrefixCachingBlockAllocator.create_immutable_chain(
                block_size=block_size,
                token_ids=token_ids,
                allocator=allocator,
            ))
    released_first, released_second = chains

    # Free each block in the first chain; the second chain is still held.
    for blk in released_first:
        allocator.free(blk)

    # Failed to reset prefix cache because some blocks are not freed yet.
    reset_ok = allocator.reset_prefix_cache()
    assert not reset_ok
    assert allocator.get_prefix_cache_hit_rate() > 0.0

    # Free each block in the second chain.
    for blk in released_second:
        allocator.free(blk)

    # Nothing is held any more, so the reset goes through and the
    # hit-rate metric starts from scratch.
    reset_ok = allocator.reset_prefix_cache()
    assert reset_ok
    assert allocator.get_prefix_cache_hit_rate() == 0.0
@
staticmethod
@
staticmethod
def
create_immutable_chain
(
def
create_immutable_chain
(
block_size
:
int
,
block_size
:
int
,
...
...
tests/v1/core/test_prefix_caching.py
View file @
7206ce4c
...
@@ -587,3 +587,42 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
...
@@ -587,3 +587,42 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
assert
{
block
.
ref_cnt
for
block
in
block_part1
[:
3
]}
==
{
1
}
assert
{
block
.
ref_cnt
for
block
in
block_part1
[:
3
]}
==
{
1
}
# Block 3-5 are free.
# Block 3-5 are free.
assert
{
block
.
ref_cnt
for
block
in
block_part1
[
3
:]}
==
{
0
}
assert
{
block
.
ref_cnt
for
block
in
block_part1
[
3
:]}
==
{
0
}
def test_reset_prefix_cache():
    """Check KVCacheManager.reset_prefix_cache before and after freeing.

    Two requests sharing a three-block prefix are allocated. The reset must
    be refused while they hold blocks, and after both are freed it must
    clear the hash-to-block mapping and every block's ``block_hash``.
    """
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )

    # Sixteen 0s, sixteen 1s, sixteen 2s: three full blocks at block_size=16.
    shared_prefix = [tok for tok in range(3) for _ in range(16)]

    first_request = make_request("0", shared_prefix + [3] * 7)
    allocated = manager.allocate_slots(first_request, 55, [])
    assert [blk.block_id for blk in allocated] == [0, 1, 2, 3]

    second_request = make_request("1", shared_prefix + [4] * 7)
    computed_blocks, _ = manager.get_computed_blocks(second_request)
    assert len(second_request.kv_block_hashes) == 3
    assert len(computed_blocks) == 3
    allocated = manager.allocate_slots(second_request, 7, computed_blocks)
    assert [blk.block_id for blk in allocated] == [4]

    # Failed to reset prefix cache because some blocks are not freed yet.
    assert not manager.reset_prefix_cache()
    assert manager.cached_block_hash_to_block

    # Free the blocks.
    for request in (first_request, second_request):
        manager.free(request)

    assert manager.reset_prefix_cache()
    assert not manager.cached_block_hash_to_block
    assert all([blk.block_hash is None for blk in manager.block_pool])
vllm/core/block/cpu_gpu_block_allocator.py
View file @
7206ce4c
...
@@ -339,6 +339,13 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
...
@@ -339,6 +339,13 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
assert
device
in
self
.
_allocators
assert
device
in
self
.
_allocators
return
self
.
_allocators
[
device
].
get_prefix_cache_hit_rate
()
return
self
.
_allocators
[
device
].
get_prefix_cache_hit_rate
()
def reset_prefix_cache(self) -> bool:
    """Reset prefix cache for all devices.

    Every allocator in ``self._allocators`` is asked to reset, even if an
    earlier one refuses. Chaining the calls with ``and`` would stop at the
    first failure and leave the remaining devices untouched, making the
    outcome depend on the iteration order of the mapping.

    Returns:
        bool: True only if every allocator reported a successful reset
        (vacuously True when there are no allocators).
    """
    # Build the full list first so that no call is short-circuited away.
    results = [
        allocator.reset_prefix_cache()
        for allocator in self._allocators.values()
    ]
    return all(results)
def
get_and_reset_swaps
(
self
)
->
List
[
Tuple
[
int
,
int
]]:
def
get_and_reset_swaps
(
self
)
->
List
[
Tuple
[
int
,
int
]]:
"""Returns and clears the mapping of source to destination block IDs.
"""Returns and clears the mapping of source to destination block IDs.
Will be called after every swapping operations for now, and after every
Will be called after every swapping operations for now, and after every
...
...
vllm/core/block/interfaces.py
View file @
7206ce4c
...
@@ -192,6 +192,11 @@ class BlockAllocator(ABC):
...
@@ -192,6 +192,11 @@ class BlockAllocator(ABC):
"""Prefix cache hit rate. -1 means not supported or disabled."""
"""Prefix cache hit rate. -1 means not supported or disabled."""
pass
pass
@abstractmethod
def reset_prefix_cache(self) -> bool:
    """Reset the prefix cache.

    Returns:
        bool: per the annotation; concrete allocators in this change
        return True when the reset succeeded.
    """
class
NoFreeBlocksError
(
ValueError
):
class
NoFreeBlocksError
(
ValueError
):
pass
pass
...
@@ -297,6 +302,11 @@ class DeviceAwareBlockAllocator(ABC):
...
@@ -297,6 +302,11 @@ class DeviceAwareBlockAllocator(ABC):
"""Prefix cache hit rate. -1 means not supported or disabled."""
"""Prefix cache hit rate. -1 means not supported or disabled."""
pass
pass
@abstractmethod
def reset_prefix_cache(self) -> bool:
    """Reset the prefix cache.

    Returns:
        bool: per the annotation; the implementation is left to
        subclasses.
    """
@
abstractmethod
@
abstractmethod
def
find_cached_blocks_prefix
(
def
find_cached_blocks_prefix
(
self
,
self
,
...
...
vllm/core/block/naive_block.py
View file @
7206ce4c
from
collections
import
deque
from
collections
import
deque
from
typing
import
Deque
,
FrozenSet
,
Iterable
,
List
,
Optional
,
Tuple
from
typing
import
Deque
,
FrozenSet
,
Iterable
,
List
,
Optional
,
Tuple
,
Union
from
vllm.core.block.common
import
(
BlockPool
,
CopyOnWriteTracker
,
RefCounter
,
from
vllm.core.block.common
import
(
BlockPool
,
CopyOnWriteTracker
,
RefCounter
,
get_all_blocks_recursively
)
get_all_blocks_recursively
)
...
@@ -136,16 +136,18 @@ class NaiveBlockAllocator(BlockAllocator):
...
@@ -136,16 +136,18 @@ class NaiveBlockAllocator(BlockAllocator):
self
.
_refcounter
.
incr
(
block_id
)
self
.
_refcounter
.
incr
(
block_id
)
return
block_id
return
block_id
def
_free_block_id
(
self
,
block
:
Block
)
->
None
:
def
_free_block_id
(
self
,
block
:
Union
[
Block
,
BlockId
])
->
None
:
if
isinstance
(
block
,
Block
):
block_id
=
block
.
block_id
block_id
=
block
.
block_id
block
.
block_id
=
None
else
:
block_id
=
block
assert
block_id
is
not
None
assert
block_id
is
not
None
refcount
=
self
.
_refcounter
.
decr
(
block_id
)
refcount
=
self
.
_refcounter
.
decr
(
block_id
)
if
refcount
==
0
:
if
refcount
==
0
:
self
.
_free_block_indices
.
appendleft
(
block_id
)
self
.
_free_block_indices
.
appendleft
(
block_id
)
block
.
block_id
=
None
def
free
(
self
,
block
:
Block
,
keep_block_object
:
bool
=
False
)
->
None
:
def
free
(
self
,
block
:
Block
,
keep_block_object
:
bool
=
False
)
->
None
:
# Release the physical block id
# Release the physical block id
self
.
_free_block_id
(
block
)
self
.
_free_block_id
(
block
)
...
@@ -154,6 +156,9 @@ class NaiveBlockAllocator(BlockAllocator):
...
@@ -154,6 +156,9 @@ class NaiveBlockAllocator(BlockAllocator):
if
not
keep_block_object
:
if
not
keep_block_object
:
self
.
_block_pool
.
free_block
(
block
)
self
.
_block_pool
.
free_block
(
block
)
def free_block_id(self, block_id: BlockId) -> None:
    """Release a block by its id rather than by ``Block`` object.

    Thin public wrapper that forwards ``block_id`` to
    ``self._free_block_id``.
    """
    self._free_block_id(block_id)
def
fork
(
self
,
last_block
:
Block
)
->
List
[
Block
]:
def
fork
(
self
,
last_block
:
Block
)
->
List
[
Block
]:
"""Creates a new sequence of blocks that shares the same underlying
"""Creates a new sequence of blocks that shares the same underlying
memory as the original sequence.
memory as the original sequence.
...
@@ -325,6 +330,10 @@ class NaiveBlockAllocator(BlockAllocator):
...
@@ -325,6 +330,10 @@ class NaiveBlockAllocator(BlockAllocator):
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
return
-
1
return
-
1
def reset_prefix_cache(self) -> bool:
    """Report a successful reset; the naive allocator has no prefix cache.

    Returns:
        bool: always True.
    """
    nothing_to_reset = True
    return nothing_to_reset
def
find_cached_blocks_prefix
(
self
,
block_hashes
:
List
[
int
])
->
List
[
int
]:
def
find_cached_blocks_prefix
(
self
,
block_hashes
:
List
[
int
])
->
List
[
int
]:
# Not applicable for naive block allocator.
# Not applicable for naive block allocator.
return
[]
return
[]
...
...
vllm/core/block/prefix_caching_block.py
View file @
7206ce4c
...
@@ -12,6 +12,7 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, Device,
...
@@ -12,6 +12,7 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, Device,
from
vllm.core.block.naive_block
import
(
BlockPool
,
NaiveBlock
,
from
vllm.core.block.naive_block
import
(
BlockPool
,
NaiveBlock
,
NaiveBlockAllocator
)
NaiveBlockAllocator
)
from
vllm.core.evictor
import
EvictionPolicy
,
Evictor
,
make_evictor
from
vllm.core.evictor
import
EvictionPolicy
,
Evictor
,
make_evictor
from
vllm.logger
import
init_logger
from
vllm.sequence
import
Sequence
from
vllm.sequence
import
Sequence
PrefixHash
=
int
PrefixHash
=
int
...
@@ -21,6 +22,8 @@ PrefixHash = int
...
@@ -21,6 +22,8 @@ PrefixHash = int
# then we know this block hasn't been accessed yet.
# then we know this block hasn't been accessed yet.
_DEFAULT_LAST_ACCESSED_TIME
=
-
1
_DEFAULT_LAST_ACCESSED_TIME
=
-
1
logger
=
init_logger
(
__name__
)
class
BlockTracker
:
class
BlockTracker
:
"""Used to track the status of a block inside the prefix caching allocator
"""Used to track the status of a block inside the prefix caching allocator
...
@@ -105,7 +108,8 @@ class PrefixCachingBlockAllocator(BlockAllocator):
...
@@ -105,7 +108,8 @@ class PrefixCachingBlockAllocator(BlockAllocator):
# Evictor used to maintain how we want to handle those computed blocks
# Evictor used to maintain how we want to handle those computed blocks
# if we find memory pressure is high.
# if we find memory pressure is high.
self
.
evictor
:
Evictor
=
make_evictor
(
eviction_policy
)
self
.
eviction_policy
=
eviction_policy
self
.
evictor
:
Evictor
=
make_evictor
(
self
.
eviction_policy
)
# We share the refcounter between allocators. This allows us to promote
# We share the refcounter between allocators. This allows us to promote
# blocks originally allocated in the hashless allocator to immutable
# blocks originally allocated in the hashless allocator to immutable
...
@@ -428,6 +432,44 @@ class PrefixCachingBlockAllocator(BlockAllocator):
...
@@ -428,6 +432,44 @@ class PrefixCachingBlockAllocator(BlockAllocator):
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
def
get_prefix_cache_hit_rate
(
self
)
->
float
:
return
self
.
metric_data
.
get_hit_rate
()
return
self
.
metric_data
.
get_hit_rate
()
def reset_prefix_cache(self) -> bool:
    """Reset the prefix cache.

    This may be used in RLHF flows to invalidate the prefix cache after
    the weights are updated, or to reset the prefix caching state for
    benchmarking.

    Returns:
        bool: True if the prefix cache is successfully reset,
        False otherwise.
    """
    in_use = self.get_num_total_blocks() - self.get_num_free_blocks()
    if in_use > 0:
        # Refuse to reset while any block is still allocated.
        logger.warning(
            "Failed to reset prefix cache because some "
            "blocks (%d) are not freed yet", in_use)
        return False

    # Free all blocks in the evictor: keep pulling evicted block ids and
    # hand each one back to the hashless allocator until none are left.
    while True:
        evicted_id = self._maybe_allocate_evicted_block_id()
        if evicted_id is None:
            break
        self._hashless_allocator.free_block_id(evicted_id)

    # Should not have any cached blocks because all blocks are evicted.
    assert not self._cached_blocks

    # Start over with a fresh evictor built from the stored policy.
    self.evictor = make_evictor(self.eviction_policy)

    # Replace every tracker entry with a fresh BlockTracker. Only existing
    # keys are reassigned, so the mapping keeps its size during iteration.
    for tracked_id in self._block_tracker:
        self._block_tracker[tracked_id] = BlockTracker()

    # Reset the metrics.
    self.metric_data = CacheMetricData()

    logger.info("Successfully reset prefix cache")
    return True
def
is_block_cached
(
self
,
block
:
Block
)
->
bool
:
def
is_block_cached
(
self
,
block
:
Block
)
->
bool
:
assert
block
.
content_hash
is
not
None
assert
block
.
content_hash
is
not
None
return
block
.
content_hash
in
self
.
_cached_blocks
return
block
.
content_hash
in
self
.
_cached_blocks
...
...
vllm/core/block_manager.py
View file @
7206ce4c
...
@@ -455,6 +455,9 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
...
@@ -455,6 +455,9 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
return
self
.
block_allocator
.
get_prefix_cache_hit_rate
(
device
)
return
self
.
block_allocator
.
get_prefix_cache_hit_rate
(
device
)
def reset_prefix_cache(self) -> bool:
    """Delegate the prefix-cache reset to ``self.block_allocator``.

    Returns:
        bool: whatever the block allocator's ``reset_prefix_cache``
        returns.
    """
    was_reset = self.block_allocator.reset_prefix_cache()
    return was_reset
def
_can_swap
(
self
,
def
_can_swap
(
self
,
seq_group
:
SequenceGroup
,
seq_group
:
SequenceGroup
,
device
:
Device
,
device
:
Device
,
...
...
vllm/core/interfaces.py
View file @
7206ce4c
...
@@ -122,6 +122,11 @@ class BlockSpaceManager(ABC):
...
@@ -122,6 +122,11 @@ class BlockSpaceManager(ABC):
"""Prefix cache hit rate. -1 means not supported or disabled."""
"""Prefix cache hit rate. -1 means not supported or disabled."""
pass
pass
@abstractmethod
def reset_prefix_cache(self) -> bool:
    """Reset the prefix cache for all devices.

    Returns:
        bool: per the annotation; the implementation is left to
        subclasses.
    """
@
abstractmethod
@
abstractmethod
def
get_num_cached_tokens
(
self
,
seq
:
Sequence
)
->
int
:
def
get_num_cached_tokens
(
self
,
seq
:
Sequence
)
->
int
:
pass
pass
vllm/core/placeholder_block_space_manager.py
View file @
7206ce4c
...
@@ -90,5 +90,8 @@ class PlaceholderBlockSpaceManager(BlockSpaceManager):
...
@@ -90,5 +90,8 @@ class PlaceholderBlockSpaceManager(BlockSpaceManager):
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
return
-
1
return
-
1
def reset_prefix_cache(self) -> bool:
    """Report a successful reset without doing any work.

    Returns:
        bool: always True.
    """
    nothing_to_reset = True
    return nothing_to_reset
def
get_num_cached_tokens
(
self
,
seq
:
Sequence
)
->
int
:
def
get_num_cached_tokens
(
self
,
seq
:
Sequence
)
->
int
:
return
0
return
0
vllm/core/scheduler.py
View file @
7206ce4c
...
@@ -504,6 +504,9 @@ class Scheduler:
...
@@ -504,6 +504,9 @@ class Scheduler:
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
def
get_prefix_cache_hit_rate
(
self
,
device
:
Device
)
->
float
:
return
self
.
block_manager
.
get_prefix_cache_hit_rate
(
device
)
return
self
.
block_manager
.
get_prefix_cache_hit_rate
(
device
)
def reset_prefix_cache(self) -> bool:
    """Delegate the prefix-cache reset to ``self.block_manager``.

    Returns:
        bool: whatever the block manager's ``reset_prefix_cache`` returns.
    """
    was_reset = self.block_manager.reset_prefix_cache()
    return was_reset
def
get_num_unfinished_seq_groups
(
self
)
->
int
:
def
get_num_unfinished_seq_groups
(
self
)
->
int
:
return
len
(
self
.
waiting
)
+
len
(
self
.
running
)
+
len
(
self
.
swapped
)
return
len
(
self
.
waiting
)
+
len
(
self
.
running
)
+
len
(
self
.
swapped
)
...
...
vllm/engine/async_llm_engine.py
View file @
7206ce4c
...
@@ -1182,6 +1182,9 @@ class AsyncLLMEngine(EngineClient):
...
@@ -1182,6 +1182,9 @@ class AsyncLLMEngine(EngineClient):
async
def
stop_profile
(
self
)
->
None
:
async
def
stop_profile
(
self
)
->
None
:
self
.
engine
.
stop_profile
()
self
.
engine
.
stop_profile
()
async def reset_prefix_cache(self) -> None:
    """Ask the wrapped engine to reset its prefix cache.

    The engine's return value is discarded; this coroutine always
    resolves to None.
    """
    engine = self.engine
    engine.reset_prefix_cache()
async
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
None
:
async
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
None
:
self
.
engine
.
add_lora
(
lora_request
)
self
.
engine
.
add_lora
(
lora_request
)
...
...
vllm/engine/llm_engine.py
View file @
7206ce4c
...
@@ -914,6 +914,14 @@ class LLMEngine:
...
@@ -914,6 +914,14 @@ class LLMEngine:
"""
"""
return
self
.
scheduler
[
virtual_engine
].
has_unfinished_seqs
()
return
self
.
scheduler
[
virtual_engine
].
has_unfinished_seqs
()
def reset_prefix_cache(self) -> bool:
    """Reset prefix cache for all devices.

    Every scheduler in ``self.scheduler`` is asked to reset, even if an
    earlier one refuses. Chaining the calls with ``and`` would stop at the
    first failure and silently skip the remaining schedulers.

    Returns:
        bool: True only if every scheduler reported a successful reset
        (vacuously True when there are no schedulers).
    """
    # Build the full list first so that no call is short-circuited away.
    results = [scheduler.reset_prefix_cache() for scheduler in self.scheduler]
    return all(results)
@
staticmethod
@
staticmethod
def
_process_sequence_group_outputs
(
def
_process_sequence_group_outputs
(
seq_group
:
SequenceGroup
,
seq_group
:
SequenceGroup
,
...
...
vllm/engine/multiprocessing/__init__.py
View file @
7206ce4c
...
@@ -121,6 +121,10 @@ class RPCUProfileRequest(Enum):
...
@@ -121,6 +121,10 @@ class RPCUProfileRequest(Enum):
STOP_PROFILE
=
2
STOP_PROFILE
=
2
class RPCResetPrefixCacheRequest(Enum):
    """RPC request type used to ask for a prefix-cache reset."""

    # The only request of this type.
    RESET_PREFIX_CACHE = 1
@
dataclass
@
dataclass
class
RPCLoadAdapterRequest
:
class
RPCLoadAdapterRequest
:
lora_request
:
LoRARequest
lora_request
:
LoRARequest
...
@@ -134,7 +138,8 @@ class RPCAdapterLoadedResponse:
...
@@ -134,7 +138,8 @@ class RPCAdapterLoadedResponse:
RPC_REQUEST_T
=
Union
[
RPCProcessRequest
,
RPCAbortRequest
,
RPCStartupRequest
,
RPC_REQUEST_T
=
Union
[
RPCProcessRequest
,
RPCAbortRequest
,
RPCStartupRequest
,
RPCUProfileRequest
,
RPCLoadAdapterRequest
]
RPCUProfileRequest
,
RPCLoadAdapterRequest
,
RPCResetPrefixCacheRequest
]
REQUEST_OUTPUTS_T
=
Union
[
List
[
RequestOutput
],
RPCAdapterLoadedResponse
,
REQUEST_OUTPUTS_T
=
Union
[
List
[
RequestOutput
],
RPCAdapterLoadedResponse
,
RPCError
]
RPCError
]
...
...
vllm/engine/multiprocessing/client.py
View file @
7206ce4c
...
@@ -27,8 +27,9 @@ from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT,
...
@@ -27,8 +27,9 @@ from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT,
VLLM_RPC_SUCCESS_STR
,
RPCAbortRequest
,
VLLM_RPC_SUCCESS_STR
,
RPCAbortRequest
,
RPCAdapterLoadedResponse
,
RPCError
,
RPCAdapterLoadedResponse
,
RPCError
,
RPCLoadAdapterRequest
,
RPCLoadAdapterRequest
,
RPCProcessRequest
,
RPCStartupRequest
,
RPCProcessRequest
,
RPCStartupResponse
,
RPCResetPrefixCacheRequest
,
RPCStartupRequest
,
RPCStartupResponse
,
RPCUProfileRequest
)
RPCUProfileRequest
)
from
vllm.engine.protocol
import
EngineClient
from
vllm.engine.protocol
import
EngineClient
# yapf: enable
# yapf: enable
...
@@ -675,6 +676,13 @@ class MQLLMEngineClient(EngineClient):
...
@@ -675,6 +676,13 @@ class MQLLMEngineClient(EngineClient):
await
self
.
_send_one_way_rpc_request
(
await
self
.
_send_one_way_rpc_request
(
request
=
RPCUProfileRequest
.
STOP_PROFILE
,
socket
=
self
.
input_socket
)
request
=
RPCUProfileRequest
.
STOP_PROFILE
,
socket
=
self
.
input_socket
)
async def reset_prefix_cache(self) -> None:
    """Reset the prefix cache.

    Sends ``RPCResetPrefixCacheRequest.RESET_PREFIX_CACHE`` as a one-way
    RPC request over ``self.input_socket``.
    """
    reset_request = RPCResetPrefixCacheRequest.RESET_PREFIX_CACHE
    await self._send_one_way_rpc_request(request=reset_request,
                                         socket=self.input_socket)
async
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
None
:
async
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
None
:
"""Load a new LoRA adapter into the engine for future requests."""
"""Load a new LoRA adapter into the engine for future requests."""
# Uses the same I/O as generate requests
# Uses the same I/O as generate requests
...
...
vllm/engine/multiprocessing/engine.py
View file @
7206ce4c
...
@@ -16,8 +16,9 @@ from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT,
...
@@ -16,8 +16,9 @@ from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT,
VLLM_RPC_SUCCESS_STR
,
RPCAbortRequest
,
VLLM_RPC_SUCCESS_STR
,
RPCAbortRequest
,
RPCAdapterLoadedResponse
,
RPCError
,
RPCAdapterLoadedResponse
,
RPCError
,
RPCLoadAdapterRequest
,
RPCLoadAdapterRequest
,
RPCProcessRequest
,
RPCStartupRequest
,
RPCProcessRequest
,
RPCStartupResponse
,
RPCResetPrefixCacheRequest
,
RPCStartupRequest
,
RPCStartupResponse
,
RPCUProfileRequest
)
RPCUProfileRequest
)
# yapf: enable
# yapf: enable
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -237,6 +238,8 @@ class MQLLMEngine:
...
@@ -237,6 +238,8 @@ class MQLLMEngine:
self
.
stop_profile
()
self
.
stop_profile
()
elif
isinstance
(
request
,
RPCLoadAdapterRequest
):
elif
isinstance
(
request
,
RPCLoadAdapterRequest
):
self
.
_handle_load_adapter_request
(
request
)
self
.
_handle_load_adapter_request
(
request
)
elif
isinstance
(
request
,
RPCResetPrefixCacheRequest
):
self
.
reset_prefix_cache
()
else
:
else
:
raise
ValueError
(
"Unknown RPCRequest Type: "
raise
ValueError
(
"Unknown RPCRequest Type: "
f
"
{
type
(
request
)
}
"
)
f
"
{
type
(
request
)
}
"
)
...
@@ -361,6 +364,9 @@ class MQLLMEngine:
...
@@ -361,6 +364,9 @@ class MQLLMEngine:
def
stop_profile
(
self
)
->
None
:
def
stop_profile
(
self
)
->
None
:
self
.
engine
.
stop_profile
()
self
.
engine
.
stop_profile
()
def reset_prefix_cache(self) -> bool:
    """Delegate the prefix-cache reset to ``self.engine``.

    Returns:
        bool: whatever the engine's ``reset_prefix_cache`` returns.
    """
    was_reset = self.engine.reset_prefix_cache()
    return was_reset
def
signal_handler
(
*
_
)
->
None
:
def
signal_handler
(
*
_
)
->
None
:
raise
KeyboardInterrupt
(
"MQLLMEngine terminated"
)
raise
KeyboardInterrupt
(
"MQLLMEngine terminated"
)
...
...
vllm/engine/protocol.py
View file @
7206ce4c
...
@@ -271,6 +271,11 @@ class EngineClient(ABC):
...
@@ -271,6 +271,11 @@ class EngineClient(ABC):
"""Start profiling the engine"""
"""Start profiling the engine"""
...
...
@abstractmethod
async def reset_prefix_cache(self) -> None:
    """Reset the prefix cache.

    Abstract coroutine; the implementation is left to concrete engine
    clients.
    """
    ...
@
abstractmethod
@
abstractmethod
async
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
None
:
async
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
None
:
"""Load a new LoRA adapter into the engine for future requests."""
"""Load a new LoRA adapter into the engine for future requests."""
...
...
vllm/entrypoints/llm.py
View file @
7206ce4c
...
@@ -1132,6 +1132,9 @@ class LLM:
...
@@ -1132,6 +1132,9 @@ class LLM:
def
stop_profile
(
self
)
->
None
:
def
stop_profile
(
self
)
->
None
:
self
.
llm_engine
.
stop_profile
()
self
.
llm_engine
.
stop_profile
()
def reset_prefix_cache(self) -> bool:
    """Delegate the prefix-cache reset to ``self.llm_engine``.

    Returns:
        bool: whatever the engine's ``reset_prefix_cache`` returns.
    """
    was_reset = self.llm_engine.reset_prefix_cache()
    return was_reset
def
sleep
(
self
,
level
:
int
=
1
):
def
sleep
(
self
,
level
:
int
=
1
):
"""
"""
Put the engine to sleep. The engine should not process any requests.
Put the engine to sleep. The engine should not process any requests.
...
@@ -1150,6 +1153,7 @@ class LLM:
...
@@ -1150,6 +1153,7 @@ class LLM:
where previous model weights are not needed. It reduces CPU memory
where previous model weights are not needed. It reduces CPU memory
pressure.
pressure.
"""
"""
self
.
reset_prefix_cache
()
self
.
llm_engine
.
sleep
(
level
=
level
)
self
.
llm_engine
.
sleep
(
level
=
level
)
def
wake_up
(
self
):
def
wake_up
(
self
):
...
...
vllm/entrypoints/openai/api_server.py
View file @
7206ce4c
...
@@ -518,6 +518,18 @@ TASK_HANDLERS: Dict[str, Dict[str, tuple]] = {
...
@@ -518,6 +518,18 @@ TASK_HANDLERS: Dict[str, Dict[str, tuple]] = {
},
},
}
}
# Development-only endpoint: the route is registered only when
# VLLM_SERVER_DEV_MODE is enabled.
if envs.VLLM_SERVER_DEV_MODE:

    @router.post("/reset_prefix_cache")
    async def reset_prefix_cache(raw_request: Request):
        """
        Reset the prefix cache. Note that we currently do not check if the
        prefix cache is successfully reset in the API server.
        """
        logger.info("Resetting prefix cache...")
        client = engine_client(raw_request)
        await client.reset_prefix_cache()
        # Always answer 200; the outcome of the reset is not inspected here.
        return Response(status_code=200)
@
router
.
post
(
"/invocations"
)
@
router
.
post
(
"/invocations"
)
async
def
invocations
(
raw_request
:
Request
):
async
def
invocations
(
raw_request
:
Request
):
...
...
vllm/envs.py
View file @
7206ce4c
...
@@ -72,6 +72,7 @@ if TYPE_CHECKING:
...
@@ -72,6 +72,7 @@ if TYPE_CHECKING:
VLLM_ENABLE_V1_MULTIPROCESSING
:
bool
=
True
VLLM_ENABLE_V1_MULTIPROCESSING
:
bool
=
True
VLLM_LOG_BATCHSIZE_INTERVAL
:
float
=
-
1
VLLM_LOG_BATCHSIZE_INTERVAL
:
float
=
-
1
VLLM_DISABLE_COMPILE_CACHE
:
bool
=
False
VLLM_DISABLE_COMPILE_CACHE
:
bool
=
False
VLLM_SERVER_DEV_MODE
:
bool
=
False
def
get_default_cache_root
():
def
get_default_cache_root
():
...
@@ -467,6 +468,12 @@ environment_variables: Dict[str, Callable[[], Any]] = {
...
@@ -467,6 +468,12 @@ environment_variables: Dict[str, Callable[[], Any]] = {
lambda
:
float
(
os
.
getenv
(
"VLLM_LOG_BATCHSIZE_INTERVAL"
,
"-1"
)),
lambda
:
float
(
os
.
getenv
(
"VLLM_LOG_BATCHSIZE_INTERVAL"
,
"-1"
)),
"VLLM_DISABLE_COMPILE_CACHE"
:
"VLLM_DISABLE_COMPILE_CACHE"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_DISABLE_COMPILE_CACHE"
,
"0"
))),
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_DISABLE_COMPILE_CACHE"
,
"0"
))),
# If set, vllm will run in development mode, which will enable
# some additional endpoints for developing and debugging,
# e.g. `/reset_prefix_cache`
"VLLM_SERVER_DEV_MODE"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_SERVER_DEV_MODE"
,
"0"
))),
}
}
# end-env-vars-definition
# end-env-vars-definition
...
...
vllm/executor/executor_base.py
View file @
7206ce4c
...
@@ -194,11 +194,6 @@ class ExecutorBase(ABC):
...
@@ -194,11 +194,6 @@ class ExecutorBase(ABC):
self
.
collective_rpc
(
"stop_profile"
)
self
.
collective_rpc
(
"stop_profile"
)
def
sleep
(
self
,
level
:
int
=
1
):
def
sleep
(
self
,
level
:
int
=
1
):
if
self
.
cache_config
.
enable_prefix_caching
:
# TODO: support sleep with prefix caching
# by resetting the prefix cache state,
# after https://github.com/vllm-project/vllm/pull/12284
raise
ValueError
(
"Cannot sleep when prefix caching is enabled."
)
self
.
collective_rpc
(
"sleep"
,
kwargs
=
dict
(
level
=
level
))
self
.
collective_rpc
(
"sleep"
,
kwargs
=
dict
(
level
=
level
))
def
wake_up
(
self
):
def
wake_up
(
self
):
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment