Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a79b1224
Unverified
Commit
a79b1224
authored
Nov 28, 2024
by
Woosuk Kwon
Committed by
GitHub
Nov 28, 2024
Browse files
[V1] Do not allocate beyond the max_model_len (#10730)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent
d9b4b3f0
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
41 additions
and
15 deletions
+41
-15
tests/v1/core/test_prefix_caching.py
tests/v1/core/test_prefix_caching.py
+16
-8
vllm/v1/core/kv_cache_manager.py
vllm/v1/core/kv_cache_manager.py
+17
-0
vllm/v1/core/scheduler.py
vllm/v1/core/scheduler.py
+8
-7
No files found.
tests/v1/core/test_prefix_caching.py
View file @
a79b1224
...
...
@@ -23,7 +23,8 @@ def test_prefill():
manager
=
KVCacheManager
(
block_size
=
16
,
num_gpu_blocks
=
10
,
sliding_window
=
False
,
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
16
,
)
...
...
@@ -121,7 +122,8 @@ def test_decode():
manager
=
KVCacheManager
(
block_size
=
16
,
num_gpu_blocks
=
10
,
sliding_window
=
False
,
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
16
,
)
...
...
@@ -172,7 +174,8 @@ def test_evict():
manager
=
KVCacheManager
(
block_size
=
16
,
num_gpu_blocks
=
10
,
sliding_window
=
False
,
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
16
,
)
...
...
@@ -220,7 +223,8 @@ def test_hash_block_correct_reuse():
manager
=
KVCacheManager
(
block_size
=
block_size
,
num_gpu_blocks
=
1
,
sliding_window
=
False
,
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
0
,
)
...
...
@@ -256,7 +260,8 @@ def test_computed_blocks_not_evicted():
manager
=
KVCacheManager
(
block_size
=
block_size
,
num_gpu_blocks
=
2
,
sliding_window
=
False
,
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
0
,
)
...
...
@@ -303,7 +308,8 @@ def test_basic_prefix_caching_disabled():
manager
=
KVCacheManager
(
block_size
=
block_size
,
num_gpu_blocks
=
4
,
sliding_window
=
False
,
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
False
,
num_preallocate_tokens
=
0
,
)
...
...
@@ -342,7 +348,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
manager
=
KVCacheManager
(
block_size
=
block_size
,
num_gpu_blocks
=
10
,
sliding_window
=
False
,
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
num_preallocate_tokens
,
)
...
...
@@ -370,7 +377,8 @@ def test_cache_blocks():
manager
=
KVCacheManager
(
block_size
=
block_size
,
num_gpu_blocks
=
5
,
sliding_window
=
False
,
max_model_len
=
8192
,
sliding_window
=
None
,
enable_caching
=
True
,
num_preallocate_tokens
=
0
,
)
...
...
vllm/v1/core/kv_cache_manager.py
View file @
a79b1224
...
...
@@ -17,12 +17,15 @@ class KVCacheManager:
self
,
block_size
:
int
,
num_gpu_blocks
:
int
,
max_model_len
:
int
,
sliding_window
:
Optional
[
int
]
=
None
,
enable_caching
:
bool
=
True
,
num_preallocate_tokens
:
int
=
64
,
)
->
None
:
self
.
block_size
=
block_size
self
.
num_gpu_blocks
=
num_gpu_blocks
self
.
max_model_len
=
max_model_len
self
.
max_num_blocks_per_req
=
cdiv
(
max_model_len
,
block_size
)
self
.
sliding_window
=
sliding_window
self
.
enable_caching
=
enable_caching
# NOTE(woosuk): To avoid frequent block allocation, we preallocate some
...
...
@@ -132,7 +135,14 @@ class KVCacheManager:
num_new_blocks
=
min
(
num_new_blocks
+
self
.
num_preallocate_blocks
,
self
.
free_block_queue
.
num_free_blocks
,
# Should not exceed the maximum number of blocks per request.
# This is especially because the block table has the shape
# [..., max_num_blocks_per_req].
# TODO(woosuk): Check and reject requests if
# num_prompt_tokens + max_tokens > max_model_len.
self
.
max_num_blocks_per_req
-
len
(
req_blocks
),
)
assert
num_new_blocks
>
0
new_blocks
=
self
.
_get_new_blocks
(
num_new_blocks
)
req_blocks
.
extend
(
new_blocks
)
...
...
@@ -212,7 +222,14 @@ class KVCacheManager:
num_required_blocks
+
self
.
num_preallocate_blocks
,
self
.
free_block_queue
.
num_free_blocks
-
num_evictable_computed_blocks
,
# Should not exceed the maximum number of blocks per request.
# This is especially because the block table has the shape
# [..., max_num_blocks_per_req].
# TODO(woosuk): Check and reject requests if
# num_prompt_tokens + max_tokens > max_model_len.
self
.
max_num_blocks_per_req
-
len
(
computed_blocks
),
)
assert
num_new_blocks
>
0
# Concatenate the computed block IDs and the new block IDs.
new_blocks
=
self
.
_get_new_blocks
(
num_new_blocks
)
...
...
vllm/v1/core/scheduler.py
View file @
a79b1224
...
...
@@ -33,22 +33,23 @@ class Scheduler:
# TODO: Support LoRA.
assert
lora_config
is
None
,
"V1 does not support LoRA yet."
# Scheduling constraints.
self
.
max_num_running_reqs
=
self
.
scheduler_config
.
max_num_seqs
self
.
max_num_scheduled_tokens
=
\
self
.
scheduler_config
.
max_num_batched_tokens
self
.
max_model_len
=
self
.
scheduler_config
.
max_model_len
num_gpu_blocks
=
cache_config
.
num_gpu_blocks
assert
isinstance
(
num_gpu_blocks
,
int
)
and
num_gpu_blocks
>
0
# Create the block space manager.
# Create the KV cache manager.
self
.
kv_cache_manager
=
KVCacheManager
(
block_size
=
self
.
cache_config
.
block_size
,
num_gpu_blocks
=
num_gpu_blocks
,
max_model_len
=
self
.
max_model_len
,
sliding_window
=
self
.
cache_config
.
sliding_window
,
enable_caching
=
self
.
cache_config
.
enable_prefix_caching
)
self
.
block_size
=
self
.
cache_config
.
block_size
# Scheduling constraints.
self
.
max_num_running_reqs
=
self
.
scheduler_config
.
max_num_seqs
self
.
max_num_scheduled_tokens
=
\
self
.
scheduler_config
.
max_num_batched_tokens
self
.
max_model_len
=
self
.
scheduler_config
.
max_model_len
# req_id -> Request
self
.
requests
:
Dict
[
str
,
Request
]
=
{}
# Priority queues for requests.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment