[V1] Do not allocate beyond the max_model_len (#10730)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

[V1] Do not allocate beyond the max_model_len (#10730)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
a79b1224 · Woosuk Kwon · GitHub · d9b4b3f0 · a79b1224 · a79b1224
Unverified Commit a79b1224 authored Nov 28, 2024 by Woosuk Kwon Committed by GitHub Nov 28, 2024
Showing with 41 additions and 15 deletions

tests/v1/core/test_prefix_caching.py tests/v1/core/test_prefix_caching.py +16 -8

vllm/v1/core/kv_cache_manager.py vllm/v1/core/kv_cache_manager.py +17 -0

vllm/v1/core/scheduler.py vllm/v1/core/scheduler.py +8 -7

No files found.
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -23,7 +23,8 @@ def test_prefill():
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=16,
    )
@@ -121,7 +122,8 @@ def test_decode():
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=16,
    )
@@ -172,7 +174,8 @@ def test_evict():
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=16,
    )
@@ -220,7 +223,8 @@ def test_hash_block_correct_reuse():
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=1,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )
@@ -256,7 +260,8 @@ def test_computed_blocks_not_evicted():
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=2,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )
@@ -303,7 +308,8 @@ def test_basic_prefix_caching_disabled():
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=4,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
        enable_caching=False,
        num_preallocate_tokens=0,
    )
@@ -342,7 +348,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=10,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=num_preallocate_tokens,
    )
@@ -370,7 +377,8 @@ def test_cache_blocks():
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=5,
-        sliding_window=False,
+        max_model_len=8192,
+        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )

--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -17,12 +17,15 @@ class KVCacheManager:
        self,
        block_size: int,
        num_gpu_blocks: int,
+        max_model_len: int,
        sliding_window: Optional[int] = None,
        enable_caching: bool = True,
        num_preallocate_tokens: int = 64,
    ) -> None:
        self.block_size = block_size
        self.num_gpu_blocks = num_gpu_blocks
+        self.max_model_len = max_model_len
+        self.max_num_blocks_per_req = cdiv(max_model_len, block_size)
        self.sliding_window = sliding_window
        self.enable_caching = enable_caching
        # NOTE(woosuk): To avoid frequent block allocation, we preallocate some
@@ -132,7 +135,14 @@ class KVCacheManager:
            num_new_blocks = min(
                num_new_blocks + self.num_preallocate_blocks,
                self.free_block_queue.num_free_blocks,
+                # Should not exceed the maximum number of blocks per request.
+                # This is especially because the block table has the shape
+                # [..., max_num_blocks_per_req].
+                # TODO(woosuk): Check and reject requests if
+                # num_prompt_tokens + max_tokens > max_model_len.
+                self.max_num_blocks_per_req - len(req_blocks),
            )
+            assert num_new_blocks > 0
            new_blocks = self._get_new_blocks(num_new_blocks)
            req_blocks.extend(new_blocks)
@@ -212,7 +222,14 @@ class KVCacheManager:
            num_required_blocks + self.num_preallocate_blocks,
            self.free_block_queue.num_free_blocks -
            num_evictable_computed_blocks,
+            # Should not exceed the maximum number of blocks per request.
+            # This is especially because the block table has the shape
+            # [..., max_num_blocks_per_req].
+            # TODO(woosuk): Check and reject requests if
+            # num_prompt_tokens + max_tokens > max_model_len.
+            self.max_num_blocks_per_req - len(computed_blocks),
        )
+        assert num_new_blocks > 0
        # Concatenate the computed block IDs and the new block IDs.
        new_blocks = self._get_new_blocks(num_new_blocks)

--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -33,22 +33,23 @@ class Scheduler:
        # TODO: Support LoRA.
        assert lora_config is None, "V1 does not support LoRA yet."
+        # Scheduling constraints.
+        self.max_num_running_reqs = self.scheduler_config.max_num_seqs
+        self.max_num_scheduled_tokens = \
+            self.scheduler_config.max_num_batched_tokens
+        self.max_model_len = self.scheduler_config.max_model_len
        num_gpu_blocks = cache_config.num_gpu_blocks
        assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0
-        # Create the block space manager.
+        # Create the KV cache manager.
        self.kv_cache_manager = KVCacheManager(
            block_size=self.cache_config.block_size,
            num_gpu_blocks=num_gpu_blocks,
+            max_model_len=self.max_model_len,
            sliding_window=self.cache_config.sliding_window,
            enable_caching=self.cache_config.enable_prefix_caching)
        self.block_size = self.cache_config.block_size
-        # Scheduling constraints.
-        self.max_num_running_reqs = self.scheduler_config.max_num_seqs
-        self.max_num_scheduled_tokens = \
-            self.scheduler_config.max_num_batched_tokens
-        self.max_model_len = self.scheduler_config.max_model_len
        # req_id -> Request
        self.requests: Dict[str, Request] = {}
        # Priority queues for requests.