Commit 0b7db411 (unverified)
[Bug] Fix the OOM condition for CPU cache (#260)

Authored Jun 26, 2023 by Zhuohan Li; committed by GitHub on Jun 26, 2023.
Parent: 471a7a45
Showing 2 changed files with 3 additions and 1 deletion:

vllm/engine/llm_engine.py  +1 -1
vllm/worker/worker.py      +2 -0
vllm/engine/llm_engine.py

@@ -128,7 +128,7 @@ class LLMEngine:
         logger.info(f'# GPU blocks: {num_gpu_blocks}, '
                     f'# CPU blocks: {num_cpu_blocks}')
-        if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
+        if num_gpu_blocks <= 0:
             raise ValueError("No available memory for the cache blocks. "
                              "Try increasing `gpu_memory_utilization` when "
                              "initializing the engine.")
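This hunk relaxes the post-profiling check: the "No available memory" error is now raised only when the GPU block count is non-positive, so a CPU swap space of zero (and hence zero CPU cache blocks) no longer aborts engine initialization. A minimal sketch of the resulting behavior, using a hypothetical validate_cache_blocks helper rather than the engine's actual method:

# Minimal sketch (assumed helper, not the engine's actual code path) of the
# check after this commit: only a non-positive GPU block count is fatal.
def validate_cache_blocks(num_gpu_blocks: int, num_cpu_blocks: int) -> None:
    print(f"# GPU blocks: {num_gpu_blocks}, # CPU blocks: {num_cpu_blocks}")
    if num_gpu_blocks <= 0:
        raise ValueError(
            "No available memory for the cache blocks. "
            "Try increasing `gpu_memory_utilization` when "
            "initializing the engine.")

validate_cache_blocks(num_gpu_blocks=1843, num_cpu_blocks=0)   # passes after this change
# validate_cache_blocks(num_gpu_blocks=0, num_cpu_blocks=256)  # still raises ValueError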
vllm/worker/worker.py

@@ -113,6 +113,8 @@ class Worker:
         num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization -
                               peak_memory) // cache_block_size)
         num_cpu_blocks = int(cpu_swap_space // cache_block_size)
+        num_gpu_blocks = max(num_gpu_blocks, 0)
+        num_cpu_blocks = max(num_cpu_blocks, 0)
         torch.cuda.empty_cache()

         # Reset the seed to ensure that the random state is not affected by
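The worker-side change clamps both block counts at zero. A minimal sketch of that arithmetic with made-up sizes (the byte figures below are illustrative assumptions, not values from the commit), showing how a disabled swap space now yields 0 CPU blocks instead of a negative count:

# Illustrative numbers only; the formula mirrors the diff above.
total_gpu_memory = 24 * 1024**3        # assume a 24 GiB device
gpu_memory_utilization = 0.90
peak_memory = 18 * 1024**3             # assumed peak usage measured during profiling
cache_block_size = 2 * 1024**2         # assumed bytes per KV-cache block
cpu_swap_space = 0                     # swap disabled

num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization -
                      peak_memory) // cache_block_size)
num_cpu_blocks = int(cpu_swap_space // cache_block_size)

# The two added lines: a tight memory budget or zero swap space now yields
# 0 blocks rather than a negative count.
num_gpu_blocks = max(num_gpu_blocks, 0)
num_cpu_blocks = max(num_cpu_blocks, 0)

print(num_gpu_blocks, num_cpu_blocks)  # -> 1843 0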