[Fix] Better error message when there is OOM during cache initialization (#203)

1d24ccb9 · Zhuohan Li · GitHub · 14f0b39c · 1d24ccb9 · 1d24ccb9
Unverified commit 1d24ccb9, authored Jun 22, 2023 by Zhuohan Li; committed by GitHub on Jun 22, 2023.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 7 additions and 0 deletions

vllm/engine/llm_engine.py +6 -0

vllm/outputs.py +1 -0

No files found.
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -127,6 +127,12 @@ class LLMEngine:
        # FIXME(woosuk): Change to debug log.
        logger.info(f'# GPU blocks: {num_gpu_blocks}, '
                    f'# CPU blocks: {num_cpu_blocks}')
+
+        if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
+            raise ValueError("No available memory for the cache blocks. "
+                             "Try increasing `gpu_memory_utilization` when "
+                             "initializing the engine.")
+
        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks


--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -53,6 +53,7 @@ class RequestOutput:
        prompt: The prompt string of the request.
        prompt_token_ids: The token IDs of the prompt.
        outputs: The output sequences of the request.
+        finished: Whether the whole request is finished.
    """
    def __init__(
        self,