Add warning on CUDA graph memory usage (#2182)

21d5daa4 · Woosuk Kwon · GitHub · 290e015c · 21d5daa4
Unverified commit 21d5daa4, authored Dec 18, 2023 by Woosuk Kwon; committed by GitHub on Dec 18, 2023.
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 0 deletions

vllm/worker/model_runner.py (+3 -0)

No files found.
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -395,6 +395,9 @@ class ModelRunner:
                    "unexpected consequences if the model is not static. To "
                    "run the model in eager mode, set 'enforce_eager=True' or "
                    "use '--enforce-eager' in the CLI.")
+        logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. "
+                    "If you are running out of memory, consider decreasing "
+                    "`gpu_memory_utilization` or enforcing eager mode.")
        start_time = time.perf_counter()
        # Prepare dummy inputs. These will be reused for all batch sizes.