Reduce the default GPU memory utilization from 0.95 to 0.90 to make sure out-of-memory (OOM) errors don't happen (#153)

bf5f121c · Zhuohan Li · GitHub · bec7b2dc · bf5f121c
Unverified commit bf5f121c, authored Jun 18, 2023 by Zhuohan Li; committed by GitHub on Jun 18, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 1 addition and 1 deletion

vllm/engine/arg_utils.py vllm/engine/arg_utils.py +1 -1

No files found.
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -21,7 +21,7 @@ class EngineArgs:
    tensor_parallel_size: int = 1
    block_size: int = 16
    swap_space: int = 4  # GiB
-    gpu_memory_utilization: float = 0.95
+    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: int = 2560
    max_num_seqs: int = 256
    disable_log_stats: bool = False