[Startup][UX] Enable CUDAGraph memory profiling by default (#38284)

Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>

[Startup][UX] Enable CUDAGraph memory profiling by default (#38284)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
96a85c57 · Matthew Bonanni · GitHub · 9db4650e · 96a85c57 · 96a85c57
Unverified Commit 96a85c57 authored Apr 21, 2026 by Matthew Bonanni Committed by GitHub Apr 21, 2026
8 changed files
--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
@@ -29,7 +29,7 @@ llm = LLM(
    tensor_parallel_size=2,
    pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
    distributed_executor_backend="external_launcher",
-    gpu_memory_utilization=random.uniform(0.7, 0.9),
+    gpu_memory_utilization=random.uniform(0.8, 0.92),
    seed=0,
 )


--- a/tests/distributed/test_torchrun_example_moe.py
+++ b/tests/distributed/test_torchrun_example_moe.py
@@ -36,7 +36,7 @@ llm = LLM(
    pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")),
    enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
    distributed_executor_backend="external_launcher",
-    gpu_memory_utilization=random.uniform(0.7, 0.9),
+    gpu_memory_utilization=random.uniform(0.8, 0.92),
    seed=0,
    max_model_len=1024,
    max_num_seqs=16,

--- a/tests/v1/determinism/test_batch_invariance.py
+++ b/tests/v1/determinism/test_batch_invariance.py
@@ -65,7 +65,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
    assert max_batch_size >= 2, "Batch size should be >= 2 to mix needle."

    # Keep GPU memory usage low to avoid startup allocation failures.
-    gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.4"))
+    gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.5"))
    max_model_len = int(os.getenv("VLLM_MAX_MODEL_LEN", "5120"))

    # Sampling parameters: longer outputs with a more random-sounding

--- a/tests/v1/e2e/spec_decode/test_spec_decode.py
+++ b/tests/v1/e2e/spec_decode/test_spec_decode.py
@@ -321,7 +321,7 @@ def test_speculators_model_integration(
    test_prompts = get_test_prompts(mm_enabled=False)

    # First run: Direct speculator model (simplified integration)
-    spec_llm = LLM(model=model_path, max_model_len=4096)
+    spec_llm = LLM(model=model_path, max_model_len=4096, gpu_memory_utilization=0.92)
    evaluate_llm_for_gsm8k(
        spec_llm, expected_accuracy_threshold=expected_accuracy_threshold
    )
@@ -351,7 +351,7 @@ def test_speculators_model_integration(
    cleanup_dist_env_and_memory()

    # Second run: Reference without speculative decoding
-    ref_llm = LLM(model=verifier_model, max_model_len=4096)
+    ref_llm = LLM(model=verifier_model, max_model_len=4096, gpu_memory_utilization=0.92)
    ref_outputs = ref_llm.chat(test_prompts, sampling_config)
    del ref_llm
    torch.accelerator.empty_cache()

--- a/vllm/config/cache.py
+++ b/vllm/config/cache.py
@@ -51,10 +51,10 @@ class CacheConfig:
    """Whether block_size was explicitly provided. Derived automatically."""
    user_specified_mamba_block_size: bool = field(default=False, init=False)
    """Whether mamba_block_size was explicitly provided. Derived automatically."""
-    gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
+    gpu_memory_utilization: float = Field(default=0.92, gt=0, le=1)
    """The fraction of GPU memory to be used for the model executor, which can
    range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
-    utilization. If unspecified, will use the default value of 0.9. This is a
+    utilization. If unspecified, will use the default value of 0.92. This is a
    per-instance limit, and only applies to the current vLLM instance. It does
    not matter if you have another vLLM instance running on the same GPU. For
    example, if you have two vLLM instances running on the same GPU, you can

--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -228,7 +228,7 @@ class LLM:
        tokenizer_revision: str | None = None,
        chat_template: Path | str | None = None,
        seed: int = 0,
-        gpu_memory_utilization: float = 0.9,
+        gpu_memory_utilization: float = 0.92,
        cpu_offload_gb: float = 0,
        offload_group_size: int = 0,
        offload_num_in_group: int = 1,

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -253,7 +253,7 @@ if TYPE_CHECKING:
    VLLM_CUDA_COMPATIBILITY_PATH: str | None = None
    VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: bool = False
    VLLM_ELASTIC_EP_DRAIN_REQUESTS: bool = False
-    VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: bool = False
+    VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: bool = True
    VLLM_NIXL_EP_MAX_NUM_RANKS: int = 32
    VLLM_XPU_ENABLE_XPU_GRAPH: bool = False
    VLLM_LORA_ENABLE_DUAL_STREAM: bool = False
@@ -1687,9 +1687,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
    ),
    # If set to 1, enable CUDA graph memory estimation during memory profiling.
    # This profiles CUDA graph memory usage to provide more accurate KV cache
-    # memory allocation. Disabled by default to preserve existing behavior.
+    # memory allocation. Enabled by default as of v0.21.0.
    "VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS": lambda: bool(
-        int(os.getenv("VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS", "0"))
+        int(os.getenv("VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS", "1"))
    ),
    # NIXL EP environment variables
    "VLLM_NIXL_EP_MAX_NUM_RANKS": lambda: int(

--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -454,14 +454,13 @@ class Worker(WorkerBase):
                    1.0,
                )
                logger.info(
-                    "CUDA graph memory profiling is enabled "
-                    "(VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1). "
-                    "This will become the default in v0.21. "
-                    "The current --gpu-memory-utilization=%.4f is equivalent "
-                    "to --gpu-memory-utilization=%.4f without CUDA graph "
-                    "memory profiling. To maintain the same effective KV "
-                    "cache size as before, increase "
-                    "--gpu-memory-utilization to %.4f.",
+                    "CUDA graph memory profiling is enabled (default since "
+                    "v0.21.0). The current --gpu-memory-utilization=%.4f is "
+                    "equivalent to --gpu-memory-utilization=%.4f without "
+                    "CUDA graph memory profiling. To maintain the same "
+                    "effective KV cache size as before, increase "
+                    "--gpu-memory-utilization to %.4f. To disable, set "
+                    "VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0.",
                    current_util,
                    equiv_util,
                    suggested_util,
@@ -471,14 +470,14 @@ class Worker(WorkerBase):
                    round(current_util + cg_util_delta, 4),
                    1.0,
                )
-                logger.info(
-                    "In v0.21, CUDA graph memory profiling will be enabled "
-                    "by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), "
-                    "which more accurately accounts for CUDA graph memory "
-                    "during KV cache allocation. To try it now, set "
-                    "VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase "
-                    "--gpu-memory-utilization from %.4f to %.4f to maintain "
-                    "the same effective KV cache size.",
+                logger.warning(
+                    "CUDA graph memory profiling is disabled "
+                    "(VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0). "
+                    "Without it, CUDA graph memory is not accounted for "
+                    "during KV cache allocation, which may require lowering "
+                    "--gpu-memory-utilization to avoid OOM. Consider "
+                    "re-enabling it (the default as of v0.21.0) and increasing "
+                    "--gpu-memory-utilization from %.4f to %.4f.",
                    current_util,
                    suggested_util,
                )