Unverified commit 96a85c57, authored by Matthew Bonanni, committed by GitHub
Browse files

[Startup][UX] Enable CUDAGraph memory profiling by default (#38284)


Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
parent 9db4650e
......@@ -29,7 +29,7 @@ llm = LLM(
tensor_parallel_size=2,
pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
gpu_memory_utilization=random.uniform(0.8, 0.92),
seed=0,
)
......
......@@ -36,7 +36,7 @@ llm = LLM(
pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")),
enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
distributed_executor_backend="external_launcher",
gpu_memory_utilization=random.uniform(0.7, 0.9),
gpu_memory_utilization=random.uniform(0.8, 0.92),
seed=0,
max_model_len=1024,
max_num_seqs=16,
......
......@@ -65,7 +65,7 @@ def test_v1_generation_is_deterministic_across_batch_sizes_with_needle(
assert max_batch_size >= 2, "Batch size should be >= 2 to mix needle."
# Keep GPU memory usage low to avoid startup allocation failures.
gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.4"))
gpu_mem_util = float(os.getenv("VLLM_GPU_MEMORY_UTILIZATION", "0.5"))
max_model_len = int(os.getenv("VLLM_MAX_MODEL_LEN", "5120"))
# Sampling parameters: longer outputs with a more random-sounding
......
......@@ -321,7 +321,7 @@ def test_speculators_model_integration(
test_prompts = get_test_prompts(mm_enabled=False)
# First run: Direct speculator model (simplified integration)
spec_llm = LLM(model=model_path, max_model_len=4096)
spec_llm = LLM(model=model_path, max_model_len=4096, gpu_memory_utilization=0.92)
evaluate_llm_for_gsm8k(
spec_llm, expected_accuracy_threshold=expected_accuracy_threshold
)
......@@ -351,7 +351,7 @@ def test_speculators_model_integration(
cleanup_dist_env_and_memory()
# Second run: Reference without speculative decoding
ref_llm = LLM(model=verifier_model, max_model_len=4096)
ref_llm = LLM(model=verifier_model, max_model_len=4096, gpu_memory_utilization=0.92)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
del ref_llm
torch.accelerator.empty_cache()
......
......@@ -51,10 +51,10 @@ class CacheConfig:
"""Whether block_size was explicitly provided. Derived automatically."""
user_specified_mamba_block_size: bool = field(default=False, init=False)
"""Whether mamba_block_size was explicitly provided. Derived automatically."""
gpu_memory_utilization: float = Field(default=0.9, gt=0, le=1)
gpu_memory_utilization: float = Field(default=0.92, gt=0, le=1)
"""The fraction of GPU memory to be used for the model executor, which can
range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
utilization. If unspecified, will use the default value of 0.9. This is a
utilization. If unspecified, will use the default value of 0.92. This is a
per-instance limit, and only applies to the current vLLM instance. It does
not matter if you have another vLLM instance running on the same GPU. For
example, if you have two vLLM instances running on the same GPU, you can
......
......@@ -228,7 +228,7 @@ class LLM:
tokenizer_revision: str | None = None,
chat_template: Path | str | None = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
gpu_memory_utilization: float = 0.92,
cpu_offload_gb: float = 0,
offload_group_size: int = 0,
offload_num_in_group: int = 1,
......
......@@ -253,7 +253,7 @@ if TYPE_CHECKING:
VLLM_CUDA_COMPATIBILITY_PATH: str | None = None
VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: bool = False
VLLM_ELASTIC_EP_DRAIN_REQUESTS: bool = False
VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: bool = False
VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: bool = True
VLLM_NIXL_EP_MAX_NUM_RANKS: int = 32
VLLM_XPU_ENABLE_XPU_GRAPH: bool = False
VLLM_LORA_ENABLE_DUAL_STREAM: bool = False
......@@ -1687,9 +1687,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
),
# If set to 1, enable CUDA graph memory estimation during memory profiling.
# This profiles CUDA graph memory usage to provide more accurate KV cache
# memory allocation. Disabled by default to preserve existing behavior.
# memory allocation. Enabled by default as of v0.21.0.
"VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS": lambda: bool(
int(os.getenv("VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS", "0"))
int(os.getenv("VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS", "1"))
),
# NIXL EP environment variables
"VLLM_NIXL_EP_MAX_NUM_RANKS": lambda: int(
......
......@@ -454,14 +454,13 @@ class Worker(WorkerBase):
1.0,
)
logger.info(
"CUDA graph memory profiling is enabled "
"(VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1). "
"This will become the default in v0.21. "
"The current --gpu-memory-utilization=%.4f is equivalent "
"to --gpu-memory-utilization=%.4f without CUDA graph "
"memory profiling. To maintain the same effective KV "
"cache size as before, increase "
"--gpu-memory-utilization to %.4f.",
"CUDA graph memory profiling is enabled (default since "
"v0.21.0). The current --gpu-memory-utilization=%.4f is "
"equivalent to --gpu-memory-utilization=%.4f without "
"CUDA graph memory profiling. To maintain the same "
"effective KV cache size as before, increase "
"--gpu-memory-utilization to %.4f. To disable, set "
"VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0.",
current_util,
equiv_util,
suggested_util,
......@@ -471,14 +470,14 @@ class Worker(WorkerBase):
round(current_util + cg_util_delta, 4),
1.0,
)
logger.info(
"In v0.21, CUDA graph memory profiling will be enabled "
"by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), "
"which more accurately accounts for CUDA graph memory "
"during KV cache allocation. To try it now, set "
"VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase "
"--gpu-memory-utilization from %.4f to %.4f to maintain "
"the same effective KV cache size.",
logger.warning(
"CUDA graph memory profiling is disabled "
"(VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=0). "
"Without it, CUDA graph memory is not accounted for "
"during KV cache allocation, which may require lowering "
"--gpu-memory-utilization to avoid OOM. Consider "
"re-enabling it (the default as of v0.21.0) and increasing "
"--gpu-memory-utilization from %.4f to %.4f.",
current_util,
suggested_util,
)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment