Unverified commit af32579e, authored by Qi Wang, committed by GitHub
Browse files

fix(vllm): cap gpu_memory_utilization for encoder-only vision model load (#8466)


Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent 2cd4288a
...@@ -173,6 +173,9 @@ def load_vision_model(model_id: str, enforce_eager: bool = False) -> torch.nn.Mo ...@@ -173,6 +173,9 @@ def load_vision_model(model_id: str, enforce_eager: bool = False) -> torch.nn.Mo
vllm_model = LLM( vllm_model = LLM(
model=model_id, model=model_id,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
# vLLM's free-memory precheck runs before kv_cache_memory_bytes applies;
# default 0.9 fails on <=24 GiB GPUs when another worker shares the device.
gpu_memory_utilization=0.2,
kv_cache_memory_bytes=1024 kv_cache_memory_bytes=1024
* 1024 * 1024
* 64, # 64MB KV cache for vLLM to complete the init lifecycle, encoder-only doesn't require KV cache. * 64, # 64MB KV cache for vLLM to complete the init lifecycle, encoder-only doesn't require KV cache.
......
...@@ -29,24 +29,12 @@ VLLM_MULTIMODAL_PROFILES: list[MultimodalModelProfile] = [ ...@@ -29,24 +29,12 @@ VLLM_MULTIMODAL_PROFILES: list[MultimodalModelProfile] = [
profiled_vram_gib=9.6, profiled_vram_gib=9.6,
), ),
"e_pd": TopologyConfig( "e_pd": TopologyConfig(
marks=[ marks=[pytest.mark.pre_merge],
pytest.mark.skip(
reason="vLLM engine core init fails on disagg e_pd. "
"https://linear.app/nvidia/issue/OPS-4445"
),
pytest.mark.pre_merge,
],
timeout_s=340, timeout_s=340,
single_gpu=True, single_gpu=True,
), ),
"epd": TopologyConfig( "epd": TopologyConfig(
marks=[ marks=[pytest.mark.pre_merge],
pytest.mark.skip(
reason="vLLM engine core init fails on disagg epd. "
"https://linear.app/nvidia/issue/OPS-4445"
),
pytest.mark.pre_merge,
],
timeout_s=300, timeout_s=300,
single_gpu=True, single_gpu=True,
), ),
...@@ -68,13 +56,7 @@ VLLM_MULTIMODAL_PROFILES: list[MultimodalModelProfile] = [ ...@@ -68,13 +56,7 @@ VLLM_MULTIMODAL_PROFILES: list[MultimodalModelProfile] = [
delayed_start=60, delayed_start=60,
), ),
"epd": TopologyConfig( "epd": TopologyConfig(
marks=[ marks=[pytest.mark.pre_merge],
pytest.mark.skip(
reason="vLLM engine core init fails on disagg epd. "
"https://linear.app/nvidia/issue/OPS-4445"
),
pytest.mark.pre_merge,
],
timeout_s=600, timeout_s=600,
delayed_start=60, delayed_start=60,
single_gpu=True, single_gpu=True,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment