fix(vllm): cap gpu_memory_utilization for encoder-only vision model load (#8466)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

fix(vllm): cap gpu_memory_utilization for encoder-only vision model load (#8466)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
af32579e · Qi Wang · GitHub · 2cd4288a · af32579e · af32579e
Unverified commit af32579e, authored Apr 22, 2026 by Qi Wang; committed by GitHub on Apr 22, 2026.
Showing 2 changed files with 6 additions and 21 deletions

components/src/dynamo/vllm/multimodal_utils/model.py +3 -0

tests/serve/multimodal_profiles/vllm.py +3 -21

No files found.
--- a/components/src/dynamo/vllm/multimodal_utils/model.py
+++ b/components/src/dynamo/vllm/multimodal_utils/model.py
@@ -173,6 +173,9 @@ def load_vision_model(model_id: str, enforce_eager: bool = False) -> torch.nn.Mo
        vllm_model = LLM(
            model=model_id,
            enforce_eager=enforce_eager,
+            # vLLM's free-memory precheck runs before kv_cache_memory_bytes applies;
+            # default 0.9 fails on <=24 GiB GPUs when another worker shares the device.
+            gpu_memory_utilization=0.2,
            kv_cache_memory_bytes=1024
            * 1024
            * 64,  # 64MB KV cache for vLLM to complete the init lifecycle, encoder-only doesn't require KV cache.

--- a/tests/serve/multimodal_profiles/vllm.py
+++ b/tests/serve/multimodal_profiles/vllm.py
@@ -29,24 +29,12 @@ VLLM_MULTIMODAL_PROFILES: list[MultimodalModelProfile] = [
                profiled_vram_gib=9.6,
            ),
            "e_pd": TopologyConfig(
-                marks=[
+                marks=[pytest.mark.pre_merge],
-                    pytest.mark.skip(
-                        reason="vLLM engine core init fails on disagg e_pd. "
-                        "https://linear.app/nvidia/issue/OPS-4445"
-                    ),
-                    pytest.mark.pre_merge,
-                ],
                timeout_s=340,
                single_gpu=True,
            ),
            "epd": TopologyConfig(
-                marks=[
+                marks=[pytest.mark.pre_merge],
-                    pytest.mark.skip(
-                        reason="vLLM engine core init fails on disagg epd. "
-                        "https://linear.app/nvidia/issue/OPS-4445"
-                    ),
-                    pytest.mark.pre_merge,
-                ],
                timeout_s=300,
                single_gpu=True,
            ),
@@ -68,13 +56,7 @@ VLLM_MULTIMODAL_PROFILES: list[MultimodalModelProfile] = [
                delayed_start=60,
            ),
            "epd": TopologyConfig(
-                marks=[
+                marks=[pytest.mark.pre_merge],
-                    pytest.mark.skip(
-                        reason="vLLM engine core init fails on disagg epd. "
-                        "https://linear.app/nvidia/issue/OPS-4445"
-                    ),
-                    pytest.mark.pre_merge,
-                ],
                timeout_s=600,
                delayed_start=60,
                single_gpu=True,