fix: VLLM Multimodal minor fixes (#5748)

Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>

fix: VLLM Multimodal minor fixes (#5748)
Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
842f0f15 · Indrajit Bhosale · GitHub · c5e30afb · 842f0f15 · 842f0f15
Unverified commit 842f0f15, authored Jan 28, 2026 by Indrajit Bhosale; committed by GitHub on Jan 29, 2026.
Showing with 17 additions and 4 deletions

examples/backends/vllm/launch/agg_multimodal_epd.sh +12 -2

examples/multimodal/utils/args.py +5 -2

No files found.
--- a/examples/backends/vllm/launch/agg_multimodal_epd.sh
+++ b/examples/backends/vllm/launch/agg_multimodal_epd.sh
@@ -50,15 +50,25 @@ done
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend &
+# Set max model length based on model name
+MAX_MODEL_LEN=""
+if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
+    MAX_MODEL_LEN="4096"
+elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
+    MAX_MODEL_LEN="2048"
+else
+    MAX_MODEL_LEN="30426"
+fi
 # Set GPU memory utilization and model length based on deployment mode
 # Single-GPU mode: Both workers share GPU 0, so use reduced memory settings
 # Multi-GPU mode: Each worker gets its own GPU, so use higher memory settings
 EXTRA_ARGS=""
 if [[ "$SINGLE_GPU" == "true" ]]; then
-    EXTRA_ARGS="--gpu-memory-utilization 0.4 --enforce-eager --max-model-len 30426"
+    EXTRA_ARGS="--gpu-memory-utilization 0.4 --enforce-eager --max-model-len $MAX_MODEL_LEN"
 else
    # Multi-GPU mode: standard memory settings
-    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 30426"
+    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len $MAX_MODEL_LEN"
 fi
 # Start processor (Python-based preprocessing, handles prompt templating)

--- a/examples/multimodal/utils/args.py
+++ b/examples/multimodal/utils/args.py
@@ -153,7 +153,8 @@ def overwrite_args(config):
    dp_rank = config.engine_args.data_parallel_rank or 0
    defaults = {
-        "task": "generate",
+        # vLLM 0.13+ renamed 'task' to 'runner'
+        "runner": "generate",
        "skip_tokenizer_init": False,
        "enable_log_requests": False,
        "enable_prefix_caching": True,
@@ -178,4 +179,6 @@ def overwrite_args(config):
            setattr(config.engine_args, key, value)
            logger.debug(f" engine_args.{key} = {value}")
        else:
-            raise ValueError(f"{key} not found in AsyncEngineArgs from vLLM.")
+            logger.debug(
+                f" Skipping engine_args.{key} (not available in this vLLM version)"
+            )