fix: Fix vllm multimodal qwen cuda oom issue (#3598)

9b0948c6 · Kris Hung · GitHub · cd2389ba · 9b0948c6
Unverified Commit 9b0948c6 authored Oct 13, 2025 by Kris Hung Committed by GitHub Oct 13, 2025
Show whitespace changes
Inline Side-by-side

Showing with 7 additions and 1 deletion

examples/multimodal/launch/agg.sh +7 -1

No files found.
--- a/examples/multimodal/launch/agg.sh
+++ b/examples/multimodal/launch/agg.sh
@@ -58,9 +58,15 @@ python -m dynamo.frontend --http-port=8000 &
 # run processor
 python3 components/processor.py --model $MODEL_NAME --prompt-template "$PROMPT_TEMPLATE" &

+# To make Qwen2.5-VL-7B-Instruct fit on an A100 40GB GPU, pass the following extra arguments to the worker
+EXTRA_ARGS=""
+if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
+    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
+fi
+
 # run E/P/D workers
 CUDA_VISIBLE_DEVICES=0 python3 components/encode_worker.py --model $MODEL_NAME &
-CUDA_VISIBLE_DEVICES=1 python3 components/worker.py --model $MODEL_NAME --worker-type prefill &
+CUDA_VISIBLE_DEVICES=1 python3 components/worker.py --model $MODEL_NAME --worker-type prefill $EXTRA_ARGS &

 # Wait for all background processes to complete
 wait