disagg_multimodal_e_pd.sh 4.24 KB
Newer Older
1
2
3
4
5
6
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e
trap 'echo Cleaning up...; kill 0' EXIT

7
8
9
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"

10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# Default values
MODEL_NAME="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
SINGLE_GPU=false

# Parse command line arguments
# All extra arguments are passed through to the PD worker's dynamo.vllm
# (which routes them to Dynamo or vLLM as appropriate).
EXTRA_PD_ARGS=()
while [[ $# -gt 0 ]]; do
    case $1 in
        --model)
            MODEL_NAME=$2
            shift 2
            ;;
        --single-gpu)
            SINGLE_GPU=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS] [EXTRA_ARGS...]"
            echo ""
            echo "Disaggregated multimodal serving with separate Encode and aggregated PD worker"
            echo ""
            echo "Options:"
            echo "  --model <model_name>          Specify the VLM model to use (default: $MODEL_NAME)"
            echo "                                LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates"
            echo "  --single-gpu                  Run encode and PD workers on the same GPU (for small models, e.g. 2B)"
            echo "  -h, --help                    Show this help message"
            echo ""
            echo "All additional arguments are passed through to the PD worker's dynamo.vllm."
            echo "Dynamo args (e.g. --multimodal-embedding-cache-capacity-gb) and"
            echo "vLLM engine args (e.g. --no-enable-prefix-caching) are automatically routed."
            echo ""
            echo "Examples:"
            echo "  $0 --model llava-hf/llava-1.5-7b-hf"
            echo "  $0 --model microsoft/Phi-3.5-vision-instruct"
            echo "  $0 --model Qwen/Qwen2.5-VL-7B-Instruct"
            echo "  $0 --no-enable-prefix-caching --multimodal-embedding-cache-capacity-gb 2"
            echo "  $0 --model Qwen/Qwen2-VL-2B-Instruct --single-gpu"
            echo ""
            exit 0
            ;;
        *)
            EXTRA_PD_ARGS+=("$1")
            shift
            ;;
    esac
done


PD_MAX_MODEL_LEN="16384"


63
64
65
66
67
68
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
if [[ "$SINGLE_GPU" == "true" ]]; then
    GPU_LABEL="1 GPU"
else
    GPU_LABEL="2 GPUs"
fi
69
print_launch_banner --multimodal "Launching Disaggregated Multimodal E+PD ($GPU_LABEL)" "$MODEL_NAME" "$HTTP_PORT"
70
71
72
73
74
75
76
77
78


# Start frontend (no router mode)
echo "Starting frontend..."
python -m dynamo.frontend &

EXTRA_ARGS=""

# GPU assignments (override via environment variables)
79
# TODO: use build_gpu_mem_args to measure VRAM instead of hardcoded fractions
80
# In single-GPU mode both workers share the same GPU.
81
82
83
if [[ "$SINGLE_GPU" == "true" ]]; then
    DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0}
    DYN_PD_WORKER_GPU=${DYN_PD_WORKER_GPU:-0}
84
85
    DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.1}
    DYN_PD_GPU_MEM=${DYN_PD_GPU_MEM:-0.7}
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
    EXTRA_ARGS="--enforce-eager"
else
    DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-1}
    DYN_PD_WORKER_GPU=${DYN_PD_WORKER_GPU:-2}
    DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
    DYN_PD_GPU_MEM=${DYN_PD_GPU_MEM:-0.9}
fi

# Start encode worker
echo "Starting encode worker on GPU $DYN_ENCODE_WORKER_GPU (GPU mem: $DYN_ENCODE_GPU_MEM)..."
CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU \
python -m dynamo.vllm \
  --multimodal-encode-worker \
  --enable-multimodal \
  --model "$MODEL_NAME" \
  --gpu-memory-utilization "$DYN_ENCODE_GPU_MEM" \
  $EXTRA_ARGS &

# Start PD worker (aggregated prefill+decode, routes to encoder for embeddings)
echo "Starting PD worker on GPU $DYN_PD_WORKER_GPU (GPU mem: $DYN_PD_GPU_MEM)..."
CUDA_VISIBLE_DEVICES=$DYN_PD_WORKER_GPU \
python -m dynamo.vllm \
  --route-to-encoder \
  --enable-multimodal \
  --enable-mm-embeds \
  --model "$MODEL_NAME" \
  --max-model-len "$PD_MAX_MODEL_LEN" \
  --gpu-memory-utilization "$DYN_PD_GPU_MEM" \
  $EXTRA_ARGS \
  "${EXTRA_PD_ARGS[@]}" &

echo "=================================================="
echo "All components started. Waiting for initialization..."
echo "=================================================="

121
122
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit