#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# EPD (Encode-Prefill-Decode) multimodal deployment
#
# Architecture: 3-component disaggregation
# - Processor: Python-based preprocessor (bypasses Rust OpenAIPreprocessor)
# - Encode Worker: Dedicated vision encoder that extracts image embeddings
# - PD Worker: Standard prefill/decode worker that receives embeddings via NIXL
#
# Benefits: Decouples encoding from inference, enables independent scaling
# For standard single-worker deployment, see agg_multimodal.sh

# Abort on the first failing foreground command.
set -e
# On any exit, signal every process in this script's process group so the
# background services started below (and their children) are torn down.
# `kill 0` also signals this shell, so SIGTERM is ignored here first; otherwise
# the shell would be killed inside its own trap and the script's real exit
# status (e.g. 0 for --help, 1 for a bad option) would be replaced by 143.
trap 'echo Cleaning up...; trap "" TERM; kill 0' EXIT

# Default values
MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
SINGLE_GPU=false

# print_usage: write the option summary to stdout.
# The default shown for --model is the current value of MODEL_NAME.
print_usage() {
    echo "Usage: $0 [OPTIONS]"
    echo "Options:"
    echo "  --model <model_name> Specify the model to use (default: $MODEL_NAME)"
    echo "  --single-gpu         Run both encode and PD workers on GPU 0 (for pre-merge CI)"
    echo "  -h, --help           Show this help message"
}

# parse_args: parse command line arguments.
# Arguments: the script's positional parameters ("$@").
# Globals:   MODEL_NAME (written by --model), SINGLE_GPU (written by --single-gpu).
# Exits:     0 after printing help; 1 on an unknown option or when --model
#            is given without a value.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --model)
                # Without this check a bare `--model` makes `shift 2` fail,
                # which aborts the script under `set -e` with no explanation.
                if [[ $# -lt 2 ]]; then
                    echo "Error: --model requires a model name" >&2
                    echo "Use --help for usage information" >&2
                    exit 1
                fi
                MODEL_NAME=$2
                shift 2
                ;;
            --single-gpu)
                SINGLE_GPU=true
                shift
                ;;
            -h|--help)
                print_usage
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                echo "Use --help for usage information" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Start frontend (HTTP endpoint)
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
# No --http-port flag is passed here, so the port comes from DYN_HTTP_PORT or
# that default. The trailing `&` runs the frontend in the background so the
# script can go on to launch the processor and workers.
python -m dynamo.frontend &

# Set max model length based on model name
# Choose the context length passed to vLLM via --max-model-len.
# The two known models get a fixed length; any other model name gets 30426.
case "$MODEL_NAME" in
    "Qwen/Qwen2.5-VL-7B-Instruct")
        MAX_MODEL_LEN="4096"
        ;;
    "llava-hf/llava-1.5-7b-hf")
        MAX_MODEL_LEN="2048"
        ;;
    *)
        MAX_MODEL_LEN="30426"
        ;;
esac

# Build the extra vLLM arguments (GPU memory utilization, max model length) for the deployment mode
# Single-GPU mode: Both workers share GPU 0, so use reduced memory settings
# Multi-GPU mode: Each worker gets its own GPU, so use higher memory settings
# EXTRA_ARGS is a space-separated flag string appended to the worker commands.
case "$SINGLE_GPU" in
    true)
        # Shared GPU: smaller memory fraction per worker, eager execution.
        EXTRA_ARGS="--gpu-memory-utilization 0.4 --enforce-eager --max-model-len $MAX_MODEL_LEN"
        ;;
    *)
        # Multi-GPU mode: standard memory settings
        EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len $MAX_MODEL_LEN"
        ;;
esac

# Start processor (Python-based preprocessing, handles prompt templating).
# MODEL_NAME is quoted so a model given as a local path containing spaces or
# glob characters reaches dynamo.vllm as a single argument. The trailing `&`
# runs the processor in the background.
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model "$MODEL_NAME" &

# run E/P/D workers

# launch_vllm_worker <gpu_index> <role flags...>
#   Start one dynamo.vllm worker in the background, pinned to the given GPU
#   through CUDA_VISIBLE_DEVICES.
# Globals: MODEL_NAME (read), EXTRA_ARGS (read).
launch_vllm_worker() {
    local gpu=$1
    shift
    # EXTRA_ARGS is a space-separated flag string; split it into words on
    # purpose, without the glob expansion an unquoted $EXTRA_ARGS would allow.
    local -a extra_args=()
    read -r -a extra_args <<< "${EXTRA_ARGS:-}"
    CUDA_VISIBLE_DEVICES="$gpu" python -m dynamo.vllm "$@" --model "$MODEL_NAME" "${extra_args[@]}" &
}

# launch_epd_workers
#   Use single GPU (GPU 0) for pre-merge CI, otherwise use GPU 0 for PD and GPU 1 for encode.
# Globals: SINGLE_GPU (read).
launch_epd_workers() {
    if [[ "$SINGLE_GPU" == "true" ]]; then
        # Single GPU mode: both workers share GPU 0 with reduced memory
        launch_vllm_worker 0 --multimodal-encode-worker --enable-multimodal
        # Both the encode worker and the PD worker are vLLM engines. Do not
        # initialize them concurrently on the same GPU, so that one worker's
        # startup checks and allocations do not disturb the other's.
        sleep 60
        launch_vllm_worker 0 --multimodal-worker --enable-multimodal --enable-mm-embeds
    else
        launch_vllm_worker 0 --multimodal-worker --enable-multimodal --enable-mm-embeds
        launch_vllm_worker 1 --multimodal-encode-worker --enable-multimodal
    fi
}

launch_epd_workers

# Wait for all background processes to complete.
# With no arguments, `wait` blocks until every background job started by this
# shell has exited, which keeps the script (and its EXIT trap) alive while the
# frontend, processor and workers are running.
wait