#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated multimodal serving with standard Dynamo preprocessing
#
# Architecture: Single-worker PD (Prefill-Decode)
# - Frontend: Rust OpenAIPreprocessor handles image URLs (HTTP and data:// base64)
# - Worker: Standard vLLM worker with vision model support
#
# For EPD (Encode-Prefill-Decode) architecture with dedicated encoding worker,
# see agg_multimodal_epd.sh

# Abort the script as soon as any unguarded command fails.
set -o errexit
# On every exit path, announce the cleanup and signal the whole process group
# (kill 0) so background children started by this script are stopped as well.
trap 'echo "Cleaning up..."; kill 0' EXIT

# Resolve the directory containing this script so the shared helpers are found
# regardless of the caller's working directory. `--` guards against a path
# that begins with a dash.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# Load the shared helper libraries. Helpers called later in this script
# (build_gpu_mem_args, print_launch_banner, wait_any_exit) are not defined in
# this file and are expected to come from these two sources.
source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"

# Default values
MODEL_NAME="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"

# Extra arguments are passed through to the vLLM worker
EXTRA_ARGS=()

#######################################
# Print the command-line help text.
# Globals:   MODEL_NAME (read; shown as the current default)
# Outputs:   usage text on stdout
#######################################
_agg_mm_usage() {
    printf '%s\n' \
        "Usage: $0 [OPTIONS] [-- EXTRA_VLLM_ARGS]" \
        "Options:" \
        "  --model <model_name>   Specify the VLM model to use (default: $MODEL_NAME)" \
        "  -h, --help             Show this help message" \
        "" \
        "Any additional arguments are passed through to the vLLM worker." \
        "Example: $0 --model Qwen/Qwen3-VL-30B-A3B-Instruct-FP8 --dyn-tool-call-parser hermes"
}

#######################################
# Parse the script's command-line arguments.
# Globals:   MODEL_NAME (written by --model)
#            EXTRA_ARGS (appended with every unrecognised argument)
# Arguments: the script's positional parameters
# Outputs:   help text on stdout for -h/--help; error text on stderr
# Exits:     0 after printing help; 1 when --model is given without a value
#######################################
_agg_mm_parse_args() {
    while (( $# > 0 )); do
        case "$1" in
            --model)
                # Without this check `shift 2` fails and, under `set -e`,
                # the script would die without any message.
                if (( $# < 2 )); then
                    echo "Error: --model requires a value" >&2
                    exit 1
                fi
                MODEL_NAME=$2
                shift 2
                ;;
            -h|--help)
                _agg_mm_usage
                exit 0
                ;;
            --)
                # Separator advertised in the usage line: drop it and forward
                # everything after it verbatim.
                shift
                EXTRA_ARGS+=("$@")
                break
                ;;
            *)
                EXTRA_ARGS+=("$1")
                shift
                ;;
        esac
    done
}

_agg_mm_parse_args "$@"

# Port shown in the launch banner; falls back to 8000 when DYN_HTTP_PORT is unset.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Aggregated Multimodal Serving" "$MODEL_NAME" "$HTTP_PORT"

# Use TCP transport (instead of default NATS)
# TCP is preferred for multimodal workloads because it overcomes:
# - NATS default 1MB max payload limit (multimodal base64 images can exceed this)
export DYN_REQUEST_PLANE=tcp

# Start frontend with Rust OpenAIPreprocessor
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
# Runs in the background so the worker can be started next.
python -m dynamo.frontend &

# ---- Per-model defaults ----
#######################################
# Resolve context length, concurrency and extra worker flags for MODEL_NAME.
# The per-model default is chosen first and the environment override is
# applied last; defaulting MAX_MODEL_LEN before the case statement would make
# the per-model value unreachable.
# Globals:   MODEL_NAME          (read)
#            MAX_MODEL_LEN       (read as override; written)
#            MAX_CONCURRENT_SEQS (read as override; written)
#            MODEL_EXTRA_ARGS    (written; whitespace-separated flag string)
#######################################
_agg_mm_apply_model_defaults() {
    local default_max_model_len=4096
    MODEL_EXTRA_ARGS=""
    case "$MODEL_NAME" in
        meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8)
            default_max_model_len=108960
            MODEL_EXTRA_ARGS="--tensor-parallel-size=8"
            ;;
    esac
    MAX_MODEL_LEN="${MAX_MODEL_LEN:-$default_max_model_len}"
    MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
}

_agg_mm_apply_model_defaults

# GPU memory flags for the vLLM backend, as emitted by build_gpu_mem_args.
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)

# Start vLLM worker with vision model
# (--enforce-eager is not passed here; supply it as an extra argument if wanted)
# Extra args from command line come last to allow overrides
# GPU_MEM_ARGS and MODEL_EXTRA_ARGS are deliberately left unquoted: each holds
# a whitespace-separated list of flags that must be split into separate words.
# NOTE(review): the worker runs in the foreground, so wait_any_exit below is
# only reached after it returns - confirm whether a trailing `&` was intended.
# shellcheck disable=SC2086
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}" \
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT:-8081}" \
    python -m dynamo.vllm --enable-multimodal --model "$MODEL_NAME" \
    --max-model-len "$MAX_MODEL_LEN" \
    --max-num-seqs "$MAX_CONCURRENT_SEQS" \
    $GPU_MEM_ARGS \
    $MODEL_EXTRA_ARGS \
    "${EXTRA_ARGS[@]}"

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit