#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated multimodal image/video serving with standard Dynamo preprocessing
#
# Architecture: Single-worker PD (Prefill-Decode)
# - Frontend: Rust OpenAIPreprocessor forwards multimodal requests
# - Worker: Standard vLLM worker with multimodal model support
#
# For EPD (Encode-Prefill-Decode) architecture with dedicated encoding worker,
# see agg_multimodal_epd.sh

# Abort as soon as any command fails.
set -e
# On every exit path, signal the whole process group (kill 0) so that
# processes started in the background by this script are torn down with it.
trap 'echo Cleaning up...; kill 0' EXIT

# Absolute directory of this script, used to locate the shared helper files.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Shared helpers; presumably these define the GPU-memory and banner/wait
# functions called further down (their contents are not visible from here).
source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"

# Default values
# DYN_MODEL_NAME, when set and non-empty, replaces the built-in default model.
MODEL_NAME="${DYN_MODEL_NAME:-Qwen/Qwen3-VL-30B-A3B-Instruct-FP8}"

# Parse command line arguments
# Extra arguments are passed through to the vLLM worker
EXTRA_ARGS=()
# Option parsing:
#   --model <name>  overrides MODEL_NAME
#   -h | --help     prints usage and exits with status 0
#   anything else   is appended, in order, to EXTRA_ARGS
# NOTE(review): a literal "--" is not consumed here; it is appended to
# EXTRA_ARGS like any other unrecognised argument.
while [[ $# -gt 0 ]]; do
    case "$1" in
        --model)
            # Report a missing value explicitly instead of depending on
            # `shift 2` returning non-zero (silent exit under `set -e`,
            # endless loop without it).
            if [[ $# -lt 2 ]]; then
                echo "Error: --model requires a <model_name> argument" >&2
                exit 1
            fi
            MODEL_NAME="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS] [-- EXTRA_VLLM_ARGS]"
            echo "Options:"
            echo "  --model <model_name>   Specify the VLM model to use (default: $MODEL_NAME)"
            echo "  -h, --help             Show this help message"
            echo ""
            echo "Any additional arguments are passed through to the vLLM worker."
            echo "Example: $0 --model Qwen/Qwen3-VL-30B-A3B-Instruct-FP8 --dyn-tool-call-parser hermes"
            exit 0
            ;;
        *)
            EXTRA_ARGS+=("$1")
            shift
            ;;
    esac
done

# HTTP port used in the launch banner and the example curl commands
# (taken from DYN_HTTP_PORT, falling back to 8000).
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

# Use TCP transport (instead of default NATS)
# TCP is preferred for multimodal workloads because it avoids the NATS
# default 1MB max payload limit (multimodal base64 images can exceed this).
export DYN_REQUEST_PLANE=tcp

# Print the launch banner. --no-curl is passed because the example requests
# are supplied separately through print_curl_footer below.
print_launch_banner --no-curl "Launching Aggregated Multimodal Serving" "$MODEL_NAME" "$HTTP_PORT" \
    "Backend:     dynamo.vllm --enable-multimodal" \
    "Media:       image_url and video_url (model support dependent)"

# Example requests, fed to print_curl_footer on stdin. The heredoc delimiter is
# unquoted, so ${HTTP_PORT} and ${MODEL_NAME} are expanded and each "\\" is
# emitted as a single backslash (a shell line continuation in the example).
print_curl_footer <<CURL
  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
    -H 'Content-Type: application/json' \\
    -d '{
      "model": "${MODEL_NAME}",
      "messages": [{"role": "user", "content": [
        {"type": "text", "text": "Describe the image"},
        {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/300px-PNG_transparency_demonstration_1.png"}}
      ]}],
      "max_tokens": 50
    }'

  # For video-capable models such as Qwen/Qwen3-VL-2B-Instruct:
  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
    -H 'Content-Type: application/json' \\
    -d '{
      "model": "Qwen/Qwen3-VL-2B-Instruct",
      "messages": [{"role": "user", "content": [
        {"type": "text", "text": "Describe the video in detail"},
        {"type": "video_url", "video_url": {"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"}}
      ]}],
      "max_tokens": 128
    }'
CURL

# Start frontend with Rust OpenAIPreprocessor
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
# Started in the background (&) so the script can go on to launch the worker.
python -m dynamo.frontend &

# ---- Per-model defaults ----
# MAX_MODEL_LEN and MAX_CONCURRENT_SEQS may be pre-set in the environment;
# otherwise they fall back to 4096 and 2 respectively.
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# Additional worker flags specific to the selected model (none by default).
MODEL_EXTRA_ARGS=""
case "$MODEL_NAME" in
    meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8)
        # Replaces MAX_MODEL_LEN unconditionally, including a value that came
        # from the environment above.
        MAX_MODEL_LEN="108960"
        MODEL_EXTRA_ARGS="--tensor-parallel-size=8"
        ;;
esac

# GPU-memory flags for vLLM, as produced by the build_gpu_mem_args helper.
# The result is expanded unquoted below on purpose so it can contribute zero
# or more separate arguments.
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)

# Start vLLM worker with vision model
# NOTE(review): an earlier comment here described --enforce-eager ("quick
# deployment, remove for production"), but the flag is not on this command
# line; pass it as an extra argument if it is wanted.
# CUDA_VISIBLE_DEVICES defaults to GPU 0 only; a model whose MODEL_EXTRA_ARGS
# asks for several GPUs needs CUDA_VISIBLE_DEVICES set by the caller.
# Extra args from command line come last to allow overrides
# NOTE(review): the worker runs in the foreground (no trailing &), so
# wait_any_exit below is reached only after the worker exits with status 0 -
# confirm this is intended.
# shellcheck disable=SC2086  # GPU_MEM_ARGS / MODEL_EXTRA_ARGS are word-split deliberately
CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0} \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
    python -m dynamo.vllm --enable-multimodal --model "$MODEL_NAME" \
    --max-model-len "$MAX_MODEL_LEN" \
    --max-num-seqs "$MAX_CONCURRENT_SEQS" \
    $GPU_MEM_ARGS \
    $MODEL_EXTRA_ARGS \
    "${EXTRA_ARGS[@]}"

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit