agg_multimodal.sh 3.88 KB
Newer Older
1
#!/bin/bash
2
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
# SPDX-License-Identifier: Apache-2.0
4
5
6
7
8
9
10
11
12
13
#
# Aggregated multimodal serving with standard Dynamo preprocessing
#
# Architecture: Single-worker PD (Prefill-Decode)
# - Frontend: Rust OpenAIPreprocessor handles image URLs (HTTP and data:// base64)
# - Worker: Standard vLLM worker with vision model support
#
# For EPD (Encode-Prefill-Decode) architecture with dedicated encoding worker,
# see agg_multimodal_epd.sh

14
15
16
17
set -e
trap 'echo Cleaning up...; kill 0' EXIT

# Default values
18
MODEL_NAME="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
19
20

# Parse command line arguments
21
22
# Extra arguments are passed through to the vLLM worker
EXTRA_ARGS=()
23
24
25
26
27
28
29
while [[ $# -gt 0 ]]; do
    case $1 in
        --model)
            MODEL_NAME=$2
            shift 2
            ;;
        -h|--help)
30
            echo "Usage: $0 [OPTIONS] [-- EXTRA_VLLM_ARGS]"
31
            echo "Options:"
32
33
34
35
36
            echo "  --model <model_name>   Specify the VLM model to use (default: $MODEL_NAME)"
            echo "  -h, --help             Show this help message"
            echo ""
            echo "Any additional arguments are passed through to the vLLM worker."
            echo "Example: $0 --model Qwen/Qwen3-VL-30B-A3B-Instruct-FP8 --dyn-tool-call-parser hermes"
37
38
39
            exit 0
            ;;
        *)
40
41
            EXTRA_ARGS+=("$1")
            shift
42
43
44
45
            ;;
    esac
done

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
echo "=========================================="
echo "Launching Aggregated Multimodal Serving"
echo "=========================================="
echo "Model:       $MODEL_NAME"
echo "Frontend:    http://localhost:$HTTP_PORT"
echo "=========================================="
echo ""
echo "Example test command:"
echo ""
echo "  curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo "    -H 'Content-Type: application/json' \\"
echo "    -d '{"
echo "      \"model\": \"${MODEL_NAME}\","
echo "      \"messages\": [{"
echo "        \"role\": \"user\","
echo "        \"content\": ["
echo "          {\"type\": \"text\", \"text\": \"Describe the image.\"},"
echo "          {\"type\": \"image_url\", \"image_url\": {\"url\": \"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/480px-Cat03.jpg\"}}"
echo "        ]"
echo "      }],"
echo "      \"max_tokens\": 50"
echo "    }'"
echo ""
echo "=========================================="

72
73
74
75
76
# Use TCP transport (instead of default NATS)
# TCP is preferred for multimodal workloads because it overcomes:
# - NATS default 1MB max payload limit (multimodal base64 images can exceed this)
export DYN_REQUEST_PLANE=tcp

77
# Start frontend with Rust OpenAIPreprocessor
78
79
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
80

81
# Configure GPU memory optimization for specific models (if no extra args override)
82
MODEL_SPECIFIC_ARGS="--gpu-memory-utilization 0.85 --max-model-len 16384"
83
if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
84
    MODEL_SPECIFIC_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
85
elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
86
    MODEL_SPECIFIC_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
87
88
elif [[ "$MODEL_NAME" == "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" ]]; then
    MODEL_SPECIFIC_ARGS="--tensor-parallel-size=8 --gpu-memory-utilization 0.85 --max-model-len=108960"
89
90
fi

91
92
93
# Start vLLM worker with vision model
# Multimodal data (images) are decoded in the backend worker using ImageLoader
# --enforce-eager: Quick deployment (remove for production)
94
# Extra args from command line come last to allow overrides
95
CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0} \
96
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
97
    python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME $MODEL_SPECIFIC_ARGS "${EXTRA_ARGS[@]}"
98
99
100

# Wait for all background processes to complete
wait
101
102