agg_multimodal.sh 2 KB
Newer Older
1
2
3
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
4
5
6
7
8
9
10
11
12
13
#
# Aggregated multimodal serving with standard Dynamo preprocessing
#
# Architecture: Single-worker PD (Prefill-Decode)
# - Frontend: Rust OpenAIPreprocessor handles image URLs (HTTP and data:// base64)
# - Worker: Standard vLLM worker with vision model support
#
# For EPD (Encode-Prefill-Decode) architecture with dedicated encoding worker,
# see agg_multimodal_epd.sh

14
15
16
17
# Abort the script as soon as any foreground command exits with a non-zero status.
set -e
# On every exit path (normal completion, error under `set -e`, or `exit`),
# print a notice and send SIGTERM to the whole process group (`kill 0`) so that
# processes started in the background by this script are not left running.
# NOTE(review): `kill 0` also signals this shell and anything else sharing its
# process group (e.g. a calling script) — confirm that is acceptable.
trap 'echo Cleaning up...; kill 0' EXIT

# Default values
18
# Model served when --model is not given. Kept separate from MODEL_NAME so the
# help text always reports the real default, even after --model was parsed.
DEFAULT_MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
MODEL_NAME="$DEFAULT_MODEL_NAME"

# Print the usage/help text to stdout.
print_usage() {
    echo "Usage: $0 [OPTIONS]"
    echo "Options:"
    echo "  --model <model_name> Specify the VLM model to use (default: $DEFAULT_MODEL_NAME)"
    echo "  -h, --help           Show this help message"
}

# Parse command line arguments
#
# Globals:   MODEL_NAME (written when --model is supplied)
# Arguments: the script's command line ("$@")
# Outputs:   help text on stdout; diagnostics on stderr
# Exits:     0 after printing help; 1 on an unknown option or when --model
#            is given without a value
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --model)
                # Without this check `shift 2` fails and the script dies
                # silently under `set -e` (or spins forever without it).
                if [[ $# -lt 2 || -z "$2" ]]; then
                    echo "Error: --model requires a <model_name> argument" >&2
                    echo "Use --help for usage information" >&2
                    exit 1
                fi
                MODEL_NAME="$2"
                shift 2
                ;;
            -h|--help)
                print_usage
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                echo "Use --help for usage information" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

42
# Launch the Dynamo frontend (Rust OpenAIPreprocessor) on HTTP port 8000.
# It is backgrounded so the script can go on to start the worker.
frontend_cmd=(python -m dynamo.frontend --http-port=8000)
"${frontend_cmd[@]}" &
44

45
# Configure GPU memory optimization for specific models
#
# Fills the global EXTRA_ARGS array with additional worker flags for the given
# model; the array is left empty for models without a tuned configuration.
# An array (rather than a flat string) keeps each flag a separate word without
# relying on unquoted word-splitting.
# Globals:   EXTRA_ARGS (written)
# Arguments: $1 - model name
select_extra_args() {
    EXTRA_ARGS=()
    case "$1" in
        "Qwen/Qwen2.5-VL-7B-Instruct" | "llava-hf/llava-1.5-7b-hf")
            EXTRA_ARGS=(--gpu-memory-utilization 0.85 --max-model-len 2048)
            ;;
    esac
}

select_extra_args "$MODEL_NAME"

# Start vLLM worker with vision model
# Multimodal data (images) are decoded in the backend worker using ImageLoader
# --enforce-eager: Quick deployment (remove for production)
# --connector none: No KV transfer needed for aggregated serving
# The worker runs in the foreground; DYN_SYSTEM_PORT is set for this command only.
DYN_SYSTEM_PORT=8081 \
    python -m dynamo.vllm --model "$MODEL_NAME" --enforce-eager --connector none "${EXTRA_ARGS[@]}"
59
60
61

# Wait for all background processes to complete.
# The worker above runs in the foreground, so this is reached only after it
# exits; it then blocks until the remaining background jobs (the frontend,
# started with `&`) have finished.
wait
62
63