agg_ec_connector.sh 2.41 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -e

# Tear down everything this script started, whatever the reason for exiting.
# `kill 0` signals every process in the current process group, and that group
# includes this shell. SIGTERM is therefore ignored in the shell first;
# otherwise the shell would be terminated by its own cleanup and report
# status 143 instead of the real exit status (e.g. 0 after --help, 1 after an
# argument error).
cleanup() {
    trap - EXIT          # make sure cleanup runs only once
    echo Cleaning up...
    trap '' TERM         # keep this shell alive through the group-wide signal
    kill 0
}
trap cleanup EXIT

# Default values
MODEL_NAME="llava-hf/llava-1.5-7b-hf"
EC_CONNECTOR_BACKEND="DynamoEcConnector"

# Print the help text to stdout.
# Globals: MODEL_NAME (read) - shown as the default for --model, so the text
#          reflects whatever value is current when help is requested.
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo ""
    echo "Aggregated multimodal serving with ECConnector (ec_both mode)"
    echo ""
    echo "This script launches:"
    echo "  - Frontend server"
    echo "  - Aggregated multimodal worker (ec_both: produces and consumes encoder cache)"
    echo ""
    echo "Options:"
    echo "  --model <model_name>  Specify the VLM model to use (default: $MODEL_NAME)"
    echo "  -h, --help            Show this help message"
    echo ""
    echo "Examples:"
    echo "  $0"
    echo "  $0 --model llava-hf/llava-1.5-7b-hf"
    echo ""
}

# Parse command line arguments.
# Arguments: the script's positional parameters ("$@").
# Globals:   MODEL_NAME (written by --model).
# Exits:     0 after printing help; 1 on an unknown option or when --model
#            is not followed by a non-empty value.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --model)
                # A trailing `--model` leaves only one positional parameter,
                # so `shift 2` would fail: the script would abort silently
                # under `set -e`, or loop forever without it. Reject the
                # missing (or empty) value with an explicit message instead.
                if [[ $# -lt 2 || -z "$2" ]]; then
                    echo "Error: --model requires a value" >&2
                    echo "Use --help for usage information" >&2
                    exit 1
                fi
                MODEL_NAME=$2
                shift 2
                ;;
            -h|--help)
                usage
                exit 0
                ;;
            *)
                # Diagnostics go to stderr so they are not mixed into stdout.
                echo "Unknown option: $1" >&2
                echo "Use --help for usage information" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Startup banner: report the model and ECConnector backend in effect.
banner_rule="=================================================="
printf '%s\n' \
    "$banner_rule" \
    "Aggregated Multimodal Serving (ECConnector ec_both)" \
    "$banner_rule" \
    "Model: $MODEL_NAME" \
    "ECConnector Backend: $EC_CONNECTOR_BACKEND" \
    "$banner_rule"

# GPU assignment: keep DYN_WORKER_GPU from the environment when it is set and
# non-empty, otherwise fall back to 0.
: "${DYN_WORKER_GPU:=0}"

# GPU memory utilization: keep DYN_GPU_MEM from the environment when it is set
# and non-empty, otherwise fall back to 0.85.
: "${DYN_GPU_MEM:=0.85}"

# Start frontend (runs in the background)
echo "Starting frontend..."
python -m dynamo.frontend &

# Start aggregated multimodal worker (ec_both: produces and consumes encoder cache).
# Every expansion is quoted so that a model name/path or other value containing
# spaces or glob characters reaches the worker as one unmodified argument
# rather than being word-split or pathname-expanded by the shell.
echo "Starting aggregated multimodal worker (ec_both) on GPU $DYN_WORKER_GPU (mem: $DYN_GPU_MEM)..."
CUDA_VISIBLE_DEVICES="$DYN_WORKER_GPU" python -m dynamo.vllm \
    --multimodal-worker \
    --enable-multimodal \
    --model "$MODEL_NAME" \
    --enable-mm-embeds \
    --connector none \
    --enforce-eager \
    --gpu-memory-utilization "$DYN_GPU_MEM" \
    --ec-transfer-config "{\"ec_connector\":\"$EC_CONNECTOR_BACKEND\",\"ec_role\":\"ec_both\"}" &

# Keep the script in the foreground until all background processes
# (the frontend and the worker started above) have exited.
wait