#!/bin/bash

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Parse command-line arguments

# Defaults; each one can be overridden by the matching command-line flag.
NUM_WORKERS=8
MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
TENSOR_PARALLEL_SIZE=1
USE_MOCKERS=false
USE_TRTLLM=false
MODE="agg"  # Options: agg (default), decode, prefill
BASE_GPU_OFFSET=0
EXTRA_ARGS=()

# parse_args ARG...
#   Walks the given arguments and updates the globals above.
#
#   Recognised flags:
#     --num-workers N, --model-path PATH, --tensor-parallel-size N,
#     --base-gpu-offset N     (each consumes the following argument)
#     --mockers, --trtllm     (engine selection switches)
#     --prefill, --decode     (set MODE)
#     --                      (everything after it is appended to EXTRA_ARGS)
#   Any other argument is appended to EXTRA_ARGS unchanged.
#
#   Exits with status 1 when a value-taking flag is the last argument.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --num-workers|--model-path|--tensor-parallel-size|--base-gpu-offset)
                # `shift 2` with a single remaining argument fails without
                # shifting anything, which would make this loop spin forever;
                # reject the missing value explicitly instead.
                if [[ $# -lt 2 ]]; then
                    echo "Error: $1 requires a value"
                    exit 1
                fi
                case "$1" in
                    --num-workers)          NUM_WORKERS="$2" ;;
                    --model-path)           MODEL_PATH="$2" ;;
                    --tensor-parallel-size) TENSOR_PARALLEL_SIZE="$2" ;;
                    --base-gpu-offset)      BASE_GPU_OFFSET="$2" ;;
                esac
                shift 2
                ;;
            --mockers)
                USE_MOCKERS=true
                shift
                ;;
            --trtllm)
                USE_TRTLLM=true
                shift
                ;;
            --prefill)
                MODE="prefill"
                shift
                ;;
            --decode)
                MODE="decode"
                shift
                ;;
            --)
                shift
                EXTRA_ARGS+=("$@")
                break
                ;;
            *)
                # Collect all other arguments as vLLM/mocker/trtllm arguments
                EXTRA_ARGS+=("$1")
                shift
                ;;
        esac
    done
}

parse_args "$@"

# Count the explicitly requested engines; --mockers and --trtllm cannot be
# combined (vLLM is what runs when neither flag is given).
ENGINE_COUNT=0
if [[ "$USE_MOCKERS" == true ]]; then
    ENGINE_COUNT=$((ENGINE_COUNT + 1))
fi
if [[ "$USE_TRTLLM" == true ]]; then
    ENGINE_COUNT=$((ENGINE_COUNT + 1))
fi
if (( ENGINE_COUNT > 1 )); then
    echo "Error: Only one engine type (--mockers, --trtllm, or default vLLM) can be specified"
    exit 1
fi

# If no extra args provided, use defaults

# apply_default_engine_args
#   Fills EXTRA_ARGS with engine-specific defaults, but only when the caller
#   supplied no extra arguments at all (defaults are never merged with
#   user-provided arguments).
#
#   Reads:  EXTRA_ARGS, USE_MOCKERS, USE_TRTLLM, MODE
#   Writes: EXTRA_ARGS, and ENGINE_CONFIG (TensorRT-LLM branch only)
apply_default_engine_args() {
    if [[ ${#EXTRA_ARGS[@]} -gt 0 ]]; then
        return 0
    fi

    if [[ "$USE_MOCKERS" == true ]]; then
        # Default args for mocker engine (only block-size needed as others are defaults)
        EXTRA_ARGS=(
            "--block-size" "64"
        )
    elif [[ "$USE_TRTLLM" == true ]]; then
        # Default args for TensorRT-LLM engine using predefined YAML configs
        # Config files located at: ../../components/backends/trtllm/engine_configs/{agg,decode,prefill}.yaml
        # NOTE(review): the path is relative to the current working directory,
        # not to this script's location - confirm that is intended.
        case "$MODE" in
            prefill|decode)
                ENGINE_CONFIG="../../components/backends/trtllm/engine_configs/${MODE}.yaml"
                ;;
            *)
                ENGINE_CONFIG="../../components/backends/trtllm/engine_configs/agg.yaml"
                ;;
        esac

        EXTRA_ARGS=(
            "--extra-engine-args" "$ENGINE_CONFIG"
            "--publish-events-and-metrics"
        )
    else
        # Default args for vLLM engine (explicitly include block-size)
        EXTRA_ARGS=(
            "--enforce-eager"
            "--max-num-batched-tokens" "16384"
            "--max-model-len" "32768"
            "--block-size" "64"
        )
    fi
}

apply_default_engine_args

# Validate arguments
#
# Each value must consist solely of decimal digits. After a value passes, it is
# rewritten in canonical base-10 form: Bash arithmetic treats a leading "0" as
# octal, so an input such as "08" would otherwise abort later $(( ))
# expressions, and "010" would silently be read as 8.

# NUM_WORKERS: at least one worker.
if ! [[ "$NUM_WORKERS" =~ ^[0-9]+$ ]] || (( 10#$NUM_WORKERS < 1 )); then
    echo "Error: NUM_WORKERS must be a positive integer"
    exit 1
fi
NUM_WORKERS=$((10#$NUM_WORKERS))

# TENSOR_PARALLEL_SIZE: at least one GPU per worker.
if ! [[ "$TENSOR_PARALLEL_SIZE" =~ ^[0-9]+$ ]] || (( 10#$TENSOR_PARALLEL_SIZE < 1 )); then
    echo "Error: TENSOR_PARALLEL_SIZE must be a positive integer"
    exit 1
fi
TENSOR_PARALLEL_SIZE=$((10#$TENSOR_PARALLEL_SIZE))

# BASE_GPU_OFFSET: zero is allowed.
if ! [[ "$BASE_GPU_OFFSET" =~ ^[0-9]+$ ]]; then
    echo "Error: BASE_GPU_OFFSET must be a non-negative integer"
    exit 1
fi
BASE_GPU_OFFSET=$((10#$BASE_GPU_OFFSET))

# Calculate total GPUs needed

# describe_configuration
#   Derives the GPU totals and the engine label from the parsed settings,
#   then prints the configuration summary to stdout.
#
#   Reads:  NUM_WORKERS, TENSOR_PARALLEL_SIZE, BASE_GPU_OFFSET, USE_MOCKERS,
#           USE_TRTLLM, MODE, MODEL_PATH, EXTRA_ARGS
#   Writes: TOTAL_GPUS_NEEDED, LAST_GPU, ENGINE_TYPE (globals)
describe_configuration() {
    # One contiguous GPU range: TENSOR_PARALLEL_SIZE GPUs per worker,
    # starting at BASE_GPU_OFFSET.
    TOTAL_GPUS_NEEDED=$((NUM_WORKERS * TENSOR_PARALLEL_SIZE))
    LAST_GPU=$((BASE_GPU_OFFSET + TOTAL_GPUS_NEEDED - 1))

    if [[ "$USE_MOCKERS" == true ]]; then
        ENGINE_TYPE="Mocker"
    elif [[ "$USE_TRTLLM" == true ]]; then
        ENGINE_TYPE="TensorRT-LLM"
    else
        ENGINE_TYPE="vLLM"
    fi

    # The trailing empty argument emits the blank line that ends the summary.
    printf '%s\n' \
        "Configuration:" \
        "  Engine Type: $ENGINE_TYPE" \
        "  Mode: $MODE" \
        "  Workers: $NUM_WORKERS" \
        "  Model: $MODEL_PATH" \
        "  Tensor Parallel Size: $TENSOR_PARALLEL_SIZE" \
        "  Total GPUs needed: $TOTAL_GPUS_NEEDED" \
        "  GPU Range: $BASE_GPU_OFFSET-$LAST_GPU" \
        "  Engine args: ${EXTRA_ARGS[*]}" \
        ""
}

describe_configuration

# PIDs of the background workers; one entry is appended per launched worker.
declare -a PIDS=()

# cleanup
#   Signal handler: announces the shutdown, sends SIGTERM to every recorded
#   worker PID, waits for the shell's child processes to finish, then ends
#   the script with status 0.
cleanup() {
    printf '\nStopping all workers...\n'
    # Errors from kill (e.g. a worker that already exited) are discarded.
    kill -TERM "${PIDS[@]}" 2>/dev/null
    wait
    exit 0
}

trap cleanup INT TERM

echo "Starting $NUM_WORKERS $MODE workers..."

# worker_gpu_devices WORKER_INDEX
#   Prints the comma-separated GPU indices assigned to the 1-based worker
#   WORKER_INDEX. Each worker gets TENSOR_PARALLEL_SIZE consecutive GPUs,
#   counted from BASE_GPU_OFFSET.
worker_gpu_devices() {
    local worker_index=$1
    local first_gpu=$(( BASE_GPU_OFFSET + (worker_index - 1) * TENSOR_PARALLEL_SIZE ))
    local last_gpu=$(( first_gpu + TENSOR_PARALLEL_SIZE - 1 ))
    local devices=$first_gpu
    local gpu

    for (( gpu = first_gpu + 1; gpu <= last_gpu; gpu++ )); do
        devices+=",$gpu"
    done
    printf '%s\n' "$devices"
}

# launch_worker WORKER_INDEX
#   Replaces the current shell with the selected engine process via `exec`.
#   It must therefore be run in a background subshell; the subshell's PID
#   then becomes the PID of the engine process itself.
#
#   Reads: MODE, MODEL_PATH, TENSOR_PARALLEL_SIZE, BASE_GPU_OFFSET,
#          USE_MOCKERS, USE_TRTLLM, EXTRA_ARGS
launch_worker() {
    local worker_index=$1
    local label="[${MODE^} Worker-$worker_index]"
    local gpu_devices
    local -a engine_args=()

    echo "$label Starting..."

    if [[ "$USE_MOCKERS" == true ]]; then
        # Run mocker engine (no GPU assignment needed)
        exec python -m dynamo.mocker \
            --model-path "$MODEL_PATH" \
            --endpoint dyn://test.mocker.generate \
            "${EXTRA_ARGS[@]}"
    elif [[ "$USE_TRTLLM" == true ]]; then
        gpu_devices=$(worker_gpu_devices "$worker_index")
        echo "$label Using GPUs: $gpu_devices"
        # Run TensorRT-LLM engine with trtllm-llmapi-launch for proper initialization
        engine_args+=("--model-path" "$MODEL_PATH")
        engine_args+=("--tensor-parallel-size" "$TENSOR_PARALLEL_SIZE")
        if [[ "$MODE" != "agg" ]]; then
            engine_args+=("--disaggregation-mode" "$MODE")
        fi
        engine_args+=("${EXTRA_ARGS[@]}")

        exec env CUDA_VISIBLE_DEVICES="$gpu_devices" \
            trtllm-llmapi-launch python -m dynamo.trtllm \
            "${engine_args[@]}"
    else
        gpu_devices=$(worker_gpu_devices "$worker_index")
        echo "$label Using GPUs: $gpu_devices"
        # Run vLLM engine with PYTHONHASHSEED=0 for deterministic event IDs in KV-aware routing
        engine_args+=("--model" "$MODEL_PATH")
        engine_args+=("--tensor-parallel-size" "$TENSOR_PARALLEL_SIZE")
        if [[ "$MODE" == "prefill" ]]; then
            engine_args+=("--is-prefill-worker")
        fi
        engine_args+=("${EXTRA_ARGS[@]}")

        exec env PYTHONHASHSEED=0 CUDA_VISIBLE_DEVICES="$gpu_devices" \
            python -m dynamo.vllm \
            "${engine_args[@]}"
    fi
}

# `10#` forces a decimal reading of the worker count, so a value with leading
# zeros is counted the same way `seq` would count it.
for (( i = 1; i <= 10#${NUM_WORKERS:-0}; i++ )); do
    launch_worker "$i" &
    PIDS+=("$!")
    echo "Started $MODE worker $i (PID: $!)"
done

echo "All workers started. Press Ctrl+C to stop."
wait
echo "All workers completed."