run_engines.sh 9.16 KB
Newer Older
Yan Ru Pei's avatar
Yan Ru Pei committed
1
2
#!/bin/bash

3
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Yan Ru Pei's avatar
Yan Ru Pei committed
4
5
6
# SPDX-License-Identifier: Apache-2.0

# Default configuration. The command-line flags parsed further down override
# these values; DYNAMO_HOME may also be supplied through the environment.
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}

# Number of workers to launch (--num-workers)
NUM_WORKERS=8
# Model handed to the selected engine (--model-path)
MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
# Directory containing the TensorRT-LLM engine configs ({agg,decode,prefill}.yaml)
ENGINE_CONFIG_PATH="$DYNAMO_HOME/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b"
# Parallelism per worker (--tensor-parallel-size / --data-parallel-size)
TENSOR_PARALLEL_SIZE=1
DATA_PARALLEL_SIZE=1
# Engine selection: vLLM is used unless --mockers or --trtllm is given
USE_MOCKERS=false
USE_TRTLLM=false
MODE="agg"  # Options: agg (default), decode, prefill
# Index of the first GPU assigned to worker 1 (--base-gpu-offset)
BASE_GPU_OFFSET=0
# Forwarded to the mocker as --reasoning when non-empty (--reasoning)
REASONING=""
# Engine arguments collected from unrecognized flags and everything after "--"
EXTRA_ARGS=()

# Parse arguments
#
# parse_args ARG...
#   Recognized flags update the configuration globals (NUM_WORKERS, MODEL_PATH,
#   TENSOR_PARALLEL_SIZE, DATA_PARALLEL_SIZE, USE_MOCKERS, USE_TRTLLM, MODE,
#   BASE_GPU_OFFSET, REASONING). Unrecognized arguments, and everything after a
#   literal "--", are appended to EXTRA_ARGS in the order given.
#   Exits with status 1 if a value-taking flag is the last argument.
parse_args() {
    while [[ $# -gt 0 ]]; do
        # A value-taking flag with nothing after it would make `shift 2` fail
        # WITHOUT shifting, so $# would never reach 0 and this loop would spin
        # forever. Reject that case up front.
        case "$1" in
            --num-workers | --model-path | --tensor-parallel-size | \
            --data-parallel-size | --base-gpu-offset | --reasoning)
                if [[ $# -lt 2 ]]; then
                    echo "Error: $1 requires a value" >&2
                    exit 1
                fi
                ;;
        esac

        case "$1" in
            --num-workers)
                NUM_WORKERS="$2"
                shift 2
                ;;
            --model-path)
                MODEL_PATH="$2"
                shift 2
                ;;
            --tensor-parallel-size)
                TENSOR_PARALLEL_SIZE="$2"
                shift 2
                ;;
            --data-parallel-size)
                DATA_PARALLEL_SIZE="$2"
                shift 2
                ;;
            --mockers)
                USE_MOCKERS=true
                shift
                ;;
            --trtllm)
                USE_TRTLLM=true
                shift
                ;;
            --prefill)
                MODE="prefill"
                shift
                ;;
            --decode)
                MODE="decode"
                shift
                ;;
            --base-gpu-offset)
                BASE_GPU_OFFSET="$2"
                shift 2
                ;;
            --reasoning)
                REASONING="$2"
                shift 2
                ;;
            --)
                # Everything after "--" is passed through to the engine untouched
                shift
                EXTRA_ARGS+=("$@")
                break
                ;;
            *)
                # Collect all other arguments as vLLM/mocker/trtllm arguments
                EXTRA_ARGS+=("$1")
                shift
                ;;
        esac
    done
}

parse_args "$@"

76
77
78
79
80
81
82
83
84
# Reject conflicting engine flags: count how many non-default engines were
# requested and bail out when more than one was.
ENGINE_COUNT=0
if [[ "$USE_MOCKERS" == true ]]; then
    ENGINE_COUNT=$((ENGINE_COUNT + 1))
fi
if [[ "$USE_TRTLLM" == true ]]; then
    ENGINE_COUNT=$((ENGINE_COUNT + 1))
fi
if (( ENGINE_COUNT > 1 )); then
    echo "Error: Only one engine type (--mockers, --trtllm, or default vLLM) can be specified"
    exit 1
fi

Yan Ru Pei's avatar
Yan Ru Pei committed
85
86
87
88
89
90
91
# set_default_extra_args
#   Overwrite EXTRA_ARGS with the built-in defaults for the selected engine.
#   Reads USE_MOCKERS, USE_TRTLLM, MODE and ENGINE_CONFIG_PATH; for TensorRT-LLM
#   it also sets ENGINE_CONFIG to the YAML file chosen for the current MODE.
set_default_extra_args() {
    if [ "$USE_MOCKERS" = true ]; then
        # Default args for mocker engine (only block-size needed as others are defaults)
        EXTRA_ARGS=(
            "--block-size" "64"
        )
    elif [ "$USE_TRTLLM" = true ]; then
        # Default args for TensorRT-LLM engine using predefined YAML configs
        # Config files located at: $ENGINE_CONFIG_PATH/{agg,decode,prefill}.yaml
        case "$MODE" in
            prefill | decode)
                ENGINE_CONFIG="$ENGINE_CONFIG_PATH/$MODE.yaml"
                ;;
            *)
                ENGINE_CONFIG="$ENGINE_CONFIG_PATH/agg.yaml"
                ;;
        esac

        EXTRA_ARGS=(
            "--extra-engine-args" "$ENGINE_CONFIG"
            "--publish-events-and-metrics"
        )
    else
        # Default args for vLLM engine (explicitly include block-size)
        EXTRA_ARGS=(
            "--enforce-eager"
            "--max-num-batched-tokens" "16384"
            "--max-model-len" "32768"
            "--block-size" "64"
        )
    fi
}

# If no extra args provided, use defaults
if [ ${#EXTRA_ARGS[@]} -eq 0 ]; then
    set_default_extra_args
fi

# Validate arguments
#
# Every numeric option must be a plain decimal integer. Once a value has passed
# its check it is re-assigned in base 10: without that, a value with a leading
# zero (e.g. "08" or "010") would be read as octal by the $(( )) arithmetic that
# consumes these variables later in the script, causing either an arithmetic
# error or a silently wrong GPU index.
if ! [[ "$NUM_WORKERS" =~ ^[0-9]+$ ]] || [ "$NUM_WORKERS" -lt 1 ]; then
    echo "Error: NUM_WORKERS must be a positive integer"
    exit 1
fi
NUM_WORKERS=$((10#$NUM_WORKERS))

if ! [[ "$TENSOR_PARALLEL_SIZE" =~ ^[0-9]+$ ]] || [ "$TENSOR_PARALLEL_SIZE" -lt 1 ]; then
    echo "Error: TENSOR_PARALLEL_SIZE must be a positive integer"
    exit 1
fi
TENSOR_PARALLEL_SIZE=$((10#$TENSOR_PARALLEL_SIZE))

if ! [[ "$DATA_PARALLEL_SIZE" =~ ^[0-9]+$ ]] || [ "$DATA_PARALLEL_SIZE" -lt 1 ]; then
    echo "Error: DATA_PARALLEL_SIZE must be a positive integer"
    exit 1
fi
DATA_PARALLEL_SIZE=$((10#$DATA_PARALLEL_SIZE))

# The offset may be zero, so only the "digits only" check applies here
if ! [[ "$BASE_GPU_OFFSET" =~ ^[0-9]+$ ]]; then
    echo "Error: BASE_GPU_OFFSET must be a non-negative integer"
    exit 1
fi
BASE_GPU_OFFSET=$((10#$BASE_GPU_OFFSET))

Yan Ru Pei's avatar
Yan Ru Pei committed
139
140
141
# GPU budget: each worker is allotted TP * DP GPUs, and the whole run needs
# that amount once per worker.
(( GPUS_PER_WORKER = TENSOR_PARALLEL_SIZE * DATA_PARALLEL_SIZE ))
(( TOTAL_GPUS_NEEDED = NUM_WORKERS * GPUS_PER_WORKER ))
Yan Ru Pei's avatar
Yan Ru Pei committed
142
# Index of the highest GPU used by this run (only shown in the summary)
(( LAST_GPU = BASE_GPU_OFFSET + TOTAL_GPUS_NEEDED - 1 ))
Yan Ru Pei's avatar
Yan Ru Pei committed
143
# Print a human-readable summary of the effective configuration before launch.
echo "Configuration:"
# Map the engine flags to a display name; vLLM is the default engine
if [ "$USE_MOCKERS" = true ]; then
    ENGINE_TYPE="Mocker"
elif [ "$USE_TRTLLM" = true ]; then
    ENGINE_TYPE="TensorRT-LLM"
else
    ENGINE_TYPE="vLLM"
fi
echo "  Engine Type: $ENGINE_TYPE"
echo "  Mode: $MODE"
echo "  Workers: $NUM_WORKERS"
echo "  Model: $MODEL_PATH"
echo "  Tensor Parallel Size: $TENSOR_PARALLEL_SIZE"
echo "  Data Parallel Size: $DATA_PARALLEL_SIZE"
echo "  GPUs per worker: $GPUS_PER_WORKER"
echo "  Total GPUs needed: $TOTAL_GPUS_NEEDED"
echo "  GPU Range: $BASE_GPU_OFFSET-$LAST_GPU"
# [*] joins the array into one space-separated string for display only
echo "  Engine args: ${EXTRA_ARGS[*]}"
echo ""

PIDS=()

cleanup() {
    echo -e "\nStopping all workers..."
    kill "${PIDS[@]}" 2>/dev/null
    wait
    exit 0
}

trap cleanup SIGINT SIGTERM

174
echo "Starting $NUM_WORKERS $MODE workers..."

# capitalize_first WORD
#   Print WORD with its first character upper-cased (e.g. "agg" -> "Agg").
#   Uses tr instead of the GNU-only sed "\U" escape.
capitalize_first() {
    local head
    head=$(tr '[:lower:]' '[:upper:]' <<< "${1:0:1}")
    printf '%s%s\n' "$head" "${1:1}"
}

# gpu_device_list FIRST COUNT
#   Print COUNT consecutive GPU indices starting at FIRST as a comma-separated
#   list suitable for CUDA_VISIBLE_DEVICES (e.g. "2 3" -> "2,3,4").
gpu_device_list() {
    local first=$1 count=$2
    local gpu devices=""
    for ((gpu = first; gpu < first + count; gpu++)); do
        devices+="${devices:+,}${gpu}"
    done
    printf '%s\n' "$devices"
}

# build_mocker_args
#   Populate MOCKER_ARGS for the single mocker process from MODEL_PATH,
#   NUM_WORKERS, MODE, DATA_PARALLEL_SIZE, REASONING and EXTRA_ARGS.
build_mocker_args() {
    MOCKER_ARGS=("--model-path" "$MODEL_PATH" "--num-workers" "$NUM_WORKERS")

    # Set endpoint based on worker mode
    case "$MODE" in
        prefill)
            MOCKER_ARGS+=("--endpoint" "dyn://test.prefill.generate" "--is-prefill-worker")
            ;;
        decode)
            MOCKER_ARGS+=("--endpoint" "dyn://test.mocker.generate" "--is-decode-worker")
            ;;
        *)
            MOCKER_ARGS+=("--endpoint" "dyn://test.mocker.generate")
            ;;
    esac

    if [ "$DATA_PARALLEL_SIZE" -gt 1 ]; then
        MOCKER_ARGS+=("--data-parallel-size" "$DATA_PARALLEL_SIZE")
    fi
    if [ -n "$REASONING" ]; then
        MOCKER_ARGS+=("--reasoning" "$REASONING")
    fi
    MOCKER_ARGS+=("${EXTRA_ARGS[@]}")
}

# build_trtllm_args
#   Populate TRTLLM_ARGS for one TensorRT-LLM worker. --disaggregation-mode is
#   only passed for the non-default (prefill/decode) modes.
build_trtllm_args() {
    TRTLLM_ARGS=("--model-path" "$MODEL_PATH" "--tensor-parallel-size" "$TENSOR_PARALLEL_SIZE")
    if [ "$MODE" != "agg" ]; then
        TRTLLM_ARGS+=("--disaggregation-mode" "$MODE")
    fi
    TRTLLM_ARGS+=("${EXTRA_ARGS[@]}")
}

# build_vllm_args WORKER_INDEX
#   Populate VLLM_ARGS for vLLM worker WORKER_INDEX (1-based). The worker
#   publishes KV events over ZMQ on TCP port 20080 + WORKER_INDEX.
build_vllm_args() {
    local worker_index=$1
    VLLM_ARGS=("--model" "$MODEL_PATH" "--tensor-parallel-size" "$TENSOR_PARALLEL_SIZE")
    if [ "$DATA_PARALLEL_SIZE" -gt 1 ]; then
        VLLM_ARGS+=("--data-parallel-size" "$DATA_PARALLEL_SIZE")
    fi
    if [ "$MODE" = "prefill" ]; then
        VLLM_ARGS+=("--is-prefill-worker")
    elif [ "$MODE" = "decode" ]; then
        VLLM_ARGS+=("--is-decode-worker")
    fi
    VLLM_ARGS+=("${EXTRA_ARGS[@]}")
    VLLM_ARGS+=("--kv-events-config" "{\"publisher\":\"zmq\",\"topic\":\"kv-events\",\"endpoint\":\"tcp://*:$((20080 + worker_index))\",\"enable_kv_cache_events\":true}")
}

# launch_worker WORKER_INDEX
#   Replace the current (background sub)shell with the engine process for
#   worker WORKER_INDEX, pinned to its own slice of GPUs. Must be invoked with
#   a trailing "&": because of exec, the PID of the background job is then the
#   PID of the engine process itself.
launch_worker() {
    local worker_index=$1
    local start_gpu gpu_devices

    echo "[$MODE_CAPITALIZED Worker-$worker_index] Starting..."

    # Calculate GPU indices for this worker (with base offset)
    # Each worker needs TP * DP GPUs
    start_gpu=$(( BASE_GPU_OFFSET + (worker_index - 1) * GPUS_PER_WORKER ))
    gpu_devices=$(gpu_device_list "$start_gpu" "$GPUS_PER_WORKER")

    echo "[$MODE_CAPITALIZED Worker-$worker_index] Using GPUs: $gpu_devices"
    if [ "$USE_TRTLLM" = true ]; then
        # Run TensorRT-LLM engine
        build_trtllm_args
        exec env CUDA_VISIBLE_DEVICES="$gpu_devices" trtllm-llmapi-launch python3 -m dynamo.trtllm \
            "${TRTLLM_ARGS[@]}"
    else
        # Run vLLM engine with PYTHONHASHSEED=0 for deterministic event IDs in KV-aware routing
        # NOTE(review): the KV-event ports (20080 + i) and the NIXL side-channel
        # ports (20096 + i) overlap once there are more than 16 workers
        # (worker 17's KV-event port equals worker 1's side-channel port).
        build_vllm_args "$worker_index"
        exec env PYTHONHASHSEED=0 CUDA_VISIBLE_DEVICES="$gpu_devices" VLLM_NIXL_SIDE_CHANNEL_PORT=$((20096 + worker_index)) python3 -m dynamo.vllm \
            "${VLLM_ARGS[@]}"
    fi
}

# Log prefix shared by all workers; computed once instead of once per worker
MODE_CAPITALIZED=$(capitalize_first "$MODE")

if [ "$USE_MOCKERS" = true ]; then
    # For mockers, launch a single process with --num-workers
    # All workers share the same tokio runtime and thread pool
    echo "[$MODE_CAPITALIZED Mocker] Starting $NUM_WORKERS workers in single process..."

    build_mocker_args

    # NOTE(review): this uses "python" while the other engines use "python3" - confirm intended
    python -m dynamo.mocker "${MOCKER_ARGS[@]}" &
    PIDS+=("$!")
    echo "Started mocker with $NUM_WORKERS workers (PID: $!)"
else
    # For vLLM and TensorRT-LLM, launch one separate process per worker
    for ((i = 1; i <= NUM_WORKERS; i++)); do
        launch_worker "$i" &
        PIDS+=("$!")
        echo "Started $MODE worker $i (PID: $!)"

        # Add delay between TensorRT-LLM worker launches to avoid MPI initialization conflicts
        if [ "$USE_TRTLLM" = true ] && [ "$i" -lt "$NUM_WORKERS" ]; then
            echo "Waiting 2 seconds before launching next TensorRT-LLM worker..."
            sleep 2
        fi
    done
fi
Yan Ru Pei's avatar
Yan Ru Pei committed
278
279
280

echo "All workers started. Press Ctrl+C to stop."

# wait_for_workers
#   Wait for every PID recorded in PIDS and report each one that exited with a
#   non-zero status on stderr. Returns 1 if any worker failed, 0 otherwise.
#   A bare `wait` always returns 0, which would hide crashed workers.
wait_for_workers() {
    local pid status failed=0
    for pid in "${PIDS[@]}"; do
        wait "$pid"
        status=$?
        if [ "$status" -ne 0 ]; then
            echo "Error: worker process $pid exited with status $status" >&2
            failed=1
        fi
    done
    # Also reap any remaining background children not recorded in PIDS
    wait
    return "$failed"
}

if ! wait_for_workers; then
    exit 1
fi
echo "All workers completed."