#!/bin/bash

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Default configuration. Each value can be overridden by the matching
# command-line flag handled by the argument parser further down.
NUM_WORKERS=8            # --num-workers: how many worker processes to launch
MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"  # --model-path
TENSOR_PARALLEL_SIZE=1   # --tensor-parallel-size
DATA_PARALLEL_SIZE=1     # --data-parallel-size
USE_MOCKERS=false        # --mockers: run dynamo.mocker instead of a real engine
USE_TRTLLM=false         # --trtllm: run dynamo.trtllm instead of dynamo.vllm
MODE="agg"  # Options: agg (default), decode, prefill
BASE_GPU_OFFSET=0        # --base-gpu-offset: index of the first GPU handed out
EXTRA_ARGS=()            # Unrecognised arguments, forwarded verbatim to the engine

#######################################
# Parse command-line flags into the script's configuration variables.
# Globals (written):
#   NUM_WORKERS, MODEL_PATH, TENSOR_PARALLEL_SIZE, DATA_PARALLEL_SIZE,
#   BASE_GPU_OFFSET, USE_MOCKERS, USE_TRTLLM, MODE, EXTRA_ARGS
# Arguments:
#   The script's command-line arguments ("$@").
# Outputs:
#   An error message when a value-taking flag has no value after it.
# Returns:
#   Does not return on a missing flag value; exits the script with status 1.
#######################################
parse_args() {
    while [[ $# -gt 0 ]]; do
        # A value-taking flag given as the very last argument would make
        # `shift 2` fail without consuming anything, so this loop would
        # never terminate. Reject that case up front.
        case "$1" in
            --num-workers | --model-path | --tensor-parallel-size | \
                --data-parallel-size | --base-gpu-offset)
                if [[ $# -lt 2 ]]; then
                    echo "Error: $1 requires a value"
                    exit 1
                fi
                ;;
        esac

        case "$1" in
            --num-workers)
                NUM_WORKERS="$2"
                shift 2
                ;;
            --model-path)
                MODEL_PATH="$2"
                shift 2
                ;;
            --tensor-parallel-size)
                TENSOR_PARALLEL_SIZE="$2"
                shift 2
                ;;
            --data-parallel-size)
                DATA_PARALLEL_SIZE="$2"
                shift 2
                ;;
            --mockers)
                USE_MOCKERS=true
                shift
                ;;
            --trtllm)
                USE_TRTLLM=true
                shift
                ;;
            --prefill)
                MODE="prefill"
                shift
                ;;
            --decode)
                MODE="decode"
                shift
                ;;
            --base-gpu-offset)
                BASE_GPU_OFFSET="$2"
                shift 2
                ;;
            --)
                # Everything after a bare "--" is forwarded untouched
                shift
                EXTRA_ARGS+=("$@")
                break
                ;;
            *)
                # Collect all other arguments as vLLM/mocker/trtllm arguments
                EXTRA_ARGS+=("$1")
                shift
                ;;
        esac
    done
}

parse_args "$@"
# Reject conflicting engine selections: count how many non-default engine
# flags were turned on and refuse to continue if it is more than one.
ENGINE_COUNT=0
if [[ "$USE_MOCKERS" == true ]]; then
    ENGINE_COUNT=$((ENGINE_COUNT + 1))
fi
if [[ "$USE_TRTLLM" == true ]]; then
    ENGINE_COUNT=$((ENGINE_COUNT + 1))
fi
if ((ENGINE_COUNT > 1)); then
    echo "Error: Only one engine type (--mockers, --trtllm, or default vLLM) can be specified"
    exit 1
fi

# If no extra args provided, fall back to per-engine defaults
if [ ${#EXTRA_ARGS[@]} -eq 0 ]; then
    if [ "$USE_MOCKERS" = true ]; then
        # Default args for mocker engine (only block-size needed as others are defaults)
        EXTRA_ARGS=(
            "--block-size" "64"
        )
    elif [ "$USE_TRTLLM" = true ]; then
        # Default args for TensorRT-LLM engine using predefined YAML configs
        # Config files located at: ../../components/backends/trtllm/engine_configs/{agg,decode,prefill}.yaml
        # The path is relative, so it resolves against the current working directory.
        case "$MODE" in
            prefill | decode)
                ENGINE_CONFIG="../../components/backends/trtllm/engine_configs/${MODE}.yaml"
                ;;
            *)
                # Any other mode uses the aggregated config
                ENGINE_CONFIG="../../components/backends/trtllm/engine_configs/agg.yaml"
                ;;
        esac

        EXTRA_ARGS=(
            "--extra-engine-args" "$ENGINE_CONFIG"
            "--publish-events-and-metrics"
        )
    else
        # Default args for vLLM engine (explicitly include block-size)
        EXTRA_ARGS=(
            "--enforce-eager"
            "--max-num-batched-tokens" "16384"
            "--max-model-len" "32768"
            "--block-size" "64"
        )
    fi
fi

# Validate arguments

#######################################
# Abort the script unless a value is a plain decimal integer that is at
# least the given minimum.
# Arguments:
#   $1 - variable name shown in the error message
#   $2 - value to check
#   $3 - smallest accepted value (0 => "non-negative", otherwise "positive")
# Outputs:
#   An error message when the value is rejected.
# Returns:
#   0 when the value is accepted; exits the script with status 1 otherwise.
#######################################
require_integer() {
    local name=$1 value=$2 minimum=$3
    local kind="positive"
    if ((minimum == 0)); then
        kind="non-negative"
    fi
    # The comparison only runs once the regex has proven the value is all
    # digits; 10# keeps a leading zero from being read as octal.
    if ! [[ "$value" =~ ^[0-9]+$ ]] || ((10#$value < minimum)); then
        echo "Error: $name must be a $kind integer"
        exit 1
    fi
}

require_integer NUM_WORKERS "$NUM_WORKERS" 1
require_integer TENSOR_PARALLEL_SIZE "$TENSOR_PARALLEL_SIZE" 1
require_integer DATA_PARALLEL_SIZE "$DATA_PARALLEL_SIZE" 1
require_integer BASE_GPU_OFFSET "$BASE_GPU_OFFSET" 0

# Normalise to base 10 so inputs such as "08" do not break later $(( ))
# arithmetic, which would otherwise reject them as invalid octal numbers.
NUM_WORKERS=$((10#$NUM_WORKERS))
TENSOR_PARALLEL_SIZE=$((10#$TENSOR_PARALLEL_SIZE))
DATA_PARALLEL_SIZE=$((10#$DATA_PARALLEL_SIZE))
BASE_GPU_OFFSET=$((10#$BASE_GPU_OFFSET))

# Calculate total GPUs needed (TP * DP per worker)
GPUS_PER_WORKER=$((TENSOR_PARALLEL_SIZE * DATA_PARALLEL_SIZE))
TOTAL_GPUS_NEEDED=$((NUM_WORKERS * GPUS_PER_WORKER))
# Index of the last GPU in the contiguous range starting at BASE_GPU_OFFSET
LAST_GPU=$((BASE_GPU_OFFSET + TOTAL_GPUS_NEEDED - 1))

# Print a summary of the effective configuration before launching anything
echo "Configuration:"
if [ "$USE_MOCKERS" = true ]; then
    ENGINE_TYPE="Mocker"
elif [ "$USE_TRTLLM" = true ]; then
    ENGINE_TYPE="TensorRT-LLM"
else
    ENGINE_TYPE="vLLM"
fi
echo "  Engine Type: $ENGINE_TYPE"
echo "  Mode: $MODE"
echo "  Workers: $NUM_WORKERS"
echo "  Model: $MODEL_PATH"
echo "  Tensor Parallel Size: $TENSOR_PARALLEL_SIZE"
echo "  Data Parallel Size: $DATA_PARALLEL_SIZE"
echo "  GPUs per worker: $GPUS_PER_WORKER"
echo "  Total GPUs needed: $TOTAL_GPUS_NEEDED"
echo "  GPU Range: $BASE_GPU_OFFSET-$LAST_GPU"
echo "  Engine args: ${EXTRA_ARGS[*]}"
echo ""

# PIDs of the background workers started by this script
PIDS=()

# Signal handler: announce shutdown, signal every recorded worker, reap
# whatever children remain, then exit with status 0.
cleanup() {
    printf '\nStopping all workers...\n'
    # Anything kill writes to stderr is discarded
    kill "${PIDS[@]}" 2>/dev/null
    wait
    exit 0
}

# Run cleanup on Ctrl+C (SIGINT) or a termination request (SIGTERM)
trap cleanup INT TERM

#######################################
# Upper-case the first character of a word (e.g. "prefill" -> "Prefill").
# Uses tr rather than sed's \U, which is a GNU-only extension.
# Arguments:
#   $1 - word to capitalize
# Outputs:
#   The capitalized word on stdout.
#######################################
capitalize_first() {
    local word=$1 head
    head=$(printf '%s' "${word:0:1}" | tr '[:lower:]' '[:upper:]')
    printf '%s%s\n' "$head" "${word:1}"
}

#######################################
# Build a comma-separated list of consecutive GPU indices.
# Arguments:
#   $1 - first GPU index
#   $2 - number of GPUs (expected to be at least 1)
# Outputs:
#   e.g. "2,3,4" for arguments 2 3, on stdout.
#######################################
gpu_device_list() {
    local first=$1 count=$2 list=$1 gpu
    for ((gpu = first + 1; gpu < first + count; gpu++)); do
        list+=",$gpu"
    done
    printf '%s\n' "$list"
}

echo "Starting $NUM_WORKERS $MODE workers..."

# Same for every worker, so compute it once outside the loop
MODE_CAPITALIZED=$(capitalize_first "$MODE")

for ((i = 1; i <= NUM_WORKERS; i++)); do
    # Each worker runs in its own background subshell; the final exec
    # replaces that subshell, so $! is the PID of the engine process itself.
    {
        echo "[$MODE_CAPITALIZED Worker-$i] Starting..."

        # Calculate GPU indices for this worker (with base offset)
        # Each worker needs TP * DP GPUs
        START_GPU=$((BASE_GPU_OFFSET + (i - 1) * GPUS_PER_WORKER))

        # Build CUDA_VISIBLE_DEVICES string for all GPUs (TP * DP)
        GPU_DEVICES=$(gpu_device_list "$START_GPU" "$GPUS_PER_WORKER")

        if [ "$USE_MOCKERS" = true ]; then
            # Run mocker engine (no GPU assignment needed)
            MOCKER_ARGS=()
            MOCKER_ARGS+=("--model-path" "$MODEL_PATH")
            MOCKER_ARGS+=("--endpoint" "dyn://test.mocker.generate")
            if [ "$DATA_PARALLEL_SIZE" -gt 1 ]; then
                MOCKER_ARGS+=("--data-parallel-size" "$DATA_PARALLEL_SIZE")
            fi
            if [ "$MODE" = "prefill" ]; then
                MOCKER_ARGS+=("--is-prefill-worker")
            elif [ "$MODE" = "decode" ]; then
                MOCKER_ARGS+=("--is-decode-worker")
            fi
            MOCKER_ARGS+=("${EXTRA_ARGS[@]}")

            exec python -m dynamo.mocker "${MOCKER_ARGS[@]}"
        elif [ "$USE_TRTLLM" = true ]; then
            echo "[$MODE_CAPITALIZED Worker-$i] Using GPUs: $GPU_DEVICES"
            # Run TensorRT-LLM engine with trtllm-llmapi-launch for proper initialization
            TRTLLM_ARGS=()
            TRTLLM_ARGS+=("--model-path" "$MODEL_PATH")
            TRTLLM_ARGS+=("--tensor-parallel-size" "$TENSOR_PARALLEL_SIZE")
            if [ "$MODE" != "agg" ]; then
                TRTLLM_ARGS+=("--disaggregation-mode" "$MODE")
            fi
            TRTLLM_ARGS+=("${EXTRA_ARGS[@]}")

            exec env CUDA_VISIBLE_DEVICES="$GPU_DEVICES" trtllm-llmapi-launch python -m dynamo.trtllm \
                "${TRTLLM_ARGS[@]}"
        else
            echo "[$MODE_CAPITALIZED Worker-$i] Using GPUs: $GPU_DEVICES"
            # Run vLLM engine with PYTHONHASHSEED=0 for deterministic event IDs in KV-aware routing
            VLLM_ARGS=()
            VLLM_ARGS+=("--model" "$MODEL_PATH")
            VLLM_ARGS+=("--tensor-parallel-size" "$TENSOR_PARALLEL_SIZE")
            if [ "$DATA_PARALLEL_SIZE" -gt 1 ]; then
                VLLM_ARGS+=("--data-parallel-size" "$DATA_PARALLEL_SIZE")
            fi
            if [ "$MODE" = "prefill" ]; then
                VLLM_ARGS+=("--is-prefill-worker")
            elif [ "$MODE" = "decode" ]; then
                VLLM_ARGS+=("--is-decode-worker")
            fi
            VLLM_ARGS+=("${EXTRA_ARGS[@]}")

            exec env PYTHONHASHSEED=0 CUDA_VISIBLE_DEVICES="$GPU_DEVICES" python -m dynamo.vllm \
                "${VLLM_ARGS[@]}"
        fi
    } &
    PIDS+=("$!")
    echo "Started $MODE worker $i (PID: $!)"
done

echo "All workers started. Press Ctrl+C to stop."
# Block until every background worker has exited
wait
echo "All workers completed."