submit.sh 11.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Required environment: MODEL_PATH and SERVED_MODEL_NAME must both be set and
# non-empty, otherwise the script stops with status 1 before submitting anything.
# Diagnostics are written to stderr so they never mix with captured stdout.
if [[ -z "${MODEL_PATH:-}" ]]; then
    {
        echo "ERROR: MODEL_PATH was not set."
        # echo joins its arguments with a single space, so the pieces must not
        # carry their own trailing blanks.
        echo "ERROR: MODEL_PATH must be set to either the HuggingFace ID or locally" \
             "downloaded path to the model weights. Since Deepseek R1 is large, it is" \
             "recommended to pre-download them to a shared location and provide the path."
    } >&2
    exit 1
fi

if [[ -z "${SERVED_MODEL_NAME:-}" ]]; then
    echo "ERROR: SERVED_MODEL_NAME was not set." >&2
    exit 1
fi

# Optional settings; each keeps a non-empty value from the environment and
# otherwise falls back to the default shown here.

# IMAGE stays empty unless the caller provides one.
IMAGE="${IMAGE:-}"

# For GB200, we use 4 tasks per node.
: "${NTASKS_PER_NODE:=4}"

# Sequence lengths forwarded to benchmark.slurm.
: "${ISL:=8150}"
: "${OSL:=1024}"

# Extra sbatch options, kept as one space-separated string (it is expanded
# unquoted at the sbatch call site). The time limit is always present; the
# partition, account and job name are appended only when the corresponding
# SLURM_* variable is non-empty. No default is applied for any of them.
slurm_args="--time=04:00:00"
slurm_args+="${SLURM_PARTITION:+ --partition=${SLURM_PARTITION}}"
slurm_args+="${SLURM_ACCOUNT:+ --account=${SLURM_ACCOUNT}}"
slurm_args+="${SLURM_JOB_NAME:+ --job-name=${SLURM_JOB_NAME}}"

# Print the command-line help and terminate the script.
#
# Outputs: help text on stderr (every call site in this script is an error path).
# Exits:   always, with status 1.
#
# The positional order shown in the synopsis matches what main() reads for the
# tep/dep modes: ... gen_gpu_memory_fraction gen_mtp_size gen_eplb_num_slots
# gen_concurrency_list.
usage() {
    {
        echo "Usage: $0 <mtp_mode> <mode> [ctx_num] [gen_num] [gen_tp_size] [gen_batch_size] [gen_max_num_tokens] [gen_gpu_memory_fraction] [gen_mtp_size] [gen_eplb_num_slots] [gen_concurrency_list]"
        echo ""
        echo "MTP Modes:"
        echo "  mtp=off - Run without Multi-Token Prediction (gen_mtp_size=0)"
        echo "  mtp=on  - Run with Multi-Token Prediction (gen_mtp_size=1,2,3)"
        echo ""
        echo "Execution Modes:"
        echo "  all - Run all predefined GPU configurations (4, 8, 16, 32 GPUs)"
        echo "  tep - Run Tensor-Expert Parallel mode (attention_dp=false)"
        echo "  dep - Run Data-Expert Parallel mode (attention_dp=true)"
        echo "  4GPU, 8GPU, 16GPU, 32GPU - Run specific GPU configurations"
        echo ""
        echo "Parameters for tep/dep modes:"
        echo "  ctx_num: Number of context nodes"
        echo "  gen_num: Number of generation instances (each spans ceil(gen_tp_size/4) nodes)"
        echo "  gen_tp_size: Generation tensor parallel size"
        echo "  gen_batch_size: Generation batch size"
        echo "  gen_max_num_tokens: Generation max number of tokens"
        echo "  gen_gpu_memory_fraction: GPU memory fraction (0.7-0.95)"
        echo "  gen_mtp_size: Multi-Token Prediction size (0 for mtp=off, 1-3 for mtp=on)"
        echo "  gen_eplb_num_slots: Expert load balancing slots (0, 256, 288)"
        echo "  gen_concurrency_list: Concurrency values (space-separated, quoted)"
        echo ""
        echo "Examples:"
        echo "  $0 mtp=off all                                    # Run all MTP0 predefined combinations"
        echo "  $0 mtp=on all                                     # Run all MTP predefined combinations"
        echo "  $0 mtp=off tep 1 3 4 128 128 0.9 0 0 \"1 2 4 8\" # Run MTP0 TEP with specific config"
        echo "  $0 mtp=on dep 2 3 8 256 256 0.8 1 256 \"256 512 1024\" # Run MTP DEP with specific config"
    } >&2
    exit 1
}

# Submit one benchmark configuration to Slurm via sbatch.
#
# Globals (read): NTASKS_PER_NODE, slurm_args, MODEL_PATH, SERVED_MODEL_NAME,
#                 IMAGE, ISL, OSL
# Arguments:
#   $1  ctx_num                  context count; contributes one node each
#   $2  gen_num                  generation count; each takes ceil(tp/4) nodes
#   $3  gen_tp_size              generation tensor-parallel size
#   $4  gen_batch_size
#   $5  gen_max_num_tokens
#   $6  gen_enable_attention_dp  true | false
#   $7  gen_gpu_memory_fraction
#   $8  gen_mtp_size
#   $9  gen_eplb_num_slots
#   $10 gen_concurrency_list     space-separated list, passed on as ONE argument
# Outputs: the sbatch command line is traced on stderr (set -x).
# Returns: the exit status of sbatch.
run_single() {
    local ctx_num=$1
    local gen_num=$2
    local gen_tp_size=$3
    local gen_batch_size=$4
    local gen_max_num_tokens=$5
    local gen_enable_attention_dp=$6
    local gen_gpu_memory_fraction=$7
    local gen_mtp_size=$8
    local gen_eplb_num_slots=$9
    local gen_concurrency_list=${10}

    # TODO: expose kind to the command line
    local kind="dynamo_disagg"

    # --ntasks must agree with --ntasks-per-node, so both derive from the same
    # value instead of one of them hard-coding 4.
    local tasks_per_node="${NTASKS_PER_NODE:-4}"

    # Keep the intermediate results local so repeated calls do not leak state.
    local gen_nodes total_nodes total_tasks
    # Ceiling division by 4 (presumably GPUs per node — confirm for non-GB200).
    gen_nodes=$(( (gen_tp_size + 3) / 4 * gen_num ))
    total_nodes=$(( ctx_num + gen_nodes ))
    total_tasks=$(( total_nodes * tasks_per_node ))

    local rc=0
    set -x
    # Every positional argument is quoted so that an empty value (IMAGE defaults
    # to "") or a path containing spaces keeps its slot; benchmark.slurm receives
    # them by position. The fixed "4 1 8448 true" after ctx_num are presumably
    # the context-side settings — confirm against benchmark.slurm.
    # shellcheck disable=SC2086  # slurm_args is a space-separated option string
    sbatch \
        --nodes="${total_nodes}" \
        --ntasks="${total_tasks}" \
        --ntasks-per-node="${tasks_per_node}" \
        --segment="${total_nodes}" \
        ${slurm_args} \
        benchmark.slurm \
        "${ctx_num}" 4 1 8448 true \
        "${gen_num}" "${gen_tp_size}" "${gen_batch_size}" "${gen_max_num_tokens}" \
        "${gen_enable_attention_dp}" "${gen_gpu_memory_fraction}" \
        "${gen_eplb_num_slots}" "${gen_mtp_size}" "${gen_concurrency_list}" \
        "${gen_nodes}" "${kind}" "${MODEL_PATH}" "${SERVED_MODEL_NAME}" \
        "${IMAGE}" "${ISL}" "${OSL}" || rc=$?
    set +x

    # Without this the function would report the status of "set +x" (always 0).
    return "${rc}"
}

# MTP0 Configuration (gen_mtp_size=0)
# Submit the predefined sweep for generation tensor-parallel size 4 with MTP
# disabled (gen_mtp_size=0).
#
# run_single argument order:
#   ctx_num gen_num gen_tp_size gen_batch_size gen_max_num_tokens
#   attention_dp gpu_memory_fraction gen_mtp_size gen_eplb_num_slots concurrency_list
run_4_gpus_mtp0() {
    echo "Running 4 GPUs MTP0 combinations..."
    # The concurrency list carries no trailing blank, matching every other preset.
    run_single 1 4 4 16 16 false "0.9" 0 0 "1 2 4 8 16 24"
    run_single 1 3 4 32 32 false "0.9" 0 0 "32 48"
    run_single 1 2 4 64 64 false "0.9" 0 0 "64 96"
    run_single 2 3 4 128 128 false "0.9" 0 0 "128 192"
    run_single 3 2 4 64 64 true "0.8" 0 0 "256 384"
    run_single 2 1 4 128 128 true "0.8" 0 0 "512 768"
}

# Submit the predefined sweep for generation tensor-parallel size 8 with MTP
# disabled (gen_mtp_size=0).
run_8_gpus_mtp0() {
    echo "Running 8 GPUs MTP0 combinations..."

    # One sweep point per row: "<scalar run_single args>|<concurrency list>".
    # Scalars: ctx_num gen_num gen_tp_size gen_batch_size gen_max_num_tokens
    #          attention_dp gpu_memory_fraction gen_mtp_size gen_eplb_num_slots
    local -a sweep=(
        "1 4 8 16 16 false 0.9 0 0|1 2 4 8 16 24"
        "1 2 8 32 32 false 0.9 0 0|32 48"
        "2 3 8 64 64 false 0.9 0 0|64 96"
        "1 1 8 128 128 false 0.9 0 0|128 192"
        "3 2 8 32 32 true 0.8 0 0|256 384"
        "3 1 8 64 64 true 0.8 0 0|512 768"
        "4 1 8 128 128 true 0.8 0 0|1024 1536"
        "6 1 8 256 256 true 0.8 0 0|2048 3072"
    )

    local point
    for point in "${sweep[@]}"; do
        # The scalar part is split into nine words on purpose; the concurrency
        # list stays a single quoted argument.
        # shellcheck disable=SC2086
        run_single ${point%%|*} "${point#*|}"
    done
}

# Submit the predefined sweep for generation tensor-parallel size 16 with MTP
# disabled (gen_mtp_size=0).
run_16_gpus_mtp0() {
    echo "Running 16 GPUs MTP0 combinations..."

    # One sweep point per row: "<scalar run_single args>|<concurrency list>".
    # Scalars: ctx_num gen_num gen_tp_size gen_batch_size gen_max_num_tokens
    #          attention_dp gpu_memory_fraction gen_mtp_size gen_eplb_num_slots
    # The trailing number is the node total (ctx_num + 4 generation nodes).
    local -a sweep=(
        "1 1 16 8 8 true 0.8 0 0|16 32 64 128 192"   # 5
        "2 1 16 16 16 true 0.8 0 0|256 384"          # 6
        "4 1 16 32 32 true 0.8 0 0|512 768"          # 8
        "6 1 16 64 64 true 0.8 0 0|1024 1536"        # 10
        "9 1 16 128 128 true 0.8 0 0|2048 3072"      # 13
        "12 1 16 256 256 true 0.8 0 288|4096 6144"   # 16
    )

    local point
    for point in "${sweep[@]}"; do
        # Split the scalars into separate words; keep the list as one argument.
        # shellcheck disable=SC2086
        run_single ${point%%|*} "${point#*|}"
    done
}

# Submit the predefined sweep for generation tensor-parallel size 32 with MTP
# disabled (gen_mtp_size=0).
run_32_gpus_mtp0() {
    echo "Running 32 GPUs MTP0 combinations..."

    # One sweep point per row: "<scalar run_single args>|<concurrency list>".
    # Scalars: ctx_num gen_num gen_tp_size gen_batch_size gen_max_num_tokens
    #          attention_dp gpu_memory_fraction gen_mtp_size gen_eplb_num_slots
    # The trailing number is the node total (ctx_num + 8 generation nodes).
    local -a sweep=(
        "1 1 32 4 4 true 0.7 0 0|32 64 128 192"   # 9
        "2 1 32 8 8 true 0.7 0 0|256 384"         # 10
        "4 1 32 16 16 true 0.7 0 0|512 768"       # 12
        "7 1 32 32 32 true 0.7 0 0|1024 1536"     # 15
    )

    local point
    for point in "${sweep[@]}"; do
        # Split the scalars into separate words; keep the list as one argument.
        # shellcheck disable=SC2086
        run_single ${point%%|*} "${point#*|}"
    done
}

# MTP Configuration (gen_mtp_size=1,2,3)
# Submit the predefined sweep for generation tensor-parallel size 4 with MTP
# enabled (gen_mtp_size between 1 and 3 depending on the point).
run_4_gpus_mtp() {
    echo "Running 4 GPUs MTP combinations..."

    # One sweep point per row: "<scalar run_single args>|<concurrency list>".
    # Scalars: ctx_num gen_num gen_tp_size gen_batch_size gen_max_num_tokens
    #          attention_dp gpu_memory_fraction gen_mtp_size gen_eplb_num_slots
    local -a sweep=(
        "1 4 4 8 32 false 0.9 3 0|1 2 4 8 12"
        "1 3 4 16 64 false 0.9 3 0|16 24"
        "1 2 4 32 128 false 0.9 3 0|32 48"
        "2 3 4 16 64 true 0.8 3 0|64 96"
        "1 1 4 32 128 true 0.8 3 0|128 192"
        "2 1 4 64 256 true 0.8 2 0|256 384"
        "5 2 4 128 512 true 0.8 1 0|512 768"
    )

    local point
    for point in "${sweep[@]}"; do
        # Split the scalars into separate words; keep the list as one argument.
        # shellcheck disable=SC2086
        run_single ${point%%|*} "${point#*|}"
    done
}

# Submit the predefined sweep for generation tensor-parallel size 8 with MTP
# enabled (gen_mtp_size between 1 and 3 depending on the point).
run_8_gpus_mtp() {
    echo "Running 8 GPUs MTP combinations..."

    # One sweep point per row: "<scalar run_single args>|<concurrency list>".
    # Scalars: ctx_num gen_num gen_tp_size gen_batch_size gen_max_num_tokens
    #          attention_dp gpu_memory_fraction gen_mtp_size gen_eplb_num_slots
    local -a sweep=(
        "1 4 8 8 32 false 0.9 3 0|1 2 4 8 12"
        "1 2 8 16 64 false 0.9 3 0|16 24"
        "1 1 8 32 128 false 0.9 3 0|32 48"
        "1 1 8 8 32 true 0.8 3 0|64 96"
        "3 2 8 16 64 true 0.8 3 0|128 192"
        "5 2 8 32 128 true 0.8 3 0|256 384"
        "4 1 8 64 256 true 0.8 2 0|512 768"
        "6 1 8 128 256 true 0.8 1 0|1024 1536"
        "7 1 8 256 512 true 0.8 1 0|2048 3072"
    )

    local point
    for point in "${sweep[@]}"; do
        # Split the scalars into separate words; keep the list as one argument.
        # shellcheck disable=SC2086
        run_single ${point%%|*} "${point#*|}"
    done
}

# Submit the predefined sweep for generation tensor-parallel size 16 with MTP
# enabled (gen_mtp_size between 1 and 3 depending on the point).
run_16_gpus_mtp() {
    echo "Running 16 GPUs MTP combinations..."

    # One sweep point per row: "<scalar run_single args>|<concurrency list>".
    # Scalars: ctx_num gen_num gen_tp_size gen_batch_size gen_max_num_tokens
    #          attention_dp gpu_memory_fraction gen_mtp_size gen_eplb_num_slots
    # The trailing number is the node total (ctx_num + 4 generation nodes).
    local -a sweep=(
        "1 1 16 4 16 true 0.8 3 0|16 32 64 96"        # 5
        "2 1 16 8 32 true 0.8 3 0|128 192"            # 6
        "4 1 16 16 64 true 0.8 3 0|256 384"           # 8
        "6 1 16 32 128 true 0.8 3 0|512 768"          # 10
        "9 1 16 64 256 true 0.8 2 256|1024 1536"      # 13
        "11 1 16 128 256 true 0.8 1 288|2048 3072"    # 15
    )

    local point
    for point in "${sweep[@]}"; do
        # Split the scalars into separate words; keep the list as one argument.
        # shellcheck disable=SC2086
        run_single ${point%%|*} "${point#*|}"
    done
}

# Submit the predefined sweep for generation tensor-parallel size 32 with MTP
# enabled (gen_mtp_size=3 for every point).
run_32_gpus_mtp() {
    echo "Running 32 GPUs MTP combinations..."

    # One sweep point per row: "<scalar run_single args>|<concurrency list>".
    # Scalars: ctx_num gen_num gen_tp_size gen_batch_size gen_max_num_tokens
    #          attention_dp gpu_memory_fraction gen_mtp_size gen_eplb_num_slots
    # The trailing number is the node total (ctx_num + 8 generation nodes).
    local -a sweep=(
        "1 1 32 16 64 true 0.7 3 0|32 48"       # 9
        "2 1 32 16 64 true 0.7 3 0|64 96"       # 10
        "3 1 32 4 16 true 0.7 3 0|128 192"      # 11
        "5 1 32 8 32 true 0.7 3 0|256 384"      # 13
        "8 1 32 16 64 true 0.7 3 288|512 768"   # 16
    )

    local point
    for point in "${sweep[@]}"; do
        # Split the scalars into separate words; keep the list as one argument.
        # shellcheck disable=SC2086
        run_single ${point%%|*} "${point#*|}"
    done
}

# Validate the MTP mode and dispatch on the execution mode.
#
# Arguments:
#   $1      mtp_mode  "mtp=off" or "mtp=on"
#   $2      mode      all | 4GPU | 8GPU | 16GPU | 32GPU | tep | dep
#   $3-$11  tep/dep only, in this order: ctx_num gen_num gen_tp_size
#           gen_batch_size gen_max_num_tokens gen_gpu_memory_fraction
#           gen_mtp_size gen_eplb_num_slots gen_concurrency_list
# Outputs: progress messages on stdout, error messages on stderr.
# Exits through usage() on an invalid mtp_mode, an unknown mode, or a wrong
# argument count for tep/dep.
main() {
    local mtp_mode=$1
    local mode=$2

    # Validate MTP mode
    if [[ "$mtp_mode" != "mtp=off" && "$mtp_mode" != "mtp=on" ]]; then
        echo "Error: Invalid MTP mode '$mtp_mode'. Must be 'mtp=off' or 'mtp=on'" >&2
        usage
    fi

    # Predefined sweeps are named run_<N>_gpus_mtp0 (MTP off) and
    # run_<N>_gpus_mtp (MTP on); pick the suffix once.
    local preset_suffix="mtp"
    if [[ "$mtp_mode" == "mtp=off" ]]; then
        preset_suffix="mtp0"
    fi

    local gpus
    case "$mode" in
        all)
            echo "Running all GPU configurations for $mtp_mode mode..."
            for gpus in 4 8 16 32; do
                "run_${gpus}_gpus_${preset_suffix}"
            done
            ;;
        4GPU | 8GPU | 16GPU | 32GPU)
            gpus=${mode%GPU}
            echo "Running ${gpus} GPUs combinations for $mtp_mode mode..."
            "run_${gpus}_gpus_${preset_suffix}"
            ;;
        tep | dep)
            # The two modes differ only in the attention-dp flag given to
            # run_single: TEP disables it, DEP enables it.
            local label="TEP"
            local attention_dp="false"
            if [[ "$mode" == "dep" ]]; then
                label="DEP"
                attention_dp="true"
            fi

            if (( $# != 11 )); then
                echo "Error: ${label} mode requires 11 parameters in total (mtp_mode, mode and 9 configuration values), got $#" >&2
                usage
            fi

            local ctx_num=$3
            local gen_num=$4
            local gen_tp_size=$5
            local gen_batch_size=$6
            local gen_max_num_tokens=$7
            local gen_gpu_memory_fraction=$8
            local gen_mtp_size=$9
            local gen_eplb_num_slots=${10}
            local gen_concurrency_list=${11}

            echo "Running ${label} mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""

            # Quote every value so an empty or oddly shaped argument cannot
            # shift the positions run_single relies on.
            run_single "$ctx_num" "$gen_num" "$gen_tp_size" "$gen_batch_size" \
                "$gen_max_num_tokens" "$attention_dp" "$gen_gpu_memory_fraction" \
                "$gen_mtp_size" "$gen_eplb_num_slots" "$gen_concurrency_list"
            ;;
        *)
            echo "Error: Unknown mode '$mode'" >&2
            usage
            ;;
    esac
}

# Script entry: with no arguments there is nothing to dispatch, so show the
# help text (usage terminates the script); otherwise pass every argument,
# unmodified, to main.
case $# in
    0) usage ;;
esac

main "$@"