#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Submits disaggregated benchmark jobs to Slurm via `sbatch benchmark.slurm`.
#
# Required environment:
#   MODEL_PATH          HuggingFace ID or local path of the model weights.
#   SERVED_MODEL_NAME   Model name forwarded to benchmark.slurm.
# Optional environment:
#   IMAGE               Forwarded to benchmark.slurm (default: empty string).
#   NTASKS_PER_NODE     Slurm tasks per node (default: 4).
#   ISL / OSL           Forwarded to benchmark.slurm (defaults: 8150 / 1024).
#   SLURM_PARTITION, SLURM_ACCOUNT, SLURM_JOB_NAME
#                       Added to the sbatch command line when non-empty.

if [[ -z "${MODEL_PATH:-}" ]]; then
  {
    echo "ERROR: MODEL_PATH was not set."
    echo "ERROR: MODEL_PATH must be set to either the HuggingFace ID or locally " \
      "downloaded path to the model weights. Since Deepseek R1 is large, it is " \
      "recommended to pre-download them to a shared location and provide the path."
  } >&2
  exit 1
fi

if [[ -z "${SERVED_MODEL_NAME:-}" ]]; then
  echo "ERROR: SERVED_MODEL_NAME was not set." >&2
  exit 1
fi

IMAGE="${IMAGE:-""}"

# For GB200, we use 4 tasks per node.
NTASKS_PER_NODE="${NTASKS_PER_NODE:-4}"

ISL="${ISL:-8150}"
OSL="${OSL:-1024}"

# Extra sbatch options. Kept as an array so that values containing whitespace
# (e.g. a job name) stay a single argument.
slurm_args=("--time=04:00:00")

# Add partition if set
if [[ -n "${SLURM_PARTITION:-}" ]]; then
  slurm_args+=("--partition=${SLURM_PARTITION}")
fi

# Add account if set
if [[ -n "${SLURM_ACCOUNT:-}" ]]; then
  slurm_args+=("--account=${SLURM_ACCOUNT}")
fi

# Add job name if set (otherwise sbatch picks its own default)
if [[ -n "${SLURM_JOB_NAME:-}" ]]; then
  slurm_args+=("--job-name=${SLURM_JOB_NAME}")
fi

#######################################
# Print usage instructions to stderr and exit with status 1.
# The positional order shown here matches what main() parses.
#######################################
usage() {
  cat >&2 <<EOF
Usage: $0 <mtp=off|mtp=on> <mode> [ctx_num] [gen_num] [gen_tp_size] [gen_batch_size] [gen_max_num_tokens] [gen_gpu_memory_fraction] [gen_mtp_size] [gen_eplb_num_slots] [gen_concurrency_list]

MTP Modes:
 mtp=off - Run without Multi-Token Prediction (gen_mtp_size=0)
 mtp=on - Run with Multi-Token Prediction (gen_mtp_size=1,2,3)

Execution Modes:
 all - Run all predefined GPU configurations (4, 8, 16, 32 GPUs)
 tep - Run Tensor-Expert Parallel mode (attention_dp=false)
 dep - Run Data-Expert Parallel mode (attention_dp=true)
 4GPU, 8GPU, 16GPU, 32GPU - Run specific GPU configurations

Parameters for tep/dep modes:
 ctx_num: Number of context nodes
 gen_num: Number of generation nodes
 gen_tp_size: Generation tensor parallel size
 gen_batch_size: Generation batch size
 gen_max_num_tokens: Generation max number of tokens
 gen_gpu_memory_fraction: GPU memory fraction (0.7-0.95)
 gen_mtp_size: Multi-Token Prediction size (0 for mtp=off, 1-3 for mtp=on)
 gen_eplb_num_slots: Expert load balancing slots (0, 256, 288)
 gen_concurrency_list: Concurrency values (space-separated, quoted)

Examples:
 $0 mtp=off all # Run all MTP0 predefined combinations
 $0 mtp=on all # Run all MTP predefined combinations
 $0 mtp=off tep 1 3 4 128 128 0.9 0 0 "1 2 4 8" # Run MTP0 TEP with specific config
 $0 mtp=on dep 2 3 8 256 256 0.8 0 256 "256 512 1024" # Run MTP DEP with specific config
EOF
  exit 1
}

#######################################
# Submit one benchmark job with sbatch.
# Globals (read): NTASKS_PER_NODE, slurm_args, MODEL_PATH, SERVED_MODEL_NAME,
#                 IMAGE, ISL, OSL
# Arguments:
#   $1  ctx_num                  $6  gen_enable_attention_dp (true|false)
#   $2  gen_num                  $7  gen_gpu_memory_fraction
#   $3  gen_tp_size              $8  gen_mtp_size
#   $4  gen_batch_size           $9  gen_eplb_num_slots
#   $5  gen_max_num_tokens       $10 gen_concurrency_list (one quoted string)
# Note: benchmark.slurm receives gen_eplb_num_slots BEFORE gen_mtp_size,
# i.e. the reverse of this function's own argument order.
#######################################
run_single() {
  local ctx_num=$1
  local gen_num=$2
  local gen_tp_size=$3
  local gen_batch_size=$4
  local gen_max_num_tokens=$5
  local gen_enable_attention_dp=$6
  local gen_gpu_memory_fraction=$7
  local gen_mtp_size=$8
  local gen_eplb_num_slots=$9
  local gen_concurrency_list=${10}

  # TODO: expose kind to the command line
  local kind="dynamo_disagg"

  local gen_nodes total_nodes total_tasks
  # ceil(gen_tp_size / 4) nodes per generation instance; the divisor 4 looks
  # like GPUs per node (GB200) - confirm before using other hardware.
  gen_nodes=$(( (gen_tp_size + 3) / 4 * gen_num ))
  total_nodes=$(( ctx_num + gen_nodes ))
  # Derive from NTASKS_PER_NODE so --ntasks agrees with --ntasks-per-node.
  total_tasks=$(( total_nodes * NTASKS_PER_NODE ))

  local -a sbatch_cmd=(
    sbatch
    "--nodes=${total_nodes}"
    "--ntasks=${total_tasks}"
    "--ntasks-per-node=${NTASKS_PER_NODE}"
    "--segment=${total_nodes}"
    "${slurm_args[@]}"
    benchmark.slurm
    # The literals 4 1 8448 true are fixed settings passed right after
    # ctx_num - presumably context-side; confirm against benchmark.slurm.
    "${ctx_num}" 4 1 8448 true
    "${gen_num}" "${gen_tp_size}" "${gen_batch_size}" "${gen_max_num_tokens}"
    "${gen_enable_attention_dp}" "${gen_gpu_memory_fraction}"
    "${gen_eplb_num_slots}" "${gen_mtp_size}"
    "${gen_concurrency_list}"
    "${gen_nodes}" "${kind}"
    # Quoted so an empty IMAGE stays in its positional slot instead of
    # vanishing and shifting ISL/OSL.
    "${MODEL_PATH}" "${SERVED_MODEL_NAME}" "${IMAGE}" "${ISL}" "${OSL}"
  )

  set -x
  "${sbatch_cmd[@]}"
  set +x
}

# ---------------------------------------------------------------------------
# Predefined sweeps. Each row is a run_single call:
#   ctx_num gen_num gen_tp_size gen_batch_size gen_max_num_tokens
#   attention_dp gpu_memory_fraction mtp_size eplb_num_slots concurrency_list
# Trailing "# N" comments on the 16/32-GPU rows equal the total node count
# run_single computes for that row (ctx_num + gen nodes).
# ---------------------------------------------------------------------------

# MTP0 Configuration (gen_mtp_size=0)
run_4_gpus_mtp0() {
  echo "Running 4 GPUs MTP0 combinations..."
  run_single 1 4 4 16 16 false "0.9" 0 0 "1 2 4 8 16 24"
  run_single 1 3 4 32 32 false "0.9" 0 0 "32 48"
  run_single 1 2 4 64 64 false "0.9" 0 0 "64 96"
  run_single 2 3 4 128 128 false "0.9" 0 0 "128 192"
  run_single 3 2 4 64 64 true "0.8" 0 0 "256 384"
  run_single 2 1 4 128 128 true "0.8" 0 0 "512 768"
}

run_8_gpus_mtp0() {
  echo "Running 8 GPUs MTP0 combinations..."
  run_single 1 4 8 16 16 false "0.9" 0 0 "1 2 4 8 16 24"
  run_single 1 2 8 32 32 false "0.9" 0 0 "32 48"
  run_single 2 3 8 64 64 false "0.9" 0 0 "64 96"
  run_single 1 1 8 128 128 false "0.9" 0 0 "128 192"
  run_single 3 2 8 32 32 true "0.8" 0 0 "256 384"
  run_single 3 1 8 64 64 true "0.8" 0 0 "512 768"
  run_single 4 1 8 128 128 true "0.8" 0 0 "1024 1536"
  run_single 6 1 8 256 256 true "0.8" 0 0 "2048 3072"
}

run_16_gpus_mtp0() {
  echo "Running 16 GPUs MTP0 combinations..."
  run_single 1 1 16 8 8 true "0.8" 0 0 "16 32 64 128 192" # 5
  run_single 2 1 16 16 16 true "0.8" 0 0 "256 384" # 6
  run_single 4 1 16 32 32 true "0.8" 0 0 "512 768" # 8
  run_single 6 1 16 64 64 true "0.8" 0 0 "1024 1536" # 10
  run_single 9 1 16 128 128 true "0.8" 0 0 "2048 3072" # 13
  run_single 12 1 16 256 256 true "0.8" 0 288 "4096 6144" # 16
}

run_32_gpus_mtp0() {
  echo "Running 32 GPUs MTP0 combinations..."
  run_single 1 1 32 4 4 true "0.7" 0 0 "32 64 128 192" # 9
  run_single 2 1 32 8 8 true "0.7" 0 0 "256 384" # 10
  run_single 4 1 32 16 16 true "0.7" 0 0 "512 768" # 12
  run_single 7 1 32 32 32 true "0.7" 0 0 "1024 1536" # 15
}

# MTP Configuration (gen_mtp_size=1,2,3)
run_4_gpus_mtp() {
  echo "Running 4 GPUs MTP combinations..."
  run_single 1 4 4 8 32 false "0.9" 3 0 "1 2 4 8 12"
  run_single 1 3 4 16 64 false "0.9" 3 0 "16 24"
  run_single 1 2 4 32 128 false "0.9" 3 0 "32 48"
  run_single 2 3 4 16 64 true "0.8" 3 0 "64 96"
  run_single 1 1 4 32 128 true "0.8" 3 0 "128 192"
  run_single 2 1 4 64 256 true "0.8" 2 0 "256 384"
  run_single 5 2 4 128 512 true "0.8" 1 0 "512 768"
}

run_8_gpus_mtp() {
  echo "Running 8 GPUs MTP combinations..."
  run_single 1 4 8 8 32 false "0.9" 3 0 "1 2 4 8 12"
  run_single 1 2 8 16 64 false "0.9" 3 0 "16 24"
  run_single 1 1 8 32 128 false "0.9" 3 0 "32 48"
  run_single 1 1 8 8 32 true "0.8" 3 0 "64 96"
  run_single 3 2 8 16 64 true "0.8" 3 0 "128 192"
  run_single 5 2 8 32 128 true "0.8" 3 0 "256 384"
  run_single 4 1 8 64 256 true "0.8" 2 0 "512 768"
  run_single 6 1 8 128 256 true "0.8" 1 0 "1024 1536"
  run_single 7 1 8 256 512 true "0.8" 1 0 "2048 3072"
}

run_16_gpus_mtp() {
  echo "Running 16 GPUs MTP combinations..."
  run_single 1 1 16 4 16 true "0.8" 3 0 "16 32 64 96" # 5
  run_single 2 1 16 8 32 true "0.8" 3 0 "128 192" # 6
  run_single 4 1 16 16 64 true "0.8" 3 0 "256 384" # 8
  run_single 6 1 16 32 128 true "0.8" 3 0 "512 768" # 10
  run_single 9 1 16 64 256 true "0.8" 2 256 "1024 1536" # 13
  run_single 11 1 16 128 256 true "0.8" 1 288 "2048 3072" # 15
}

run_32_gpus_mtp() {
  echo "Running 32 GPUs MTP combinations..."
  run_single 1 1 32 16 64 true "0.7" 3 0 "32 48" # 9
  run_single 2 1 32 16 64 true "0.7" 3 0 "64 96" # 10
  run_single 3 1 32 4 16 true "0.7" 3 0 "128 192" # 11
  run_single 5 1 32 8 32 true "0.7" 3 0 "256 384" # 13
  run_single 8 1 32 16 64 true "0.7" 3 288 "512 768" # 16
}

#######################################
# Shared implementation of the "tep" and "dep" modes: run one user-specified
# configuration.
# Arguments:
#   $1    label used in messages (TEP or DEP)
#   $2    attention-dp flag handed to run_single (false for TEP, true for DEP)
#   $3..  the script's full argument list; exactly 11 entries are required
#######################################
_run_custom() {
  local label=$1
  local attention_dp=$2
  shift 2

  if [[ $# -ne 11 ]]; then
    echo "Error: ${label} mode requires 11 additional parameters (including mtp_mode)" >&2
    usage
  fi

  # $2 is the mode keyword itself and is not needed here.
  local mtp_mode=$1
  local ctx_num=$3
  local gen_num=$4
  local gen_tp_size=$5
  local gen_batch_size=$6
  local gen_max_num_tokens=$7
  local gen_gpu_memory_fraction=$8
  local gen_mtp_size=$9
  local gen_eplb_num_slots=${10}
  local gen_concurrency_list=${11}

  echo "Running ${label} mode ($mtp_mode) with ctx_num=$ctx_num, gen_num=$gen_num, gen_tp_size=$gen_tp_size, gen_batch_size=$gen_batch_size, gen_max_num_tokens=$gen_max_num_tokens, gen_gpu_memory_fraction=$gen_gpu_memory_fraction, gen_mtp_size=$gen_mtp_size, gen_eplb_num_slots=$gen_eplb_num_slots, gen_concurrency_list=\"$gen_concurrency_list\""

  run_single "$ctx_num" "$gen_num" "$gen_tp_size" "$gen_batch_size" \
    "$gen_max_num_tokens" "$attention_dp" "$gen_gpu_memory_fraction" \
    "$gen_mtp_size" "$gen_eplb_num_slots" "$gen_concurrency_list"
}

#######################################
# Entry point: validate the MTP mode, then dispatch on the execution mode.
# Arguments:
#   $1    mtp_mode: "mtp=off" or "mtp=on"
#   $2    mode: all | 4GPU | 8GPU | 16GPU | 32GPU | tep | dep
#   $3..  extra parameters, used only by tep/dep
#######################################
main() {
  local mtp_mode=$1
  local mode=$2
  local gpus

  # Validate MTP mode
  if [[ "$mtp_mode" != "mtp=off" && "$mtp_mode" != "mtp=on" ]]; then
    echo "Error: Invalid MTP mode '$mtp_mode'. Must be 'mtp=off' or 'mtp=on'" >&2
    usage
  fi

  # Suffix of the predefined sweep functions: run_<N>_gpus_<suffix>.
  local suffix="mtp"
  if [[ "$mtp_mode" == "mtp=off" ]]; then
    suffix="mtp0"
  fi

  case "$mode" in
    all)
      echo "Running all GPU configurations for $mtp_mode mode..."
      for gpus in 4 8 16 32; do
        "run_${gpus}_gpus_${suffix}"
      done
      ;;
    4GPU | 8GPU | 16GPU | 32GPU)
      gpus=${mode%GPU}
      echo "Running ${gpus} GPUs combinations for $mtp_mode mode..."
      "run_${gpus}_gpus_${suffix}"
      ;;
    tep)
      # TEP mode: Use false to disable attention dp
      _run_custom "TEP" false "$@"
      ;;
    dep)
      # DEP mode: Use true to enable attention dp
      _run_custom "DEP" true "$@"
      ;;
    *)
      echo "Error: Unknown mode '$mode'" >&2
      usage
      ;;
  esac
}

# Check parameters
if [[ $# -eq 0 ]]; then
  usage
fi

# Run main function
main "$@"