#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Submits a sweep of "dynamo_agg" benchmark jobs to Slurm. Each job runs
# benchmark_agg.slurm (resolved relative to the current directory) with one
# parallelism configuration and one group of concurrency values.
#
# Required environment:
#   MODEL_PATH         HuggingFace ID or local path of the model weights.
#   SERVED_MODEL_NAME  Model name forwarded to benchmark_agg.slurm.
# Optional environment:
#   IMAGE              Forwarded as the last positional argument when non-empty.
#   ISL                Default 8150; forwarded and used to size max_num_tokens.
#   OSL                Default 1024; forwarded unchanged.
#                      (ISL/OSL are presumably input/output sequence lengths —
#                      confirm against benchmark_agg.slurm.)
#   NTASKS_PER_NODE    Default 4; must evaluate to a positive integer.
#   SLURM_PARTITION, SLURM_ACCOUNT, SLURM_JOB_NAME
#                      Added as the matching sbatch option when non-empty.
#
# Exit status: 1 on invalid configuration, otherwise the status of the last
# sbatch call. A failed submission does not stop the remaining submissions.

if [[ -z "${MODEL_PATH:-}" ]]; then
  echo "ERROR: MODEL_PATH was not set." >&2
  echo "ERROR: MODEL_PATH must be set to either the HuggingFace ID or locally " \
    "downloaded path to the model weights. Since Deepseek R1 is large, it is " \
    "recommended to pre-download them to a shared location and provide the path." >&2
  exit 1
fi

if [[ -z "${SERVED_MODEL_NAME:-}" ]]; then
  echo "ERROR: SERVED_MODEL_NAME was not set." >&2
  exit 1
fi

IMAGE="${IMAGE:-""}"
ISL="${ISL:-8150}"
OSL="${OSL:-1024}"

# For GB200, we use 4 tasks per node.
NTASKS_PER_NODE="${NTASKS_PER_NODE:-4}"

# NTASKS_PER_NODE is a divisor in the node-count computation below; reject
# zero, negative and non-numeric values up front instead of failing with a
# division-by-zero error in the middle of the sweep.
if ! (( NTASKS_PER_NODE > 0 )); then
  echo "ERROR: NTASKS_PER_NODE must be a positive integer (got '${NTASKS_PER_NODE}')." >&2
  exit 1
fi

kind='dynamo_agg'

# Trailing positional arguments shared by every job. Kept in an array so that
# values containing whitespace or glob characters stay single arguments.
common_args=("${kind}" "${ISL}" "${OSL}" "${MODEL_PATH}" "${SERVED_MODEL_NAME}")
if [[ -n "${IMAGE}" ]]; then
  # An empty IMAGE is omitted entirely rather than passed as an empty argument.
  common_args+=("${IMAGE}")
fi

# Build the sbatch options step-by-step; optional ones are added only if set.
slurm_args=(--time=04:00:00)

# Add partition if set
if [[ -n "${SLURM_PARTITION:-}" ]]; then
  slurm_args+=("--partition=${SLURM_PARTITION}")
fi

# Add account if set
if [[ -n "${SLURM_ACCOUNT:-}" ]]; then
  slurm_args+=("--account=${SLURM_ACCOUNT}")
fi

# Add job name if set
if [[ -n "${SLURM_JOB_NAME:-}" ]]; then
  slurm_args+=("--job-name=${SLURM_JOB_NAME}")
fi

# Settings shared by every configuration in the sweep.
max_batch=1024
mtp=0

#######################################
# Submit one benchmark_agg.slurm job.
# Globals:
#   NTASKS_PER_NODE, ISL, max_batch, mtp, slurm_args, common_args (all read)
# Arguments:
#   $1 - tp_size; also used as ep_size and as the sbatch --ntasks value
#   $2 - enable_attention_dp ("true" or "false"), forwarded unchanged
#   $3 - space-separated concurrency values, forwarded as ONE argument
# Returns:
#   The exit status of sbatch.
#######################################
submit_job() {
  local tp_size=$1
  local enable_attention_dp=$2
  local concurrency_list=$3
  local ep_size=${tp_size}

  # Ceiling division: enough nodes to host tp_size tasks, and never zero even
  # when NTASKS_PER_NODE is larger than tp_size.
  local nodes_count=$(( (tp_size + NTASKS_PER_NODE - 1) / NTASKS_PER_NODE ))

  # (mtp+1)*max_batch + ISL + 128, rounded up to the next multiple of 64.
  local max_num_tokens=$(( ((mtp + 1) * max_batch + ISL + 128 + 63) / 64 * 64 ))

  sbatch \
    --nodes="${nodes_count}" \
    --ntasks="${tp_size}" \
    --ntasks-per-node="${NTASKS_PER_NODE}" \
    "${slurm_args[@]}" \
    benchmark_agg.slurm \
    "${tp_size}" "${ep_size}" "${max_batch}" "${max_num_tokens}" \
    "${enable_attention_dp}" "${concurrency_list}" "${mtp}" \
    "${common_args[@]}"
}

# tep4
submit_job 4 false "1 2 4 8 16 32 64 128 256 512 1024 2048"

# dep4
submit_job 4 true "32 64 128 256 512 1024"
submit_job 4 true "2048 4096"

# tep8
submit_job 8 false "1 2 4 8 16 32 64 128 256 512 1024 2048"

# dep8
submit_job 8 true "32 64 128 256 512 1024"
submit_job 8 true "2048 4096"

# dep8 concurrency greater than 4096 as a separate group
submit_job 8 true "6144 8192"