#!/usr/bin/env bash
#
# Launch train.py on a Slurm cluster through srun, one task per GPU.
#
# Usage:
#   <script> PARTITION JOB_NAME GPUS [train.py args...]
#
# Positional arguments:
#   PARTITION   Slurm partition passed to `srun -p`.
#   JOB_NAME    Value for `srun --job-name`.
#   GPUS        Total number of tasks, passed to `srun --ntasks`.
#   remaining   Forwarded to train.py after `--launcher slurm`.
#
# Environment variables (optional):
#   GPUS_PER_NODE   Used for --gres=gpu:N and --ntasks-per-node (default: 8).
#   CPUS_PER_TASK   Used for --cpus-per-task (default: 5).
#   SRUN_ARGS       Extra srun options as one whitespace-separated string
#                   (default: empty).

set -eu
set -x

if (( $# < 3 )); then
  printf 'Usage: %s PARTITION JOB_NAME GPUS [train.py args...]\n' \
    "${0##*/}" >&2
  exit 2
fi

PARTITION=$1
JOB_NAME=$2
GPUS=$3
# All remaining arguments joined into one space-separated string. It is
# re-split on whitespace when expanded below, so callers may pass the
# train.py options either as separate words or as one quoted string.
PY_ARGS=${*:4}

GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""}

# Pick a port in the range 10000-19999 to hand to train.py via --tcp_port.
PORT=$(( (RANDOM % 10000) + 10000 ))

# SRUN_ARGS and PY_ARGS are deliberately left unquoted: each holds several
# options in a single string and must undergo word splitting.
# shellcheck disable=SC2086
srun -p "${PARTITION}" \
  --job-name="${JOB_NAME}" \
  --gres="gpu:${GPUS_PER_NODE}" \
  --ntasks="${GPUS}" \
  --ntasks-per-node="${GPUS_PER_NODE}" \
  --cpus-per-task="${CPUS_PER_TASK}" \
  --kill-on-bad-exit=1 \
  ${SRUN_ARGS} \
  python -u train.py --launcher slurm ${PY_ARGS} --tcp_port "${PORT}"