#!/bin/bash
# Slurm batch script: launches single_process.sh on every allocated node
# through mpirun, one MPI rank per DCU.
#
# Usage: sbatch <this script>
#
# NOTE: the logs/ directory named in the -o/-e directives below must exist
# before submission; this script cannot create it, because Slurm opens those
# files before the script body starts running.
#SBATCH -p wzhdexclu03
#SBATCH -N 2
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --gres=dcu:4
#SBATCH -J multi_machine_dcu
#SBATCH -o logs/pt-%j.out
#SBATCH -e logs/pt-%j.err

echo "START TIME: $(date)"

# Abort early with a clear message when not running inside a Slurm job.
: "${SLURM_JOB_ID:?must be run under Slurm (submit with sbatch)}"
: "${SLURM_JOB_NODELIST:?must be run under Slurm (submit with sbatch)}"

# MPI ranks started per node; keep in sync with "--gres=dcu:4" above.
slots_per_node=4

hostfile_dir=./hostfile
hostfile=${hostfile_dir}/${SLURM_JOB_ID}
mpi_hostfile="$(pwd)/hostfile/hostfile-dl-${SLURM_JOB_ID}"

# The redirects below fail if the directory is missing.
mkdir -p "${hostfile_dir}" || exit 1

# Expand the compact node list into one hostname per line.
if ! scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${hostfile}"; then
  echo "error: scontrol could not expand SLURM_JOB_NODELIST" >&2
  exit 1
fi
if [[ ! -s "${hostfile}" ]]; then
  echo "error: no hosts found in ${hostfile}" >&2
  exit 1
fi

# Write the mpirun hostfile ("<host> slots=<n>" per line). The file is
# truncated rather than appended to, so a requeued job (same job id) does not
# end up with duplicated entries.
while IFS= read -r host || [[ -n "${host}" ]]; do
  [[ -n "${host}" ]] || continue
  printf '%s slots=%s\n' "${host}" "${slots_per_node}"
done < "${hostfile}" > "${mpi_hostfile}"

# Total rank count = number of distinct nodes * ranks per node.
node_count=$(sort -u "${hostfile}" | wc -l)
np=$((node_count * slots_per_node))

# First field of the first hostname line; passed to single_process.sh as its
# only argument (presumably the rendezvous address -- confirm in that script).
dist_url=$(awk 'NR == 1 { print $1; exit }' "${hostfile}")
echo "${dist_url}"

source ~/miniconda3/etc/profile.d/conda.sh
conda activate torch1.10-dtk23.04.1-py38
#conda activate base

module purge
module load compiler/devtoolset/7.3.1 mpi/hpcx/gcc-7.3.1 compiler/dtk/23.04.1
module list

# Last command: its exit status becomes the job's exit status.
mpirun -np "${np}" \
  --hostfile "hostfile/hostfile-dl-${SLURM_JOB_ID}" \
  --bind-to none \
  "$(pwd)/single_process.sh" "${dist_url}"