#!/bin/bash
#SBATCH -p caspra
#SBATCH -N 2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#SBATCH -J syn_nccl_torch
#SBATCH -o ./log/output.%j
#SBATCH -e ./log/output.%j
#SBATCH -x e01r1n07

# switch the toolchain to ROCm 3.3
module rm compiler/rocm/2.9
module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
which mpirun
which python3

# dump the allocated node names into a per-job hostfile
hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}

# build an Open MPI hostfile: 4 slots per node, one per DCU
# (the cleanup must target the same per-job file the loop appends to)
rm -f $(pwd)/hostfile-dl-$SLURM_JOB_ID
for i in $(cat $hostfile)
do
    echo ${i} slots=4 >> $(pwd)/hostfile-dl-$SLURM_JOB_ID
done

# total process count = number of unique nodes * 4 DCUs per node
np=$(cat $hostfile | sort | uniq | wc -l)
np=$(($np * 4))

# use the first allocated node as the rendezvous address
nodename=$(sed -n '1p' $hostfile)
echo $nodename
dist_url=$(echo $nodename | awk '{print $1}')

# for a single card
#echo mpirun -np 1 --allow-run-as-root --bind-to none $(pwd)/single_process.sh $dist_url inception_v3 64
#mpirun -np 1 --allow-run-as-root --bind-to none $(pwd)/single_process.sh $dist_url inception_v3 64

# for one node
#echo mpirun -np $np --allow-run-as-root --bind-to none $(pwd)/single_process.sh $dist_url inception_v3 64
#mpirun -np $np --allow-run-as-root --bind-to none $(pwd)/single_process.sh $dist_url inception_v3 64

# for multiple nodes (one process per DCU across the allocation)
echo mpirun -np $np --allow-run-as-root --hostfile hostfile-dl-$SLURM_JOB_ID --bind-to none $(pwd)/single_process.sh $dist_url resnet50 64
mpirun -np $np --allow-run-as-root --hostfile hostfile-dl-$SLURM_JOB_ID --bind-to none $(pwd)/single_process.sh $dist_url resnet50 64
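
# ------------------------------------------------------------------
# single_process.sh itself is not included in this script. Below is a
# minimal sketch of what such a per-rank launcher could look like,
# kept fully commented out so it never executes as part of this job.
# Everything in it is an assumption: the entry point (train.py), the
# rendezvous port (23456), and the flag names (modeled on the PyTorch
# ImageNet example) are hypothetical, not confirmed by this script.
#
#   #!/bin/bash
#   dist_url=$1      # first node's hostname, used as rendezvous address
#   model=$2         # e.g. resnet50 or inception_v3
#   batch=$3         # per-process batch size
#
#   # Open MPI exports these variables for every launched rank
#   rank=$OMPI_COMM_WORLD_RANK
#   world_size=$OMPI_COMM_WORLD_SIZE
#   local_rank=$OMPI_COMM_WORLD_LOCAL_RANK
#
#   # pin each rank to one of the node's 4 DCUs (ROCm uses HIP)
#   export HIP_VISIBLE_DEVICES=$local_rank
#
#   python3 train.py \
#       --dist-backend nccl \
#       --dist-url tcp://${dist_url}:23456 \
#       --world-size ${world_size} \
#       --rank ${rank} \
#       --arch ${model} \
#       --batch-size ${batch}
# ------------------------------------------------------------------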