#!/usr/bin/env bash
#
# Slurm batch script: launch a 2-node BERT SQuAD run through mpirun.
#
# Steps:
#   1. Expand the Slurm node list into one hostname per line.
#   2. Write an mpirun hostfile with a fixed number of slots per node.
#   3. Start one MPI rank per slot, passing the first node's hostname to the
#      per-rank launcher script as its only argument.
#
# Requires (set by Slurm inside a job): SLURM_JOB_ID, SLURM_JOB_NODELIST.
#
# NOTE(review): the original script carried a misspelled "#SBARCH -n 32"
# line, which Slurm ignores. It is intentionally NOT turned into an active
# directive here: 32 tasks would conflict with 2 nodes x 4 tasks per node and
# with the 8 ranks started below. Confirm the intended value before adding it.
#
#SBATCH -J 2node-test
#SBATCH -p kshdnormal
#SBATCH -N 2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4

set -x

# Print an error to stderr and abort the job.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

HOME_PATH=/public/home/hepj
WORK_PATH=${HOME_PATH}/torch/BERT/2node-run-squad

# Slots written per node in the hostfile; keep in sync with
# --ntasks-per-node / --gres=dcu above.
readonly SLOTS_PER_NODE=4

# Site environment setup; its contents are not visible from this script.
# shellcheck disable=SC1090
source ~/env22.10.sh || die "failed to source ~/env22.10.sh"

# Log which interpreter is on PATH (informational only, never fatal).
command -v python3 || printf 'warning: python3 not found on PATH\n' >&2

: "${SLURM_JOB_ID:?must be run inside a Slurm job}"
: "${SLURM_JOB_NODELIST:?must be run inside a Slurm job}"

# Same file names as before, anchored to the submission-time working
# directory so the writer and mpirun always agree on the location.
node_list="${PWD}/${SLURM_JOB_ID}"
mpi_hostfile="${PWD}/hostfile-${SLURM_JOB_ID}"

scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${node_list}" \
  || die "scontrol could not expand node list '${SLURM_JOB_NODELIST}'"

# Build the hostfile from scratch (truncate, do not append) and count nodes.
num_node=0
while IFS= read -r node || [[ -n "${node}" ]]; do
  [[ -n "${node}" ]] || continue
  printf '%s slots=%d\n' "${node}" "${SLOTS_PER_NODE}"
  num_node=$((num_node + 1))
done < "${node_list}" > "${mpi_hostfile}"

((num_node > 0)) || die "no hosts found in ${node_list}"

# Total MPI ranks: one per slot on every node.
num_dcu=$((num_node * SLOTS_PER_NODE))
echo "${num_dcu}"

# The first hostname in the list is handed to every rank as the launcher
# script's argument.
nodename=$(sed -n '1p' "${node_list}")
echo "${nodename}"
dist_url=$(printf '%s\n' "${nodename}" | awk '{print $1}')

# Exported so the launched processes inherit it.
# NOTE(review): purpose of this setting is not documented here — confirm.
export HSA_USERPTR_FOR_PAGED_MEM=0

mpirun -np "${num_dcu}" --hostfile "${mpi_hostfile}" \
  "${WORK_PATH}/2nodes_single_process.sh" "${dist_url}"