#!/usr/bin/env bash
#SBATCH -J 2node-test
#SBATCH -p wzhdtest
#SBATCH -N 2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
# NOTE(review): the original carried a misspelled directive ("#SBARCH -n 32")
# that Slurm never honoured. It is kept below in disabled form because 32 tasks
# does not agree with -N 2 and --ntasks-per-node=4 (8 tasks); confirm the
# intended value before re-enabling it.
##SBATCH -n 32
#
# Slurm batch script: launches 2nodes_single_process_pre2.sh through mpirun
# with one MPI rank per DCU on every node allocated to the job.
#
# Steps:
#   1. Source the user environment (~/env22.10.sh).
#   2. Expand the Slurm node list and write it to ./<job id>.
#   3. Write an MPI hostfile (hostfile-<job id>) giving each node
#      DCUS_PER_NODE slots.
#   4. Run mpirun with (number of nodes * DCUS_PER_NODE) ranks, passing the
#      first allocated host name to the worker script as its only argument.
#
# Required environment (set by Slurm): SLURM_JOB_ID, SLURM_JOB_NODELIST.

set -x

# Print an error message on stderr and abort the job.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

readonly HOME_PATH=/work/home/hepj
readonly WORK_PATH="${HOME_PATH}/torch/bert-pretrain/2node-run"
# Slots per node written to the MPI hostfile; matches --gres=dcu:4 and
# --ntasks-per-node=4 in the header above.
readonly DCUS_PER_NODE=4

# shellcheck disable=SC1090 -- site-specific environment file
source ~/env22.10.sh

# Informational only: show which interpreter the environment selected.
command -v python3 || printf 'warning: python3 not found in PATH\n' >&2

: "${SLURM_JOB_ID:?must be run inside a Slurm job}"
: "${SLURM_JOB_NODELIST:?must be run inside a Slurm job}"

# Plain list of allocated host names, one per line.
hostfile="./${SLURM_JOB_ID}"
# Hostfile in the "<host> slots=<n>" form that is handed to mpirun.
mpi_hostfile="$(pwd)/hostfile-${SLURM_JOB_ID}"

scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${hostfile}" \
  || die "scontrol could not expand node list '${SLURM_JOB_NODELIST}'"

# Read the host names, dropping blank lines.
nodes=()
while IFS= read -r node || [[ -n "${node}" ]]; do
  [[ -n "${node}" ]] && nodes+=("${node}")
done < "${hostfile}"

num_node=${#nodes[@]}
(( num_node > 0 )) || die "no hosts found in ${hostfile}"

# Truncate before writing: a requeued job keeps its job ID, so appending
# would leave duplicate entries from the previous run.
: > "${mpi_hostfile}" || die "cannot write ${mpi_hostfile}"
for node in "${nodes[@]}"; do
  printf '%s slots=%d\n' "${node}" "${DCUS_PER_NODE}" >> "${mpi_hostfile}"
done

num_dcu=$(( num_node * DCUS_PER_NODE ))
echo "${num_dcu}"

# The first allocated node is passed to the worker script.
nodename=${nodes[0]}
echo "${nodename}"
# Keep only the first whitespace-separated field, as the original awk did.
dist_url=${nodename%%[[:space:]]*}

export NCCL_DEBUG=INFO
export HSA_USERPTR_FOR_PAGED_MEM=0

mpirun -np "${num_dcu}" --hostfile "${mpi_hostfile}" \
  "${WORK_PATH}/2nodes_single_process_pre2.sh" "${dist_url}"