#!/bin/bash
#SBATCH -p kshdnormal01
#SBATCH -N 4
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --mem 100G
#SBATCH --gres=dcu:4
#SBATCH -J chatglm
#SBATCH -o logs/pt-%j.out
#SBATCH -e logs/pt-%j.err
#
# Slurm batch script for the "chatglm" job.
#
# What it does:
#   1. Sets process limits and communication/runtime environment variables.
#   2. Expands the Slurm node list into ./hostfile/<job id> (one host per line).
#   3. Derives an MPI hostfile ./hostfile/hostfile-dl-<job id> with
#      "<host> slots=4" per node.
#   4. Starts run_train_single.sh under mpirun with 4 ranks per node, passing
#      the first allocated host as the only argument.
#
# Usage: sbatch <this script>   (submit from the directory that contains
#        run_train_single.sh).
# NOTE(review): the -o/-e paths point into ./logs; presumably that directory
#   must already exist at submission time -- confirm for your Slurm setup.
# Requires: SLURM_JOB_ID and SLURM_JOB_NODELIST (provided by Slurm),
#   scontrol and mpirun on PATH.

# Ranks started per node; mirrors the 4 devices requested via --gres=dcu:4.
readonly slots_per_node=4

# Raise the per-user process limit. Failure is reported but is not fatal,
# so the job still proceeds with whatever limit is in effect.
ulimit -u 200000 || echo "warning: could not raise 'ulimit -u' to 200000" >&2

# Threading / logging.
export OMP_NUM_THREADS=1
export NCCL_DEBUG=INFO

# Device runtime tuning.
# NOTE(review): values kept exactly as found; their precise effect depends on
# the installed MIOpen/HSA stack -- confirm against the vendor documentation.
export MIOPEN_FIND_MODE=3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_COMPILE_PARALLEL_LEVEL=1

# Collective-communication settings (ib0 is the interface name handed to NCCL).
export NCCL_PLUGIN_P2P=ucx
export NCCL_SOCKET_IFNAME=ib0
export NCCL_P2P_LEVEL=5
export NCCL_NET_PLUGIN=none

# Make sure values inherited from the submitting shell do not leak in.
unset RCCL_NCHANNELS
unset NCCL_NET_GDR_LEVEL

# Both variables are needed to build the hostfiles; abort early without them.
: "${SLURM_JOB_ID:?SLURM_JOB_ID is not set (submit this script with sbatch)}"
: "${SLURM_JOB_NODELIST:?SLURM_JOB_NODELIST is not set (submit this script with sbatch)}"

work_dir=$(pwd)
hostfile_dir="${work_dir}/hostfile"
hostfile="${hostfile_dir}/${SLURM_JOB_ID}"
mpi_hostfile="${hostfile_dir}/hostfile-dl-${SLURM_JOB_ID}"

# Only this job's files are (re)written below. The directory is deliberately
# not wiped, so hostfiles of other jobs started from here stay intact.
mkdir -p -- "${hostfile_dir}" || {
  echo "error: cannot create ${hostfile_dir}" >&2
  exit 1
}

echo "START TIME: $(date)"

# One allocated host name per line.
scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${hostfile}" || {
  echo "error: 'scontrol show hostnames' failed" >&2
  exit 1
}

# MPI hostfile: "<host> slots=N" per node. Written with a truncating redirect
# so a re-run under the same job id cannot accumulate duplicate entries.
while IFS= read -r node || [[ -n "${node}" ]]; do
  [[ -n "${node}" ]] || continue
  printf '%s slots=%d\n' "${node}" "${slots_per_node}"
done < "${hostfile}" > "${mpi_hostfile}"

# Total rank count = distinct nodes * ranks per node.
node_count=$(sort -u -- "${hostfile}" | wc -l)
np=$(( node_count * slots_per_node ))

# The first host in the list is handed to the training script
# (first whitespace-separated field of line 1).
dist_url=$(awk 'NR == 1 { print $1; exit }' "${hostfile}")

if (( np <= 0 )) || [[ -z "${dist_url}" ]]; then
  echo "error: no hosts resolved from SLURM_JOB_NODELIST='${SLURM_JOB_NODELIST}'" >&2
  exit 1
fi

printf '%s\n' "${dist_url}"

# Kept as the last command so the job's exit status is mpirun's exit status.
mpirun -np "${np}" \
  --hostfile "${mpi_hostfile}" \
  --bind-to none \
  "${work_dir}/run_train_single.sh" "${dist_url}"