#!/bin/bash
#SBATCH -p wzhdexclu10
#SBATCH -N 2
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node=4
#SBATCH --mem 50G
#SBATCH --gres=dcu:4
#SBATCH -J data2vec
#SBATCH -o logs/%x-%j.txt
#SBATCH -e logs/%x-%j.txt

ulimit -u 200000

# NCCL / ROCm (DCU) runtime settings
# export NCCL_DEBUG=INFO
export NCCL_IB_HCA=mlx5
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME=ib0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export OMP_NUM_THREADS=1

echo "START TIME: $(date)"

# Load compiler, MPI and DTK toolchain modules
module purge
module load compiler/devtoolset/7.3.1
module load mpi/hpcx/gcc-7.3.1
module load compiler/dtk/23.04
# source /opt/dtk-23.04/env.sh
source /public/home/chenzk/dtk-23.04/env.sh
module list
which mpirun

# Activate the fairseq conda environment
source /public/home/chenzk/anaconda3/bin/activate fairseq
# conda activate fairseq
which python3
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/public/home/chenzk/anaconda3/envs/fairseq/lib
# source activate fairseq

export PYTHON=python3
export NPROC_PER_NODE=4

# Clean up artifacts from previous runs
rm -f ./hostfile/*
rm -f core.*

dir="./hostfile"
if [ ! -d "$dir" ]; then
    mkdir "$dir"
    echo "$dir created successfully"
else
    echo "$dir already existed"
fi

dir="./checkpoints"
if [ ! -d "$dir" ]; then
    mkdir "$dir"
    echo "$dir created successfully"
else
    echo "$dir already existed"
fi

# Dump the node list allocated by Slurm, then build an OpenMPI-style
# hostfile with one "<hostname> slots=4" line per node (4 DCUs per node)
hostfile=./hostfile/$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
for i in $(cat $hostfile)
do
    echo "${i} slots=4" >> $(pwd)/hostfile/hostfile-dl-$SLURM_JOB_ID
done

# Total process count = number of unique nodes * 4 DCUs per node
np=$(cat $hostfile | sort | uniq | wc -l)
np=$(($np*4))

# Use the first node in the list as the rendezvous host (dist_url)
nodename=$(cat $hostfile | sed -n "1p")
dist_url=$(echo $nodename | awk '{print $1}')

mpirun -np $np --allow-run-as-root \
    --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID \
    --bind-to none \
    $(pwd)/pretrain_datavec_mpi.sh $dist_url
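
# Usage sketch (the filename "pretrain_datavec.slurm" is an assumption for
# illustration; stdout/stderr land in logs/data2vec-<jobid>.txt per the
# #SBATCH -o/-e settings above):
#   mkdir -p logs
#   sbatch pretrain_datavec.slurm
#   tail -f logs/data2vec-<jobid>.txt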