#!/bin/bash
#SBATCH -p xahdnormal
#SBATCH -N 4
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node=4
#SBATCH --gres=dcu:4
#SBATCH -J llama
#SBATCH -t 72:00:00
#SBATCH -w c06r3n[06-09]
#SBATCH -o ./logs/%j.out
#SBATCH -e ./logs/%j.out
#
# Slurm batch job "llama".
#
# Steps performed:
#   1. Export NCCL / cache-related environment variables.
#   2. Expand the job's node list into ./hostfile/<jobid> (one host per line).
#   3. Write ./hostfile/hostfile-dl-<jobid> with "<host> slots=4" per node
#      and copy it to ./hosts.
#   4. Run run-full.sh from the current directory.
#
# Required environment (set by Slurm): SLURM_JOB_ID, SLURM_JOB_NODELIST.
# NOTE: stdout and stderr both go to ./logs/<jobid>.out; the ./logs
# directory has to exist before the job is submitted.

echo "START TIME: $(date)"

export NCCL_IB_TIMEOUT=22
ulimit -c 0 # no core dumps

export XDG_CACHE_HOME=/work/home/liangjing/.cache
export HF_DATASETS_CACHE=/work/home/liangjing/.cache/huggingface/datasets

# Fail early with a clear message when not running inside a Slurm allocation,
# instead of producing paths such as "./hostfile/" further down.
: "${SLURM_JOB_ID:?SLURM_JOB_ID is not set; submit this script with sbatch}"
: "${SLURM_JOB_NODELIST:?SLURM_JOB_NODELIST is not set; submit this script with sbatch}"

hostfile_dir=./hostfile
hostfile="${hostfile_dir}/${SLURM_JOB_ID}"
slots_hostfile="${hostfile_dir}/hostfile-dl-${SLURM_JOB_ID}"
# 4 equals the --ntasks-per-node / --gres=dcu:4 values requested above;
# presumably these are meant to stay in sync -- confirm before changing one.
slots_per_node=4

mkdir -p -- "${hostfile_dir}" || {
  echo "cannot create ${hostfile_dir}" >&2
  exit 1
}

# One hostname per line for every node allocated to this job.
scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${hostfile}" || {
  echo "scontrol failed to expand node list '${SLURM_JOB_NODELIST}'" >&2
  exit 1
}

# Disabled alternatives for choosing/reordering the node list, kept for reference:
#shuf ./hostfile/hostfile -o ${hostfile}
#bash ./hostfile/nodelist_reset.sh ${hostfile} ${hostfile}_reset
#for i in `cat ${hostfile}_reset`

# Build the "<host> slots=N" file. The output redirection on the loop truncates
# the file first, so a requeued job that keeps its job ID does not end up with
# duplicated host entries (the previous per-line ">>" append accumulated them).
while IFS= read -r node || [[ -n "${node}" ]]; do
  [[ -n "${node}" ]] || continue # skip blank lines
  printf '%s slots=%s\n' "${node}" "${slots_per_node}"
done < "${hostfile}" > "${slots_hostfile}"

# run-full.sh presumably reads ./hosts -- verify against that script.
cp -- "${slots_hostfile}" hosts || {
  echo "cannot copy ${slots_hostfile} to ./hosts" >&2
  exit 1
}

# Last command: its exit status becomes the job's exit status.
bash run-full.sh