#!/bin/bash
#SBATCH -p xahdnormal
#SBATCH -N 4
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node=4
#SBATCH --gres=dcu:4
#SBATCH -J llama
#SBATCH -t 72:00:00
#SBATCH -w c06r3n[06-09]
#SBATCH -o ./logs/%j.out
#SBATCH -e ./logs/%j.out

# Job body: export the runtime environment, build the per-job host lists from
# the Slurm allocation, publish them as ./hosts, and start run-full.sh.

# Abort on the first failed command, unset variable, or failed pipeline stage
# so the launcher is never started with a missing or partial host list.
set -euo pipefail

echo "START TIME: $(date)"
export NCCL_IB_TIMEOUT=22

# Disable core dumps for this job.
ulimit -c 0
export XDG_CACHE_HOME=/work/home/liangjing/.cache
export HF_DATASETS_CACHE=/work/home/liangjing/.cache/huggingface/datasets

# Both variables are provided by Slurm inside a batch allocation; fail with a
# clear message when the script is run outside of one.
: "${SLURM_JOB_ID:?SLURM_JOB_ID is not set; submit this script with sbatch}"
: "${SLURM_JOB_NODELIST:?SLURM_JOB_NODELIST is not set; submit this script with sbatch}"

hostfile_dir=./hostfile
hostfile="${hostfile_dir}/${SLURM_JOB_ID}"
hostfile_dl="${hostfile_dir}/hostfile-dl-${SLURM_JOB_ID}"
# Slots written per node; presumably matches the 4 DCUs / 4 tasks per node
# requested in the #SBATCH header -- keep in sync if those change.
slots_per_node=4

# The output directory may not exist on a fresh checkout.
mkdir -p -- "${hostfile_dir}"

# Expand the compact node list into one hostname per line.
scontrol show hostnames "${SLURM_JOB_NODELIST}" > "${hostfile}"
# Optional alternatives kept from the original script (disabled):
#shuf ./hostfile/hostfile -o  ${hostfile}
#bash ./hostfile/nodelist_reset.sh ${hostfile} ${hostfile}_reset

[[ -s "${hostfile}" ]] || {
  printf 'error: no hostnames resolved from %s\n' "${SLURM_JOB_NODELIST}" >&2
  exit 1
}

# Start from an empty file: a requeued job keeps its job ID, and appending to
# a leftover file would list every node twice.
: > "${hostfile_dl}"
# To use the reset node list instead, read from "${hostfile}_reset".
while IFS= read -r node || [[ -n "${node}" ]]; do
  [[ -n "${node}" ]] || continue
  printf '%s slots=%s\n' "${node}" "${slots_per_node}" >> "${hostfile_dl}"
done < "${hostfile}"

# run-full.sh is started right after this copy; presumably it reads ./hosts.
cp -- "${hostfile_dl}" hosts
bash run-full.sh
