#!/bin/bash
# Slurm batch header for a 2-node run launched through mpirun further below.
# sbatch only honours #SBATCH lines that appear before the first executable
# command, so keep every directive in this header.
#   -J                  job name
#   -p                  partition to submit to
#   -N                  number of nodes requested
#   --ntasks-per-node   tasks per node (the launch section below also uses 4 slots per node)
#   --cpus-per-task     CPU cores reserved for each task
#   --gres=dcu:4        four "dcu" generic resources per node
#   -o / -e             stdout and stderr both go to the same file; %j is the job ID
#   --exclusive         do not share the allocated nodes with other jobs
# NOTE(review): the ./log directory presumably has to exist before submission,
# since the output path points into it — confirm on this cluster.
#SBATCH -J 23.04
#SBATCH -p wzhdexclu10
#SBATCH -N 2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#SBATCH -o ./log/%j-dtk23.04-zero3-2node.out
#SBATCH -e ./log/%j-dtk23.04-zero3-2node.out
#SBATCH --exclusive 

# ---- Toolchain -------------------------------------------------------------
# Activate the Python environment and the site environment script (both paths
# are cluster-specific), then reset the module set and load compiler + MPI.
source /work/home/hepj/job_env/torch-dtk23.04-py39/bin/activate
source /work/home/hepj/env-source/env23.04.sh
# NOTE(review): the purge runs after env23.04.sh has been sourced, so any
# modules that script loads are presumably unloaded here — confirm intended.
module purge
module load compiler/devtoolset/7.3.1
module load mpi/hpcx/gcc-7.3.1
# module load compiler/dtk/23.04

# ---- Accelerator runtime ---------------------------------------------------
# Expose devices 0-3 to the HIP runtime on each node.
export HIP_VISIBLE_DEVICES=0,1,2,3
export HSA_FORCE_FINE_GRAIN_PCIE=1

# ---- MIOpen ----------------------------------------------------------------
# Kernel-search settings; the user DB and kernel cache are kept under /tmp.
export MIOPEN_FIND_MODE=3
export MIOPEN_DEBUG_CONV_IMPLICIT_GEMM=0
export MIOPEN_USER_DB_PATH=/tmp/miopen-udb
export MIOPEN_CUSTOM_CACHE_DIR=/tmp/miopen-cache

# ---- Collective communication ----------------------------------------------
# Use the ib0 interface for socket traffic and HCAs whose name matches mlx5.
export NCCL_SOCKET_IFNAME=ib0
export NCCL_IB_HCA=mlx5

# ---- CPU threading ---------------------------------------------------------
export OMP_NUM_THREADS=1
#export MKL_NUM_THREADS=1

# Lift the stack-size limit for everything started from this shell.
ulimit -s unlimited

# Expand the job's compact node list into one hostname per array element.
# mapfile avoids relying on word splitting of the command output.
mapfile -t nodes < <(scontrol show hostnames "$SLURM_JOB_NODELIST")
# Kept under its original name for compatibility. It used to be built from an
# unsubscripted $nodes and therefore held only the first hostname; it now
# mirrors the complete list. Element 0 (the head node) is unchanged.
nodes_array=("${nodes[@]}")
head_node=${nodes_array[0]}
# Run `hostname --ip-address` on the head node itself to obtain its address.
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
echo "Node IP: $head_node_ip"
echo "headnode: $head_node"
# Node index as reported by Slurm for the shell running this script.
NODE_RANK=$SLURM_NODEID


# Build the per-job MPI hostfile and launch ./single_ddp.sh on every slot.

# Ranks started on each node; used for both the hostfile slot count and -np.
slots_per_node=4

hostfile_dir=./hostfiles
# The redirections below fail when the directory is missing, so create it first.
mkdir -p "$hostfile_dir" || exit 1

# Plain node list for this job, one hostname per line.
hostfile=$hostfile_dir/$SLURM_JOB_ID
# Same nodes annotated with "slots=N", in the format mpirun --hostfile expects.
mpi_hostfile=$hostfile_dir/hostfile-dl-$SLURM_JOB_ID

scontrol show hostnames "$SLURM_JOB_NODELIST" > "$hostfile"

# The original cleanup removed the un-suffixed legacy file; that is kept.
rm -f -- "$(pwd)/hostfiles/hostfile-dl"

# Write the per-job hostfile with truncation instead of appending, so a job
# that is requeued under the same ID does not accumulate duplicate entries.
awk -v slots="$slots_per_node" 'NF { print $1 " slots=" slots }' "$hostfile" > "$mpi_hostfile"

# Total rank count = number of distinct nodes * slots per node.
np=$(sort -u "$hostfile" | wc -l)
np=$((np * slots_per_node))

# The first listed node acts as the master; its name is handed to the workers.
nodename=$(sed -n "1p" "$hostfile")
echo "$nodename"
dist_url=$(awk '{print $1}' <<<"$nodename")

# Without a master node name there is nothing sensible to launch.
if [[ -z "$dist_url" ]]; then
  echo "error: no hostnames found in $hostfile" >&2
  exit 1
fi

mpirun -np "$np" --allow-run-as-root --hostfile "$mpi_hostfile" --bind-to none ./single_ddp.sh "$dist_url"
                                                                                                  

