run_mpi.sh 902 Bytes
Newer Older
Sugon_ldc's avatar
Sugon_ldc committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/bin/bash
#SBATCH -p wzhdexclu03
#SBATCH -N 2
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --gres=dcu:4
#SBATCH -J multi_machine_dcu
#SBATCH -o logs/pt-%j.out
#SBATCH -e logs/pt-%j.err

# Multi-node distributed DCU training launcher for SLURM.
#
# Steps:
#   1. Dump the allocated node names into ./hostfile/$SLURM_JOB_ID.
#   2. Build an OpenMPI-style hostfile with 4 slots per node (matching
#      --gres=dcu:4, i.e. one rank per DCU).
#   3. Use the first allocated node as the rendezvous address (dist_url).
#   4. Activate the conda/module toolchain and launch one single_process.sh
#      per rank via mpirun.
#
# NOTE(review): --ntasks-per-node=32 disagrees with slots=4 below; mpirun's
# own -np/--hostfile values (4 ranks per node) are what actually drive the
# launch — confirm the SBATCH directive is intentional.

echo "START TIME: $(date)"

# Ensure the scratch directory for hostfiles exists (fresh checkout safety).
mkdir -p hostfile

hostfile=./hostfile/${SLURM_JOB_ID}
mpi_hostfile=./hostfile/hostfile-dl-${SLURM_JOB_ID}

# One allocated hostname per line.
scontrol show hostnames "$SLURM_JOB_NODELIST" > "$hostfile"

# Build the OpenMPI hostfile: each node contributes 4 slots (one per DCU).
# Truncate first so a re-run with the same job id never appends stale,
# duplicated entries (the original '>>'-only loop did exactly that).
: > "$mpi_hostfile"
while IFS= read -r node; do
    echo "$node slots=4" >> "$mpi_hostfile"
done < "$hostfile"

# Total rank count = unique nodes * 4 DCUs per node.
np=$(sort -u "$hostfile" | wc -l)
np=$((np * 4))

# Rendezvous address: the first allocated node (first field, defensively).
nodename=$(sed -n '1p' "$hostfile")
dist_url=$(echo "$nodename" | awk '{print $1}')
echo "$dist_url"

source ~/miniconda3/etc/profile.d/conda.sh
conda activate torch1.10-dtk23.04.1-py38

#conda activate base
module purge
module load compiler/devtoolset/7.3.1  mpi/hpcx/gcc-7.3.1  compiler/dtk/23.04.1
module list

# --bind-to none: leave CPU affinity to the training processes themselves.
mpirun -np "$np" --hostfile "$mpi_hostfile" --bind-to none "$(pwd)/single_process.sh" "$dist_url"