#!/bin/bash
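# Single-node job on the "normal" partition: 4 MPI ranks, 8 CPU cores per rank, 4 DCU accelerators.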
#SBATCH -p normal
#SBATCH -N 1
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#SBATCH -J detection
#SBATCH -o ./log/output.%j
#SBATCH -e ./log/output.%j
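# stdout and stderr share one file per job; the ./log directory must exist before submission.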

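# Swap the default ROCm 2.9 compiler module for the PyTorch 1.5.0a0 build (HPC-X 2.4.1, GCC 7.3.1, ROCm 3.3).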
module rm compiler/rocm/2.9
module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3

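# Confirm which mpirun and python3 the module environment put on PATH.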
which mpirun
which python3
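# Write one hostname per allocated node into a file named after the job ID.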
hostfile=./$SLURM_JOB_ID
scontrol show hostnames "$SLURM_JOB_NODELIST" > "${hostfile}"
rm -f "$(pwd)/hostfile-dl"

#hostfile=./node_list
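# Build an Open MPI hostfile assigning 4 slots (one per DCU/rank) to each node.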
for i in $(cat "$hostfile")
do
    echo "${i} slots=4" >> "$(pwd)/hostfile-dl"
done
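# Total rank count: number of unique nodes times 4 ranks per node.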
np=$(sort -u "$hostfile" | wc -l)

np=$((np*4))

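# Take the first node in the list as the rendezvous host passed to single_process.sh
# (presumably used as the PyTorch distributed init address).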
nodename=$(head -n 1 "$hostfile")
echo "$nodename"
dist_url=$(echo "$nodename" | awk '{print $1}')


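# Log the full launch command, then start one single_process.sh instance per rank, with no core binding.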
echo mpirun -np $np --allow-run-as-root --hostfile hostfile-dl --bind-to none "$(pwd)/single_process.sh" $dist_url
mpirun -np $np --allow-run-as-root --hostfile hostfile-dl --bind-to none "$(pwd)/single_process.sh" "$dist_url"