#!/bin/bash
#SBATCH -p caspra
#SBATCH -N 2
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=8
#SBATCH --gres=dcu:4
#SBATCH -J syn_nccl_torch
#SBATCH -o ./log/output.%j
#SBATCH -e ./log/output.%j
#SBATCH -x e01r1n07
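# Resource layout: 2 nodes x 4 MPI ranks per node, 8 CPU cores per rank,
# and 4 DCUs per node (one per rank); -x excludes node e01r1n07 from the allocation.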

# environment for ROCm 3.3
module rm compiler/rocm/2.9
module load apps/PyTorch/1.5.0a0/hpcx-2.4.1-gcc-7.3.1-rocm3.3
export MIOPEN_DEBUG_DISABLE_FIND_DB=1
which mpirun
which python3

hostfile=./$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
rm -f `pwd`/hostfile-dl-$SLURM_JOB_ID   # remove any stale per-job hostfile before appending

for i in `cat $hostfile`
do
    echo ${i} slots=4 >> `pwd`/hostfile-dl-$SLURM_JOB_ID
done
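# The generated hostfile gives each allocated node 4 slots, matching
# --ntasks-per-node. Its contents look like (hostnames illustrative):
#   e01r1n01 slots=4
#   e01r1n02 slots=4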
np=$(cat $hostfile | sort | uniq | wc -l)

np=$(($np*4))
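# Total MPI ranks: unique nodes x 4 ranks per node (8 for -N 2).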

nodename=$(cat $hostfile |sed -n "1p")
echo $nodename
dist_url=`echo $nodename | awk '{print $1}'`
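# dist_url is the hostname of the first allocated node; single_process.sh is
# expected to use it as the rendezvous address for distributed initialization
# (an assumption, since that script is not shown here).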


# for a single card
#echo mpirun -np 1 --allow-run-as-root --bind-to none `pwd`/single_process.sh $dist_url inception_v3 64
#mpirun -np 1 --allow-run-as-root --bind-to none `pwd`/single_process.sh $dist_url inception_v3 64

# for one node
#echo mpirun -np $np --allow-run-as-root --bind-to none `pwd`/single_process.sh $dist_url inception_v3 64
#mpirun -np $np --allow-run-as-root --bind-to none `pwd`/single_process.sh $dist_url inception_v3 64

# for multi-node (all DCUs across the allocated nodes)
echo mpirun -np $np --allow-run-as-root --hostfile hostfile-dl-$SLURM_JOB_ID  --bind-to none `pwd`/single_process.sh $dist_url resnet50 64
mpirun -np $np --allow-run-as-root --hostfile hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/single_process.sh $dist_url resnet50 64
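
# For reference only: a minimal sketch of what single_process.sh is assumed to
# look like. The real script is not included in this file; train.py, its flags,
# and the port are illustrative placeholders, while the OMPI_* variables are
# standard Open MPI per-rank environment variables.
#
# #!/bin/bash
# dist_url=$1    # hostname of the first node, used as the rendezvous address
# model=$2       # e.g. resnet50
# batch_size=$3  # per-process batch size
# rank=$OMPI_COMM_WORLD_RANK
# world_size=$OMPI_COMM_WORLD_SIZE
# export HIP_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK   # bind one DCU per rank
# python3 train.py --dist-url tcp://${dist_url}:23456 --dist-backend nccl \
#     --world-size $world_size --rank $rank --arch $model --batch-size $batch_size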