multi-dcu.sh 475 Bytes
Newer Older
huchen's avatar
huchen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#!/bin/bash
#
# Launch single_process.sh under mpirun with 4 ranks for every host listed in
# ./hostfile.
#
# Reads:  ./hostfile - one hostname per line; blank lines and lines starting
#         with '#' are skipped.
# Writes: ./hostfile-dl-$SLURM_JOB_ID - the same hosts, each annotated with
#         "slots=4". The file is regenerated from scratch on every run.
# Env:    SLURM_JOB_ID (optional) - suffix of the generated hostfile name.
# Exit:   1 if mpirun or a usable hostfile is missing, otherwise the exit
#         status of mpirun.

set -u

readonly DCUS_PER_NODE=4
work_dir=$(pwd)
readonly work_dir
readonly hostfile="${work_dir}/hostfile"
readonly slots_hostfile="${work_dir}/hostfile-dl-${SLURM_JOB_ID:-}"

# Show which interpreter and launcher will be picked up from PATH.
command -v python3 || echo "warning: python3 not found in PATH" >&2
if ! command -v mpirun; then
  echo "error: mpirun not found in PATH" >&2
  exit 1
fi

if [[ ! -r "${hostfile}" ]]; then
  echo "error: cannot read ${hostfile}" >&2
  exit 1
fi

# Truncate first: appending to a file left over from an earlier run would list
# every host more than once.
if ! : > "${slots_hostfile}"; then
  echo "error: cannot write ${slots_hostfile}" >&2
  exit 1
fi

num_node=0
# Read line by line (first field only) so the contents are never glob-expanded;
# the "|| [[ -n ... ]]" keeps a last line that has no trailing newline.
while read -r host _ || [[ -n "${host}" ]]; do
  if [[ -z "${host}" || "${host}" == \#* ]]; then
    continue
  fi
  printf '%s slots=%d\n' "${host}" "${DCUS_PER_NODE}" >> "${slots_hostfile}"
  num_node=$((num_node + 1))
done < "${hostfile}"

# Without this guard an empty hostfile would reach mpirun as "-np 0".
if ((num_node == 0)); then
  echo "error: no hosts found in ${hostfile}" >&2
  exit 1
fi

echo "resnet50 node is " "${num_node}"
num_DCU=$((num_node * DCUS_PER_NODE))

#multi-node multi-gpu
# Pass the generated hostfile, not the bare one: it is the only file that
# carries the per-host "slots=4" limit matching the -np value computed above.
mpirun -np "${num_DCU}" --allow-run-as-root --hostfile "${slots_hostfile}" --bind-to none "${work_dir}/single_process.sh"

#single-node multi-gpu
#mpirun -np "${num_DCU}" --allow-run-as-root --bind-to none "${work_dir}/single_process.sh"