#!/bin/bash
#
# single_process.sh - per-process launcher for one rank of a distributed
# training job.
#
# The script reads its rank information from OMPI_COMM_WORLD_* environment
# variables, builds the training command line, and runs it under numactl
# bound to the NUMA node whose number equals the node-local rank.
#
# Positional arguments (consumed when the command line is built):
#   $1  host used in the tcp:// rendezvous URL
#   $2  value passed to --arch
#   $3  value passed to --batch-size

# Exported so that every child process started by this script inherits it.
# NOTE(review): ROCm/HSA runtime setting; presumably enables fine-grained
# PCIe memory access - confirm against the ROCm documentation.
export HSA_FORCE_FINE_GRAIN_PCIE=1

# Rank information taken from the launcher's environment. These are the
# variable names Open MPI sets for each process it starts; all three are
# empty if the script is run outside such a launcher.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK    # rank of this process within its node
comm_rank=$OMPI_COMM_WORLD_RANK      # rank of this process in the whole job
comm_size=$OMPI_COMM_WORLD_SIZE      # total number of processes in the job

# Training command line, kept as ONE flat string: the launch code expands
# ${APP} unquoted so that the shell splits it into separate arguments.
# Consequently none of the substituted values may contain whitespace.
#
#   $1 -> host in the tcp:// rendezvous URL (port is fixed at 34567)
#   $2 -> --arch
#   $3 -> --batch-size
#   comm_size / comm_rank -> --world-size / --rank
#
# The two /path/to/{...} entries are literal placeholders exactly as
# written; replace them with real directories before running.
APP="python3 ../train.py"
APP+=" --batch-size=${3} --arch=${2} -j 6 --epochs=90"
APP+=" --dist-url tcp://${1}:34567 --dist-backend nccl"
APP+=" --world-size=${comm_size} --rank=${comm_rank}"
APP+=" --save-path=/path/to/{save_model_dir}"
APP+=" /path/to/{ImageNet_pytorch_data_dir}/"

#######################################
# Run ${APP} under numactl, binding CPU and memory to the NUMA node whose
# number equals the node-local rank.
# Globals:
#   lrank               (read)     node-local rank; only 0, 1, 2, 3 are handled
#   APP                 (read)     command line stored as one flat string
#   HIP_VISIBLE_DEVICES (exported) set to 0,1,2,3 for every handled rank
# Outputs:
#   stdout: the numactl command line, printed before it is executed
#   stderr: a diagnostic when lrank is not one of 0-3
# Returns:
#   numactl's exit status, or 1 when lrank is unsupported (previously such
#   a rank fell through the case silently and nothing was launched).
#######################################
launch_on_local_numa_node() {
  case "${lrank:-}" in
    0 | 1 | 2 | 3)
      export HIP_VISIBLE_DEVICES=0,1,2,3
      # APP is a flat string, so the unquoted expansion is intentional: the
      # shell must word-split it into the individual arguments.
      # shellcheck disable=SC2086
      echo numactl "--cpunodebind=${lrank}" "--membind=${lrank}" ${APP}
      # shellcheck disable=SC2086
      numactl "--cpunodebind=${lrank}" "--membind=${lrank}" ${APP}
      ;;
    *)
      printf '%s: unsupported local rank "%s" (expected 0-3); nothing launched\n' \
        "${0##*/}" "${lrank:-}" >&2
      return 1
      ;;
  esac
}

# Called as the final statement so its status becomes the script's status.
launch_on_local_numa_node