single_process.sh 1.33 KB
Newer Older
qianyj's avatar
qianyj committed
1
#!/bin/bash
qianyj's avatar
qianyj committed
2

qianyj's avatar
qianyj committed
3
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
qianyj's avatar
qianyj committed
4
5
6
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

qianyj's avatar
qianyj committed
7
APP=" python3 ./benchmarks-master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py --data_format=NCHW --batch_size=128 --model=resnet50  --optimizer=momentum --variable_update=horovod  --print_training_accuracy=true  --nodistortions --num_gpus=1 --num_epochs=90 --weight_decay=1e-4 --data_dir=$data_dir_path   --use_fp16=False --data_name=imagenet --train_dir=$save_checkpoint_path"
qianyj's avatar
qianyj committed
8

qianyj's avatar
qianyj committed
9
case ${lrank} in
qianyj's avatar
qianyj committed
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
[0])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_0:1
  export UCX_IB_PCI_BW=mlx5_0:50Gbs
  echo numactl --cpunodebind=0 --membind=0 ${APP}
  numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[1])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_1:1
  export UCX_IB_PCI_BW=mlx5_1:50Gbs
  echo numactl --cpunodebind=1 --membind=1 ${APP}
  numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
[2])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_2:1
  export UCX_IB_PCI_BW=mlx5_2:50Gbs
  echo numactl --cpunodebind=2 --membind=2 ${APP}
  numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
[3])
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES=mlx5_3:1
  export UCX_IB_PCI_BW=mlx5_3:50Gbs
  echo numactl --cpunodebind=3 --membind=3 ${APP}
  numactl --cpunodebind=3 --membind=3 ${APP}
  ;;
esac