"vscode:/vscode.git/clone" did not exist on "bdefabd1a8f155235ee4b65c91ef96ce60602c51"
single_process.sh 1.31 KB
Newer Older
huchen's avatar
huchen committed
1
2
#!/bin/bash
#
# Per-local-rank launcher for the tf_cnn_benchmarks ResNet-50 run.
#
# Reads the local rank from OMPI_COMM_WORLD_LOCAL_RANK and, for local ranks
# 0-3, exports the HIP/UCX device selection for that rank, prints the command
# line, and runs the benchmark under numactl with --cpunodebind/--membind set
# to the rank index.
#
# Environment read:
#   OMPI_COMM_WORLD_LOCAL_RANK  local rank index; only 0, 1, 2 and 3 are handled
#   data_dir_path               forwarded as --data_dir=<value>
#   save_checkpoint_path        forwarded as --train_dir=<value>
#
# Exit status: that of numactl, or 2 when the local rank is not supported.

lrank=${OMPI_COMM_WORLD_LOCAL_RANK:-}

# The benchmark argv is kept as an array (not a flat string) so that each
# path is passed as exactly one argument even if it contains whitespace or
# glob characters.
APP=(
  python3
  ./benchmarks-master/scripts/tf_cnn_benchmarks/tf_cnn_benchmarks.py
  --data_format=NCHW
  --batch_size=128
  --model=resnet50
  --optimizer=momentum
  --variable_update=horovod
  --print_training_accuracy=true
  --eval_during_training_every_n_epochs=1
  --nodistortions
  --num_gpus=1
  --num_epochs=90
  --weight_decay=1e-4
  "--data_dir=${data_dir_path:-}"
  --use_fp16=False
  --data_name=imagenet
  "--train_dir=${save_checkpoint_path:-}"
)

#######################################
# Export the per-rank device settings and run the benchmark under numactl.
# Globals:   APP (read); HIP_VISIBLE_DEVICES, UCX_NET_DEVICES,
#            UCX_IB_PCI_BW (exported)
# Arguments: $1 - local rank; must be 0, 1, 2 or 3
# Outputs:   the full command line on stdout before it is run;
#            a diagnostic on stderr for an unsupported rank
# Returns:   numactl's exit status, or 2 for an unsupported rank
#######################################
launch_on_local_rank() {
  local rank=$1

  case "${rank}" in
    0 | 1 | 2 | 3) ;;
    *)
      # Previously an unknown rank fell through the case and the script
      # exited 0 without launching anything; make that failure visible.
      printf '%s: unsupported local rank "%s" (expected 0-3)\n' \
        "${0##*/}" "${rank}" >&2
      return 2
      ;;
  esac

  # Every rank is given the same HIP device list; the UCX device name and
  # the numactl node index follow the rank.
  export HIP_VISIBLE_DEVICES=0,1,2,3
  export UCX_NET_DEVICES="mlx5_${rank}:1"
  export UCX_IB_PCI_BW="mlx5_${rank}:50Gbs"

  local -a cmd=(
    numactl "--cpunodebind=${rank}" "--membind=${rank}" "${APP[@]}"
  )

  # Log the command line first, then run it.
  printf '%s\n' "${cmd[*]}"
  "${cmd[@]}"
}

launch_on_local_rank "${lrank}"