dcu.sh 1.65 KB
Newer Older
liangjing's avatar
liangjing committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/bin/bash
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE
## DL params
export BATCHSIZE=16
export NUMEPOCHS=6
export DATASET_DIR="/data/OpenImages_mlperf"
export EXTRA_PARAMS='--lr 0.000085 --warmup-epochs 0 --frozen-bn-opt --frozen-bn-fp16 --apex-adam --disable-ddp-broadcast-buffers --fp16-allreduce --skip-metric-loss --async-coco'

# Set variables
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
LOG_INTERVAL=${LOG_INTERVAL:-20}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"}

# run benchmark
echo "running benchmark"

PARAMS=(
--batch-size "${BATCHSIZE}"
--eval-batch-size "${EVALBATCHSIZE}"
--epochs "${NUMEPOCHS}"
--print-freq "${LOG_INTERVAL}"
--dataset-path "${DATASET_DIR}"
--local_rank "${comm_rank}"
--world-size "${comm_size}"
)

# run training
APP="python3 train.py ${PARAMS[@]} ${EXTRA_PARAMS}"

case ${lrank} in
[0])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=0 --membind=0 ${APP}
  ;;
[1])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=1 --membind=1 ${APP}
  ;;
[2])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=2 --membind=2 ${APP}
  ;;
[3])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=3 --membind=3 ${APP}
  ;;
[4])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=4 --membind=4 ${APP}
  ;;
[5])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=5 --membind=5 ${APP}
  ;;
[6])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=6 --membind=6 ${APP}
  ;;
[7])
  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
  numactl --cpunodebind=7 --membind=7 ${APP}
  ;;
esac