#!/bin/bash
# run_test.sh

#######################################
# Print a message as a highlighted banner (black text on a green
# background) with blank lines above and below it.
# Arguments: $1 - text to display; printed literally.
# Outputs:   writes the banner to stdout.
#######################################
print_banner() {
  # The message is passed as an argument to %s instead of being spliced into
  # the format string, so '%' or backslashes in it cannot corrupt the output.
  printf '\n\n\n\e[30m\e[42m%s\e[0m\n\n\n\n' "$1"
}

# $1 is the run mode; it is shown in the opening banner.
print_banner "Distributed status:  $1"

# $2 is echoed and then kept in DATADIR, which is appended to every training
# command below. It is quoted so a value containing spaces or glob characters
# is printed unchanged.
printf '%s\n' "$2"
DATADIR=$2

# USE_BASELINE holds the --use_baseline flag unless a non-empty third
# argument is supplied, in which case it is left empty.
# NOTE(review): nothing else in this script reads USE_BASELINE; compare.py is
# invoked with a literal --use_baseline. Confirm whether that is intended.
if [[ -n "${3:-}" ]]; then
  USE_BASELINE=""
else
  USE_BASELINE="--use_baseline"
fi

# Pick the launch command from the run mode in $1. BASE_CMD is a plain string
# that later gets word-split into a command line.
case "${1:-}" in
  single_gpu)
    BASE_CMD="python main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5"
    ;;
  distributed)
    BASE_CMD="python -m torch.distributed.launch --nproc_per_node=2 main_amp.py -a resnet50 --b 128 --workers 4 --deterministic --prints-to-process 5"
    ;;
  *)
    # BASE_CMD is not assigned for any other mode. The script carries on as it
    # always has, but the gap is now reported instead of passing silently.
    printf 'warning: unrecognized mode "%s" (expected single_gpu or distributed); BASE_CMD is not set\n' "${1:-}" >&2
    ;;
esac

# Extra arguments used for the FusedAdam runs.
ADAM_ARGS="--opt-level O2 --keep-batchnorm-fp32 False --fused-adam"

# Option values that the loops below combine. An empty entry means the
# corresponding flag is left off the command line.
keep_batchnorms=(
  ""
  "--keep-batchnorm-fp32 True"
  "--keep-batchnorm-fp32 False"
)

loss_scales=(
  ""
  "--loss-scale 1.0"
  "--loss-scale 128.0"
  "--loss-scale dynamic"
)

opt_levels=(
  "O0"
  "O1"
  "O2"
  "O3"
)

# Delete files in the current directory whose names start with True or False
# (presumably output left by an earlier run - confirm). -f makes this silent
# and successful when nothing matches; "--" guards against odd file names.
rm -f -- True* False*

# From here on, any failing command aborts the script.
set -e

print_banner "Installing Apex with --cuda_ext and --cpp_ext"

# pip installs the project found three directory levels above the current
# working directory, passing both extension build options.
pushd ../../..
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
popd

# Run every opt-level / loss-scale / keep-batchnorm combination, passing
# --has-ext on each command line.
for opt_level in "${opt_levels[@]}"; do
  for loss_scale in "${loss_scales[@]}"; do
    for keep_batchnorm in "${keep_batchnorms[@]}"; do
      # With O1, only the empty keep-batchnorm entry is run.
      if [[ "${opt_level}" == "O1" && -n "${keep_batchnorm}" ]]; then
        print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --has-ext $DATADIR"
      # BASE_CMD, loss_scale and keep_batchnorm each hold several words (or
      # nothing) and rely on word-splitting, so they stay unquoted on purpose.
      set -x
      # shellcheck disable=SC2086
      ${BASE_CMD} --opt-level "${opt_level}" ${loss_scale} ${keep_batchnorm} --has-ext "$DATADIR"
      set +x
    done
  done
done

# Handle FusedAdam separately due to limited support.
# FusedAdam will not be tested for bitwise accuracy against the Python implementation.
# The L0 tests already do so.  These tests are here to ensure that it actually runs,
# and get an idea of performance.
for loss_scale in "${loss_scales[@]}"; do
  print_banner "${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext $DATADIR"
  # BASE_CMD, ADAM_ARGS and loss_scale are multi-word strings that rely on
  # word-splitting, so they stay unquoted on purpose.
  set -x
  # shellcheck disable=SC2086
  ${BASE_CMD} ${ADAM_ARGS} ${loss_scale} --has-ext "$DATADIR"
  set +x
done

print_banner "Reinstalling apex without extensions"

# Same install location as before (three directory levels up), this time
# with no extension build options.
pushd ../../..
pip install -v --no-cache-dir .
popd

# Run the same option combinations again, this time without --has-ext.
for opt_level in "${opt_levels[@]}"; do
  for loss_scale in "${loss_scales[@]}"; do
    for keep_batchnorm in "${keep_batchnorms[@]}"; do
      # With O1, only the empty keep-batchnorm entry is run.
      if [[ "${opt_level}" == "O1" && -n "${keep_batchnorm}" ]]; then
        print_banner "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      print_banner "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} $DATADIR"
      # BASE_CMD, loss_scale and keep_batchnorm each hold several words (or
      # nothing) and rely on word-splitting, so they stay unquoted on purpose.
      set -x
      # shellcheck disable=SC2086
      ${BASE_CMD} --opt-level "${opt_level}" ${loss_scale} ${keep_batchnorm} "$DATADIR"
      set +x
    done
  done
done

print_banner "Checking for bitwise accuracy between Python-only and cpp/cuda extension installs"

# Invoke compare.py once for every option combination that was run above.
for opt_level in "${opt_levels[@]}"; do
  for loss_scale in "${loss_scales[@]}"; do
    for keep_batchnorm in "${keep_batchnorms[@]}"; do
      echo ""
      # With O1, only the empty keep-batchnorm entry was run, so skip the rest.
      if [[ "${opt_level}" == "O1" && -n "${keep_batchnorm}" ]]; then
        echo "Skipping ${opt_level} ${loss_scale} ${keep_batchnorm}"
        continue
      fi
      echo "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} [--has-ext] $DATADIR"
      # loss_scale and keep_batchnorm are multi-word (or empty) strings that
      # rely on word-splitting, so they stay unquoted on purpose.
      set -x
      # shellcheck disable=SC2086
      python compare.py --opt-level "${opt_level}" ${loss_scale} ${keep_batchnorm} --use_baseline
      set +x
    done
  done
done

print_banner "Reinstalling Apex with --cuda_ext and --cpp_ext"

# Finish by installing the project three directory levels up with both
# extension build options again.
pushd ../../..
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
popd