Commit 843cdbe0 (OpenDAS/apex)
Merging in master
Authored Apr 18, 2019 by Michael Carilli
Parents: 724672d7, 28097c99

Changes: 56. Showing 16 changed files with 1117 additions and 116 deletions (+1117, -116).
tests/L0/run_amp/test_multi_tensor_l2norm.py  +80  -0
tests/L0/run_amp/test_multi_tensor_scale.py  +1  -2
tests/L0/run_amp/test_multiple_models_optimizers_losses.py  +762  -0
tests/L0/run_amp/test_scale.py  +0  -93
tests/L0/run_fused_layer_norm/test_fused_layer_norm.py  +41  -0
tests/L0/run_test.py  +1  -1
tests/L1/common/compare.py  +38  -12
tests/L1/common/main_amp.py  +3  -0
tests/L1/common/run_test.sh  +13  -6
tests/L1/cross_product/run.sh  +3  -1
tests/L1/cross_product_distributed/run.sh  +1  -1
tests/distributed/amp_master_params/amp_master_params.py  +70  -0
tests/distributed/amp_master_params/compare.py  +28  -0
tests/distributed/amp_master_params/run.sh  +4  -0
tests/distributed/synced_batchnorm/test_groups.py  +0  -0
tests/docker_extension_builds/run.sh  +72  -0
tests/L0/run_amp/test_multi_tensor_l2norm.py  (new file, 0 → 100644)
import unittest
import functools as ft
import itertools as it

from apex import amp
import torch
from torch import nn
import torch.nn.functional as F

from utils import common_init, HALF, FLOAT, \
    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT

try:
    import amp_C
    from amp_C import multi_tensor_l2norm
    from apex.multi_tensor_apply import MultiTensorApply
    disabled = False
except ImportError as err:
    print("amp_C fused kernels unavailable, disabling TestMultiTensorApply. ImportError was ", err)
    disabled = True


class TestMultiTensorL2Norm(unittest.TestCase):

    def setUp(self):
        common_init(self)
        self.val = 4.0
        self.overflow_buf = torch.cuda.IntTensor(1).zero_()

    def tearDown(self):
        pass

    # The tensor creation here is written for convenience, not speed.
    def l2norm(self, sizea, sizeb, applier, repeat_tensors, in_type):
        self.overflow_buf.zero_()
        a = torch.cuda.FloatTensor(sizea).fill_(self.val)
        b = torch.cuda.FloatTensor(sizeb).fill_(self.val)

        in_list = []
        for i in range(repeat_tensors):
            in_list += [a.clone().to(in_type), b.clone().to(in_type)]

        norm = applier(multi_tensor_l2norm, self.overflow_buf, [in_list])

        reference = torch.cuda.FloatTensor((sizea + sizeb) * repeat_tensors).fill_(self.val).norm()

        self.assertTrue(torch.allclose(norm, reference))
        self.assertTrue(self.overflow_buf.item() == 0)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fuzz(self):
        input_size_pairs = (
            (7777 * 77, 555 * 555),
            (777, 555),
            (555, 2048 * 32 + 1),
            (2048 * 32 + 1, 555),
            (555, 2048 * 32),
            (2048 * 32, 555),
            (33333, 555),
            (555, 33333))
        appliers = (
            MultiTensorApply(2048 * 32),
            MultiTensorApply(333),
            MultiTensorApply(33333))
        repeat_tensors = (1, 55)

        for sizea, sizeb in input_size_pairs:
            for applier in appliers:
                for repeat in repeat_tensors:
                    for in_type in (torch.float32, torch.float16):
                        self.l2norm(sizea, sizeb, applier, repeat, in_type)


if __name__ == '__main__':
    unittest.main()
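For reference, the quantity being checked above is just the L2 norm over every element of every tensor in the list. A minimal plain-PyTorch sketch of that reduction (the helper name reference_l2norm is illustrative, not part of the test):

import torch

def reference_l2norm(tensor_list):
    # Flatten each tensor, concatenate, and take a single L2 norm in fp32:
    # the same value the fused multi_tensor_l2norm kernel is compared against.
    return torch.norm(torch.cat([t.reshape(-1).float() for t in tensor_list]))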
tests/L0/run_amp/test_multi_tensor_scale.py
@@ -24,12 +24,11 @@ except ImportError as err:
 class TestMultiTensorScale(unittest.TestCase):
     def setUp(self):
+        common_init(self)
         self.scale = 4.0
         self.overflow_buf = torch.cuda.IntTensor(1).zero_()
         self.ref = torch.cuda.FloatTensor([1.0])
-        common_init(self)
-
     def tearDown(self):
         pass
...
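For context, multi_tensor_scale is driven through the same MultiTensorApply helper as the l2norm kernel above; a minimal sketch of the call pattern, with illustrative tensor names that are not taken from this diff:

import torch
import amp_C
from apex.multi_tensor_apply import MultiTensorApply

applier = MultiTensorApply(2048 * 32)        # chunk size handed to the fused kernel
overflow_buf = torch.cuda.IntTensor([0])     # set nonzero if an inf/nan is encountered
fp16_grads = [torch.randn(4096, device='cuda', dtype=torch.float16) for _ in range(3)]
fp32_grads = [torch.empty_like(g, dtype=torch.float32) for g in fp16_grads]

# Copy fp16_grads into fp32_grads, multiplying by 1/loss_scale along the way.
applier(amp_C.multi_tensor_scale, overflow_buf, [fp16_grads, fp32_grads], 1.0 / 128.0)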
tests/L0/run_amp/test_multiple_models_optimizers_losses.py  (new file, 0 → 100644)
This diff is collapsed (+762 lines).
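The collapsed test exercises amp with several models, optimizers, and losses at once. As a rough sketch of the public API pattern such a test drives (not the contents of the collapsed file): amp.initialize accepts lists of models and optimizers, and amp.scale_loss takes a loss_id so each loss gets its own scaler.

import torch
from apex import amp

model0 = torch.nn.Linear(16, 16).cuda()
model1 = torch.nn.Linear(16, 16).cuda()
opt0 = torch.optim.SGD(model0.parameters(), lr=1e-3)
opt1 = torch.optim.SGD(model1.parameters(), lr=1e-3)

# Lists in, lists out; num_losses lets amp keep one loss scaler per loss.
[model0, model1], [opt0, opt1] = amp.initialize(
    [model0, model1], [opt0, opt1], opt_level="O1", num_losses=2)

x = torch.randn(8, 16, device='cuda')
for loss_id, (model, opt) in enumerate(((model0, opt0), (model1, opt1))):
    loss = model(x).sum()
    with amp.scale_loss(loss, opt, loss_id=loss_id) as scaled_loss:
        scaled_loss.backward()
    opt.step()
    opt.zero_grad()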
tests/L0/run_amp/test_scale.py  (deleted, 100644 → 0)
import unittest
import functools as ft
import itertools as it

from apex import amp
import torch
from torch import nn
import torch.nn.functional as F

from utils import common_init, HALF, FLOAT, \
    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT

try:
    import amp_C
    scale_check_overflow = amp_C.scale_check_overflow
    disabled = False
except ImportError as err:
    print("amp_C fused kernel unavailable, disabling TestScale. ImportError was ", err)
    disabled = True


class TestScale(unittest.TestCase):

    def setUp(self):
        self.scale = 128.0
        self.nx = 999
        self.ny = 888
        self.overflow_buf = torch.cuda.IntTensor([0])

        self.fp16 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float16)
        self.fp32 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float32)

        self.fp16_ref = torch.ones((1, 1), device='cuda', dtype=torch.float16)
        self.fp32_ref = torch.ones((1, 1), device='cuda', dtype=torch.float32)

        common_init(self)

    def tearDown(self):
        pass

    def downscale_test(self, input, output, ref):
        self.overflow_buf.zero_()
        input.fill_(1.0)
        if input is not output:
            output.fill_(3.0)
        input.mul_(self.scale)
        scale_check_overflow(input, 1. / self.scale, self.overflow_buf, output)
        self.assertTrue(torch.allclose(output, ref))
        self.assertTrue(self.overflow_buf.item() == 0)

    def find_inf_test(self, input, output, ref, x, y, val):
        self.overflow_buf.zero_()
        input.fill_(1.0)
        if input is not output:
            output.fill_(3.0)
        input[x, y] = val
        scale_check_overflow(input, 1. / self.scale, self.overflow_buf, output)
        self.assertTrue(self.overflow_buf.item())

    # Currently, the fused kernel gives a hard error if you attempt to downscale
    # into fp16 output, which imo is the desired behavior. Maybe someday we
    # will learn otherwise.
    # @unittest.skipIf(disabled, "amp_C is unavailable")
    # def test_fp16_to_fp16(self):
    #     self.downscale_test(self.fp16, self.fp16, self.fp16_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp16_to_fp32(self):
        self.downscale_test(self.fp16, self.fp32, self.fp32_ref)

    # @unittest.skipIf(disabled, "amp_C is unavailable")
    # def test_fp32_to_fp16(self):
    #     self.downscale_test(self.fp32, self.fp16, self.fp16_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp32_to_fp32(self):
        self.downscale_test(self.fp32, self.fp32, self.fp32_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp16_to_fp32_find_inf_nan(self):
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, 0, 0, float('nan'))
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny // 2, self.nx // 2, float('inf'))
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny - 1, self.nx - 1, float('nan'))

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp32_to_fp32_find_inf_nan(self):
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, 0, 0, float('inf'))
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny // 2, self.nx // 2, float('nan'))
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny - 1, self.nx - 1, float('inf'))


if __name__ == '__main__':
    unittest.main()
tests/L0/run_fused_layer_norm/test_fused_layer_norm.py  (new file, 0 → 100644)
import unittest
import os
import random

import torch
import apex


class TestFusedLayerNorm(unittest.TestCase):
    def setUp(self):
        self.module = apex.normalization.FusedLayerNorm(normalized_shape=[32, 64], elementwise_affine=False)
        self.input_ = torch.randn(16, 32, 64)
        torch.cuda.manual_seed(42)

    def forward_cpu(self, input_):
        self.module.cpu()
        return self.module(input_.cpu())

    def forward_cuda(self, input_):
        self.module.cuda()
        return self.module(input_.cuda())

    def test_forward_cuda(self):
        out_ = self.forward_cuda(self.input_)
        assert out_.is_cuda == True

    def test_forward_cpu(self):
        out_ = self.forward_cpu(self.input_)
        assert out_.is_cuda == False

    def test_same_output(self):
        out_cpu = self.forward_cpu(self.input_)
        out_cuda = self.forward_cuda(self.input_)
        torch.testing.assert_allclose(out_cpu, out_cuda.cpu())


class TestFusedLayerNormElemWise(TestFusedLayerNorm):
    def setUp(self):
        self.module = apex.normalization.FusedLayerNorm(normalized_shape=[32, 64], elementwise_affine=True)
        self.input_ = torch.randn(16, 32, 64)
        torch.cuda.manual_seed(42)
\ No newline at end of file
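For context, apex.normalization.FusedLayerNorm is intended as a drop-in replacement for torch.nn.LayerNorm; a small usage sketch with arbitrarily chosen shapes:

import torch
import apex

# Normalizes over the trailing [32, 64] dimensions, like torch.nn.LayerNorm([32, 64]).
layer_norm = apex.normalization.FusedLayerNorm([32, 64], elementwise_affine=True).cuda()
x = torch.randn(16, 32, 64, device='cuda')
y = layer_norm(x)
print(y.shape)  # torch.Size([16, 32, 64])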
tests/L0/run_test.py
 import unittest
 import sys
 
-test_dirs = ["run_amp", "run_fp16util", "run_mixed_adam"]
+test_dirs = ["run_amp", "run_fp16util", "run_mixed_adam", "run_fused_layer_norm"]
 
 runner = unittest.TextTestRunner(verbosity=2)
...
tests/L1/common/compare.py
@@ -6,6 +6,7 @@ parser.add_argument('--opt-level', type=str)
 parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
 parser.add_argument('--loss-scale', type=str, default=None)
 parser.add_argument('--fused-adam', action='store_true')
+parser.add_argument('--use_baseline', action='store_true')
 args = parser.parse_args()
 
 base_file = str(args.opt_level) + "_" +\
@@ -15,24 +16,49 @@ base_file = str(args.opt_level) + "_" +\
 file_e = "True_" + base_file
 file_p = "False_" + base_file
+if args.use_baseline:
+    file_b = "baselines/True_" + base_file
 
 dict_e = torch.load(file_e)
 dict_p = torch.load(file_p)
+if args.use_baseline:
+    dict_b = torch.load(file_b)
 
 torch.set_printoptions(precision=10)
 
 print(file_e)
 print(file_p)
+if args.use_baseline:
+    print(file_b)
 
-for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
-    assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
-    loss_e = dict_e["Loss"][n]
-    loss_p = dict_p["Loss"][n]
-    assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
-    print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
-          i_e,
-          loss_e,
-          loss_p,
-          dict_e["Speed"][n],
-          dict_p["Speed"][n]))
+# ugly duplication here...
+if not args.use_baseline:
+    for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
+        assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
+        loss_e = dict_e["Loss"][n]
+        loss_p = dict_p["Loss"][n]
+        assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
+        print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
+              i_e,
+              loss_e,
+              loss_p,
+              dict_e["Speed"][n],
+              dict_p["Speed"][n]))
+else:
+    for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
+        assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
+        loss_e = dict_e["Loss"][n]
+        loss_p = dict_p["Loss"][n]
+        loss_b = dict_b["Loss"][n]
+        assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
+        assert loss_e == loss_b, "Iteration {}, loss_e = {}, loss_b = {}".format(i_e, loss_e, loss_b)
+        print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
+              i_e,
+              loss_b,
+              loss_e,
+              loss_p,
+              dict_b["Speed"][n],
+              dict_e["Speed"][n],
+              dict_p["Speed"][n]))
tests/L1/common/main_amp.py
@@ -365,6 +365,9 @@ def train(train_loader, model, criterion, optimizer, epoch):
         batch_time.update(time.time() - end)
         end = time.time()
 
+        # If you decide to refactor this test, like examples/imagenet, to sample the loss every
+        # print_freq iterations, make sure to move this prefetching below the accuracy calculation.
+
         input, target = prefetcher.next()
 
         if i % args.print_freq == 0 and i > 1:
...
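The comment added above refers to the test's CUDA-stream data prefetcher. A minimal sketch of that kind of prefetcher, assuming a class of this shape (the actual one lives in main_amp.py and differs in detail):

import torch

class SimplePrefetcher:
    def __init__(self, loader):
        self.loader = iter(loader)
        self.stream = torch.cuda.Stream()
        self._preload()

    def _preload(self):
        try:
            self.next_input, self.next_target = next(self.loader)
        except StopIteration:
            self.next_input = self.next_target = None
            return
        # Issue the host-to-device copies on a side stream so they overlap compute.
        with torch.cuda.stream(self.stream):
            self.next_input = self.next_input.cuda(non_blocking=True)
            self.next_target = self.next_target.cuda(non_blocking=True)

    def next(self):
        # Make the default stream wait for the async copies before using the batch,
        # then immediately start copying the following batch.
        torch.cuda.current_stream().wait_stream(self.stream)
        input, target = self.next_input, self.next_target
        self._preload()
        return input, target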
tests/L1/common/run_test.sh
@@ -6,8 +6,15 @@ print_banner() {
 print_banner "Distributed status:  $1"
 
-# DATADIR="/home/mcarilli/Desktop/pt18data/apex/examples/imagenet/bare_metal_train_val/"
-DATADIR="/opt/home/apex/examples/imagenet/"
+echo $2
+DATADIR=$2
+
+if [ -n "$3" ]
+then
+  USE_BASELINE=""
+else
+  USE_BASELINE="--use_baseline"
+fi
 
 if [ "$1" == "single_gpu" ]
 then
@@ -49,7 +56,7 @@ set -e
 print_banner "Installing Apex with --cuda_ext and --cpp_ext"
 
 pushd ../../..
-python setup.py install --cuda_ext --cpp_ext
+pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
 popd
 
 for opt_level in "${opt_levels[@]}"
@@ -86,7 +93,7 @@ done
 print_banner "Reinstalling apex without extensions"
 
 pushd ../../..
-python setup.py install
+pip install -v --no-cache-dir .
 popd
 
 for opt_level in "${opt_levels[@]}"
@@ -124,7 +131,7 @@ do
       fi
       echo "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} [--has-ext] $DATADIR"
       set -x
-      python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm}
+      python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --use_baseline
      set +x
    done
  done
 done
@@ -133,5 +140,5 @@ done
 print_banner "Reinstalling Apex with --cuda_ext and --cpp_ext"
 
 pushd ../../..
-python setup.py install --cuda_ext --cpp_ext
+pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
 popd
tests/L1/cross_product/run.sh
 #!/bin/bash
 
+DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/"
+# DATADIR="/opt/home/apex/examples/imagenet/"
 
 cp ../common/* .
 
-bash run_test.sh single_gpu
+bash run_test.sh single_gpu $1 $DATADIR yes
tests/L1/cross_product_distributed/run.sh
 #!/bin/bash
 
 cp ../common/* .
 
-bash run_test.sh distributed
+bash run_test.sh distributed $1
tests/distributed/amp_master_params/amp_master_params.py  (new file, 0 → 100644)
import torch
import argparse
import os
from apex import amp
# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
# FOR DISTRIBUTED: Parse for the local_rank argument, which will be supplied
# automatically by torch.distributed.launch.
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()

# FOR DISTRIBUTED: If we are running under torch.distributed.launch,
# the 'WORLD_SIZE' environment variable will also be set automatically.
args.distributed = False
if 'WORLD_SIZE' in os.environ:
    args.distributed = int(os.environ['WORLD_SIZE']) > 1

if args.distributed:
    # FOR DISTRIBUTED: Set the device according to local_rank.
    torch.cuda.set_device(args.local_rank)

    # FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will provide
    # environment variables, and requires that you use init_method=`env://`.
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')

    torch.manual_seed(torch.distributed.get_rank())

torch.backends.cudnn.benchmark = True

N, D_in, D_out = 64, 1024, 16

# Each process receives its own batch of "fake input data" and "fake target data."
# The "training loop" in each process just uses this fake batch over and over.
# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
# example of distributed data sampling for both training and validation.
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

if args.distributed:
    # FOR DISTRIBUTED: After amp.initialize, wrap the model with
    # apex.parallel.DistributedDataParallel.
    model = DistributedDataParallel(model)
    # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
    # model = torch.nn.parallel.DistributedDataParallel(model,
    #                                                   device_ids=[args.local_rank],
    #                                                   output_device=args.local_rank)

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

if args.local_rank == 0:
    print("final loss = ", loss)

torch.save(list(model.parameters()), "rank{}model.pth".format(torch.distributed.get_rank()))
torch.save(list(amp.master_params(optimizer)), "rank{}master.pth".format(torch.distributed.get_rank()))
tests/distributed/amp_master_params/compare.py  (new file, 0 → 100644)
import torch

model_params_rank0 = torch.load("rank0model.pth",
                                map_location=lambda storage, loc: storage.cuda(0))
model_params_rank1 = torch.load("rank1model.pth",
                                map_location=lambda storage, loc: storage.cuda(0))
master_params_rank0 = torch.load("rank0master.pth",
                                 map_location=lambda storage, loc: storage.cuda(0))
master_params_rank1 = torch.load("rank1master.pth",
                                 map_location=lambda storage, loc: storage.cuda(0))

for model_rank0, model_rank1, master_rank0, master_rank1 in zip(
        model_params_rank0,
        model_params_rank1,
        master_params_rank0,
        master_params_rank1):
    assert torch.allclose(model_rank0, model_rank1), "Model param mismatch"
    assert torch.allclose(master_rank0, master_rank1), "Master param mismatch"
    # Some debugging/investigation assistance code:
    # maxval, maxind = torch.max(((torch.abs(model_rank0).float())/torch.abs(master_rank0)).view(-1), 0)
    # offending_val_half = model_rank0.view(-1)[maxind.item()]
    # offending_val_float = master_rank0.view(-1)[maxind.item()]
    # print(maxval.item(), maxind.item(), offending_val_half.item(), offending_val_float.item(),
    #       offending_val_float.half().item())
    # rtol needs to be > 2^-11 because of denormals...
    assert torch.allclose(model_rank0, master_rank0.half(), rtol=.005), "Model-master mismatch"

print("OK: Model and master params match across ranks.")
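The rtol=.005 above is wider than fp16 rounding alone would suggest because fp16 carries only 10 mantissa bits: round-tripping an fp32 master value through fp16 can shift it by roughly one part in 2^11, and denormals are worse. A quick numeric illustration:

import torch

x = torch.tensor(0.1, dtype=torch.float32)
rel_err = (x - x.half().float()).abs().item() / x.item()
print(rel_err)     # on the order of 1e-4 for this value
print(2.0 ** -11)  # ~4.9e-4, the worst-case relative rounding step for fp16 normals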
tests/distributed/amp_master_params/run.sh  (new file, 0 → 100644)
#!/bin/bash
python -m torch.distributed.launch --nproc_per_node=2 amp_master_params.py

python compare.py
tests/synced_batchnorm/test_groups.py → tests/distributed/synced_batchnorm/test_groups.py  (file moved)
tests/docker_extension_builds/run.sh  (new file, 0 → 100644)
This diff is collapsed (+72 lines).