OpenDAS / apex / Commits

Commit 85b56d01 (Unverified)
Authored Jan 19, 2021 by Jeff Daily; committed by GitHub, Jan 19, 2021

Merge pull request #43 from ROCmSoftwarePlatform/IFU-2021-01-18

IFU-2021-01-18

Parents: d061bf20, 13c8d152
Changes: 31 files in this commit; this page shows 11 changed files with 494 additions and 270 deletions (+494 −270).

    apex/parallel/optimized_sync_batchnorm_kernel.py    +18   −18
    csrc/multi_tensor_apply.cuh                          +10   −7
    csrc/multi_tensor_l2norm_kernel.cu                   +3    −2
    csrc/multi_tensor_sgd_kernel.cu                      +2    −0
    csrc/welford.cu                                      +7    −3
    examples/imagenet/main_amp.py                        +2    −1
    setup.py                                             +116  −73
    tests/L0/run_optimizers/test_adagrad.py              +0    −134
    tests/L0/run_optimizers/test_dist_adam.py            +183  −0
    tests/L0/run_optimizers/test_fused_optimizer.py      +131  −22
    tests/L0/run_optimizers/test_lamb.py                 +22   −10
apex/parallel/optimized_sync_batchnorm_kernel.py  (+18 −18)

@@ -21,33 +21,31 @@ class SyncBatchnormFunction(Function):
         if channel_last:
             count = int(input.numel()/input.size(-1))
             mean, var_biased = syncbn.welford_mean_var_c_last(input)
+            num_channels = input.size(-1)
         else:
             count = int(input.numel()/input.size(1))
             mean, var_biased = syncbn.welford_mean_var(input)
+            num_channels = input.size(1)

         if torch.distributed.is_initialized():
             if not process_group:
                 process_group = torch.distributed.group.WORLD
             device = mean.device
             world_size = torch.distributed.get_world_size(process_group)
-            mean_all = torch.empty(world_size, mean.size(0), dtype=mean.dtype, device=device)
-            var_all = torch.empty(world_size, var_biased.size(0), dtype=var_biased.dtype, device=device)
-            count_all = torch.cuda.IntTensor(world_size, device=device)
-            mean_l = [mean_all.narrow(0, i, 1) for i in range(world_size)]
-            var_l = [var_all.narrow(0, i, 1) for i in range(world_size)]
-            count_l = [count_all.narrow(0, i, 1) for i in range(world_size)]
-            torch.distributed.all_gather(mean_l, mean, process_group)
-            torch.distributed.all_gather(var_l, var_biased, process_group)
-            torch.distributed.all_gather(
-                count_l, torch.cuda.IntTensor([count], device=device), process_group)
-            mean, var, inv_std = syncbn.welford_parallel(mean_all, var_all, count_all, eps)
+            count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count)
+            combined = torch.cat([mean.view(-1), var_biased.view(-1), count_t], dim=0)
+            combined_list = [torch.empty_like(combined) for k in range(world_size)]
+            torch.distributed.all_gather(combined_list, combined, process_group)
+            combined = torch.stack(combined_list, dim=0)
+            mean_all, invstd_all, count_all = torch.split(combined, num_channels, dim=1)
+            count_all = count_all.view(-1)
+            mean, var, inv_std = syncbn.welford_parallel(mean_all, invstd_all, count_all.to(torch.int32), eps)
         else:
             device = mean.device
             count_all = torch.cuda.IntTensor([count], device=device)
             inv_std = 1.0 / torch.sqrt(var_biased + eps)
             var = var_biased * (count) / (count-1)

         if count == 1 and world_size < 2:
             raise ValueError('Expected more than 1 value per channel when training, got input size{}'.format(input.size()))

@@ -60,7 +58,7 @@ class SyncBatchnormFunction(Function):
             mean = running_mean.data
             inv_std = 1.0 / torch.sqrt(running_variance.data + eps)

-        ctx.save_for_backward(input, weight, mean, inv_std, z, bias, count_all)
+        ctx.save_for_backward(input, weight, mean, inv_std, z, bias, count_all.to(torch.int32))
         ctx.process_group = process_group
         ctx.channel_last = channel_last
         ctx.world_size = world_size

@@ -101,10 +99,12 @@ class SyncBatchnormFunction(Function):
         if ctx.needs_input_grad[0]:

             if torch.distributed.is_initialized():
+                num_channels = sum_dy.shape[0]
+                combined = torch.cat([sum_dy, sum_dy_xmu], dim=0)
                 torch.distributed.all_reduce(
-                    sum_dy, ReduceOp.SUM, process_group)
-                torch.distributed.all_reduce(
-                    sum_dy_xmu, ReduceOp.SUM, process_group)
+                    combined, torch.distributed.ReduceOp.SUM, process_group, async_op=False)
+                sum_dy, sum_dy_xmu = torch.split(combined, num_channels)

             if channel_last:
                 grad_input = syncbn.batchnorm_backward_c_last(grad_output, saved_input, mean, inv_std, weight, sum_dy, sum_dy_xmu, count)
             else:
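The forward-pass change above folds the three separate all_gather calls (per-channel mean, biased variance, and element count) into a single all_gather over one concatenated tensor, and the backward pass does the same for sum_dy/sum_dy_xmu with one fused all_reduce. Below is a minimal standalone sketch of that gather-fusion pattern; it assumes torch.distributed is already initialized, and the function and variable names are illustrative rather than taken from the file.

import torch
import torch.distributed as dist

def fused_gather_stats(mean, var_biased, count, process_group=None):
    # Pack the per-rank statistics into one flat tensor so a single all_gather
    # moves everything at once instead of one collective per quantity.
    num_channels = mean.numel()
    count_t = torch.empty(1, dtype=mean.dtype, device=mean.device).fill_(count)
    combined = torch.cat([mean.view(-1), var_biased.view(-1), count_t], dim=0)

    world_size = dist.get_world_size(process_group)
    gathered = [torch.empty_like(combined) for _ in range(world_size)]
    dist.all_gather(gathered, combined, process_group)

    # Unpack back into per-rank rows: [world_size, num_channels] for each
    # statistic plus a [world_size] vector of element counts.
    combined = torch.stack(gathered, dim=0)
    mean_all, var_all, count_all = torch.split(combined, num_channels, dim=1)
    return mean_all, var_all, count_all.view(-1)

Fusing the collectives trades a small amount of packing work for fewer communication launches per SyncBatchNorm layer.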
csrc/multi_tensor_apply.cuh  (+10 −7)

@@ -2,6 +2,7 @@
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
 #include <THC/THC.h>
 #include "compat.h"

@@ -35,7 +36,7 @@ __global__ void multi_tensor_apply_kernel(
   ArgTypes... args)
 {
   // Hand the chunk information to the user-supplied functor to process however it likes.
   callable(chunk_size, noop_flag, tl, args...);
 }

 template<int depth, typename T, typename... ArgTypes>

@@ -50,8 +51,9 @@ void multi_tensor_apply(
   TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth");
   int len0 = tensor_lists[0].size();
   TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0");
+  auto ref_device = tensor_lists[0][0].device();
+  TORCH_CHECK(ref_device.type() == at::kCUDA, "expected input to be on cuda");
   for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices
   {
     TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists");
     for (int t = 0; t < tensor_lists[l].size(); t++)

@@ -62,7 +64,7 @@ void multi_tensor_apply(
       contiguous_memory = (contiguous_memory || tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast));
 #endif
       TORCH_CHECK(contiguous_memory, "A tensor was not contiguous.");
-      TORCH_CHECK(tensor_lists[l][t].is_cuda(), "A tensor was not cuda.");
+      TORCH_CHECK(tensor_lists[l][t].device() == ref_device, "A tensor was not on the same device as the first tensor");
       TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch");
     }
   }

@@ -71,8 +73,9 @@ void multi_tensor_apply(
   TensorListMetadata<depth> tl;

+  const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0]));
   auto stream = at::cuda::getCurrentCUDAStream();

   tl.start_tensor_this_launch = 0;
   int loc_block_info = 0;
   int loc_tensor_info = 0;

@@ -98,7 +101,7 @@ void multi_tensor_apply(
       tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1;
       tl.block_to_chunk[loc_block_info] = chunk;
       loc_block_info++;

       bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth-1] &&
                            chunk == chunks_this_tensor - 1);
       bool blocks_full = (loc_block_info == depth_to_max_blocks[depth-1]);

@@ -124,7 +127,7 @@ void multi_tensor_apply(
         if (chunk == chunks_this_tensor - 1)
         {
           // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
           loc_tensor_info = 0;
           tl.start_tensor_this_launch = t + 1;
         }
         else
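These changes make multi_tensor_apply safe to call when the input tensors live on a GPU other than the current one: every tensor must share the device of the first tensor, and an OptionalCUDAGuard switches to that device before launching the kernel. The sketch below exercises this from Python through multi_tensor_applier, mirroring the new test_multi_device tests; it assumes a machine with at least two GPUs and an apex build that includes the amp_C extension.

import torch
import amp_C
from apex.multi_tensor_apply import multi_tensor_applier

# Tensors live on cuda:1 while the current device is cuda:0; the device guard
# added in multi_tensor_apply.cuh launches the kernel on cuda:1 anyway.
overflow_buf = torch.zeros(1, dtype=torch.int, device="cuda:1")
grads = [torch.randn(1024, device="cuda:1") for _ in range(4)]

with torch.cuda.device("cuda:0"):
    norm, _ = multi_tensor_applier(amp_C.multi_tensor_l2norm,
                                   overflow_buf, [grads], False)
print(norm.item())

# Mixing devices inside one call now fails fast with
# "A tensor was not on the same device as the first tensor".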
csrc/multi_tensor_l2norm_kernel.cu  (+3 −2)

@@ -2,6 +2,7 @@
 #include <ATen/AccumulateType.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
 // Another possibility:
 // #include <torch/all.h>

@@ -343,13 +344,13 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
       max_chunks_per_tensor);)

   AT_CUDA_CHECK(cudaGetLastError());
   // AT_CUDA_CHECK(cudaDeviceSynchronize());

   // This involves one more small kernel launches, but will be negligible end to end.
   // I could get rid of these by hacking the functor + multi tensor harness with persistence
   // logic, but keeping it simple for now
   auto ret = at::empty({1}, output.options());
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
   auto stream = at::cuda::getCurrentCUDAStream();
   cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
     output.DATA_PTR<float>(),

@@ -377,7 +378,7 @@ void multi_tensor_norm_out_cuda(
   const int norm_type)
 {
   auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
+  TORCH_CHECK(tensor_lists[0][0].device() == noop_flag.device(), "noop flag should be on the same device as tensors");
   // we don't need global thus uses empty here
   auto output = at::empty({320}, float_options);
csrc/multi_tensor_sgd_kernel.cu  (+2 −0)

@@ -160,6 +160,8 @@ void multi_tensor_sgd_cuda(
     TORCH_CHECK(tensor_lists[3][i].scalar_type() == at::ScalarType::Half,
                 "Additional output tensors should always be fp16.");

+  TORCH_CHECK(noop_flag.device() == tensor_lists[0][0].device(), "expected noop flag to be on the same device as tensors");
+
   // We have 3 possibilities to handle here, in terms of
   // grad_type, param_type, momentum_type, requires_fp16_copy
   // 1. fp16, fp16, fp16, No
csrc/welford.cu  (+7 −3)

@@ -1164,6 +1164,10 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
   at::Tensor inv_std = at::empty_like(out_var);
   at::Tensor out_mean = at::empty_like(out_var);

+  at::Tensor mean_feature_nodes_ = mean_feature_nodes.contiguous();
+  at::Tensor var_biased_ = var_biased.contiguous();
+  at::Tensor numel_ = numel.contiguous();
+
   // TODO(jie): tile this for memory coalescing!
   const int block = std::min(h_last_pow2(feature_size), MAX_BLOCK_SIZE);
   const int grid = std::max<int>(1, feature_size / block);

@@ -1174,9 +1178,9 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
     using namespace at;
     DISPATCH_FLOAT_AND_HALF(mean_feature_nodes.scalar_type(), 0, "welford_parallel_kernel",
       welford_kernel_parallel<scalar_t_0><<<grid, block, 0, stream>>>(
-        mean_feature_nodes.DATA_PTR<scalar_t_0>(),
-        var_biased.DATA_PTR<scalar_t_0>(),
-        numel.DATA_PTR<int>(),
+        mean_feature_nodes_.DATA_PTR<scalar_t_0>(),
+        var_biased_.DATA_PTR<scalar_t_0>(),
+        numel_.DATA_PTR<int>(),
         out_mean.DATA_PTR<scalar_t_0>(),
         out_var.DATA_PTR<scalar_t_0>(),
         inv_std.DATA_PTR<scalar_t_0>(),
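welford_parallel_CUDA now makes contiguous copies of its inputs before handing raw DATA_PTR pointers to the kernel, since a flat pointer walk is only valid for a dense, row-major layout. A short plain-PyTorch illustration of the hazard this guards against (not the apex kernel itself):

import torch

x = torch.arange(12, dtype=torch.float).reshape(3, 4)
xt = x.t()                  # transposed view: same storage, swapped strides
print(xt.is_contiguous())   # False -- a flat walk of xt.data_ptr() would see
                            # the original row-major order, not xt's layout
xc = xt.contiguous()        # materializes a dense copy in xt's logical order
print(xc.is_contiguous())   # True -- safe to hand to a kernel as a flat buffer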
examples/imagenet/main_amp.py  (+2 −1)

@@ -182,6 +182,7 @@ def main():
             print("=> loading checkpoint '{}'".format(args.resume))
             checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu))
             args.start_epoch = checkpoint['epoch']
+            global best_prec1
             best_prec1 = checkpoint['best_prec1']
             model.load_state_dict(checkpoint['state_dict'])
             optimizer.load_state_dict(checkpoint['optimizer'])

@@ -527,7 +528,7 @@ def accuracy(output, target, topk=(1,)):
     res = []
     for k in topk:
-        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
+        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
         res.append(correct_k.mul_(100.0 / batch_size))
     return res
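The accuracy() fix replaces .view(-1) with .reshape(-1): in recent PyTorch releases the correct[:k] slice can inherit a non-contiguous layout from the transposed prediction tensor, and .view refuses to reinterpret incompatible strides while .reshape falls back to a copy. A minimal reproduction of the difference, using illustrative tensors rather than the ImageNet example's:

import torch

a = torch.arange(6).reshape(2, 3).t()   # non-contiguous view (transposed)
try:
    a.view(-1)                          # view cannot reinterpret these strides
except RuntimeError as e:
    print("view failed:", e)
b = a.reshape(-1)                       # reshape silently copies when needed
print(b)                                # tensor([0, 3, 1, 4, 2, 5])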
setup.py  (+116 −73)

This diff is collapsed and not shown on this page.
tests/L0/run_optimizers/test_adagrad.py  (deleted, 100644 → 0; previous content as of d061bf20)  (+0 −134)

import unittest

import apex
import torch
from apex.testing.common_utils import skipIfRocm


class TestFusedAdagrad(unittest.TestCase):
    def setUp(self, max_abs_diff=1e-6, max_rel_diff=1, iters=7):
        self.max_abs_diff = max_abs_diff
        self.max_rel_diff = max_rel_diff
        self.iters = iters
        torch.cuda.manual_seed(9876)

    def tearDown(self):
        pass

    def gen_param_optim(self, tensors, adagrad_option, apex_only=False):
        ref_param = []
        tst_param = []
        for tensor in tensors:
            if apex_only:
                ref_param.append(torch.nn.Parameter(tensor.clone().float()))
            else:
                ref_param.append(torch.nn.Parameter(tensor.clone()))
            tst_param.append(torch.nn.Parameter(tensor.clone()))
        if apex_only:
            ref_optim = apex.optimizers.FusedAdagrad(ref_param, **adagrad_option)
        else:
            ref_optim = torch.optim.Adagrad(ref_param, **adagrad_option)
        tst_optim = apex.optimizers.FusedAdagrad(tst_param, **adagrad_option)

        return (ref_param, tst_param, ref_optim, tst_optim)

    def gen_grad(self, ref_param, tst_param, apex_only=False):
        for p_ref, p_tst in zip(ref_param, tst_param):
            p_tst.grad = torch.rand_like(p_tst)
            p_ref.grad = p_tst.grad.detach().float() if apex_only else p_tst.grad

    def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
        half_grads = []
        for p_ref, _ in zip(ref_param, tst_param):
            half_grads.append(torch.rand_like(p_ref).half())
            p_ref.grad = half_grads[-1].float() / scale
        return half_grads

    def get_max_diff(self, ref_param, tst_param, apex_only=False):
        max_abs_diff = max_rel_diff = 0
        for p_ref, p_tst in zip(ref_param, tst_param):
            if apex_only:
                p_tst = p_tst.float()
            max_abs_diff_p = (p_ref - p_tst).abs().max().item()
            max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()

            if max_abs_diff_p > max_abs_diff:
                max_abs_diff = max_abs_diff_p
            if max_rel_diff_p > max_rel_diff:
                max_rel_diff = max_rel_diff_p

        return max_abs_diff, max_rel_diff

    def gen_single_type_test(self, param_type=torch.float, apex_only=False):
        nelem = 278011
        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}

        tensor = torch.rand(nelem, dtype=param_type, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            [tensor], adagrad_option, apex_only=apex_only
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param, apex_only=apex_only)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param, apex_only=apex_only)

            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            if not apex_only:
                self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @skipIfRocm
    def test_float(self):
        self.gen_single_type_test(param_type=torch.float)

    @unittest.skip("PyTorch optimizer is not numerically correct for fp16")
    def test_half(self):
        self.gen_single_type_test(param_type=torch.float16)

    # Compares bfloat16 computation against float32 as gold standard.
    # Uses apex optimizers(controlled by apex_only flag) for both types.
    # Doesn't use upstream optimizer like other tests as they seem to be
    # numerically unstable for half types(see skip note for test above).
    @skipIfRocm
    def test_bfloat16(self):
        self.max_abs_diff = 1e-2
        self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)

    @skipIfRocm
    def test_multi_params(self):
        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}

        tensors = []
        for size in sizes:
            tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            tensors, adagrad_option
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_adagrad_option(self):
        nelem = 1
        adagrad_option = {"lr": 0.01, "eps": 3e-06, "weight_decay": 0}

        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
            [tensor], adagrad_option
        )

        for _ in range(self.iters):
            self.gen_grad(ref_param, tst_param)
            ref_optim.step()
            tst_optim.step()
            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)

            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
tests/L0/run_optimizers/test_dist_adam.py  (new file, 0 → 100644)  (+183 −0)

import argparse
import random
import sys

import torch
from torch.nn.parallel import DistributedDataParallel as DDP

from apex import amp
from apex.optimizers import FusedAdam
from apex.contrib.optimizers.distributed_fused_adam import DistributedFusedAdam


class TestModel(torch.nn.Module):
    def __init__(self, args):
        super(TestModel, self).__init__()

        self.linear = torch.nn.Sequential(*[torch.nn.Linear(args.dim, args.dim, bias=args.bias) for _ in range(args.layers)])

    def forward(self, x):
        return self.linear(x)


def setup(args):
    ## Model
    ref_model = TestModel(args).cuda()
    dist_model = TestModel(args).cuda()

    # Same weights
    with torch.no_grad():
        for dp, rp in zip(dist_model.parameters(), ref_model.parameters()):
            dp.data.copy_(rp.data)

    dist_model = dist_model.half()

    ## Optimizer
    # same hyperparameters
    ref_opt_args = { 'lr': 1e-3, 'eps': 1e-6, 'weight_decay': 0.01 }
    ref_opt = FusedAdam(ref_model.parameters(), **ref_opt_args)

    dist_opt_args = ref_opt_args.copy()
    dist_opt_args.update( {'overlap_reductions' : False} )
    dist_opt_args.update( {'process_group_size' : args.n_gpu} )
    dist_opt_args.update( {'dwu_group_size' : args.dwu_group_size} )
    dist_opt_args.update( {'dwu_num_blocks' : 1} )
    dist_opt_args.update( {'dwu_num_chunks' : 1} )
    dist_opt = DistributedFusedAdam(dist_model.parameters(), **dist_opt_args)
    dist_opt.set_global_scale(1.)

    ## amp-init
    amp_args = { 'loss_scale' : 'dynamic', 'opt_level' : 'O2' }
    ref_model, ref_opt = amp.initialize(ref_model, ref_opt, **amp_args)

    ## DDP
    ref_model = DDP(ref_model, device_ids=[args.rank])
    with torch.no_grad():
        for dp in dist_model.parameters():
            torch.distributed.broadcast(dp.data, src=0)
        for rp in ref_model.parameters():
            torch.distributed.broadcast(rp.data, src=0)
    torch.cuda.synchronize()
    torch.distributed.barrier()
    if get_rank() == 0:
        print(f'dist opt with {args.n_gpu} GPUs')

    return ref_model, ref_opt, dist_model, dist_opt


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=-1)
    parser.add_argument('--steps', type=int, default=20)
    parser.add_argument('--batch', type=int, default=32)
    parser.add_argument('--dim', type=int, default=4)
    parser.add_argument('--layers', type=int, default=2)
    parser.add_argument('--bias', action='store_true')
    parser.add_argument('--atol', type=float, default=1e-3)
    parser.add_argument('--rtol', type=float, default=1)
    parser.add_argument('--dwu_group_size', type=float, default=1)

    args = parser.parse_args()

    return args


def setup_env(args):
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    args.rank = torch.distributed.get_rank()
    args.n_gpu = torch.distributed.get_world_size()

    seed = 42 + get_rank()

    random.seed(seed)
    torch.manual_seed(seed)

    return args


def get_rank():
    return torch.distributed.get_rank()


def main():
    args = parse_args()
    args = setup_env(args)
    tol_args = { 'atol' : args.atol, 'rtol' : args.rtol }

    torch.set_printoptions(precision=16)

    ref_model, ref_opt, dist_model, dist_opt = setup(args)

    # lazy_init not called yet, initialize stash
    stash = ref_opt._amp_stash
    stash.all_fp16_params, stash.all_fp32_from_fp16_params = [], []

    # make sure everything from _first_step_init_ is ready before training
    # e.g. registering allreduce_hook
    # so that gradients are copied/reduced when necessary
    dist_opt._init_everything()

    for i in range(args.steps):
        x_ref = torch.randn(args.batch, args.dim, dtype=torch.half).cuda().requires_grad_(True)
        x_dist = x_ref.clone().detach().requires_grad_(True)

        if get_rank() == 0:
            print(f'[{i}] Checking input')
        #print("x_ref:", x_ref.flatten()[:10])
        #print("x_dist:", x_dist.flatten()[:10])
        assert(torch.allclose(x_ref, x_dist, **tol_args))

        y_ref = ref_model(x_ref).half()
        y_dist = dist_model(x_dist)
        if get_rank() == 0:
            print(f'[{i}] Checking output')
        #print("y_ref:", y_ref.flatten()[:10])
        #print("y_dist:", y_dist.flatten()[:10])
        assert(torch.allclose(y_ref, y_dist, **tol_args))

        dy = torch.randn_like(y_ref)

        y_ref.backward(dy)
        y_dist.backward(dy)

        if get_rank() == 0:
            print(f'[{i}] Checking gradients')
        torch.distributed.barrier()
        torch.cuda.synchronize()
        assert(torch.allclose(x_ref.grad, x_dist.grad, **tol_args))

        # gradient all-reduce within distributed optimizer
        dist_opt.complete_reductions()

        if get_rank() == 0:
            print(f'[{i}] Stepping')
        ref_opt.step()
        dist_opt.step()
        torch.cuda.synchronize()
        torch.distributed.barrier()

        print('Checking new weights')
        if get_rank() == 0:
            print("ref param:", ref_model.module.linear[0].weight)
            print("dist param:", dist_model.linear[0].weight)

        for i, (rp, dp) in enumerate(zip(ref_model.parameters(), dist_model.parameters())):
            if not torch.allclose(rp, dp, **tol_args):
                if get_rank() == 0:
                    print(f'Rank: {get_rank()}, Param: {i}')
                    print(f'ref: {rp.sum().item()}, dist: {dp.sum().item()}')
                    print(rp)
                    print(dp)
                    print(torch.abs(rp - dp) > tol_args['atol'])
                sys.exit(0)

        # zero grads
        for rp, dp in zip(ref_model.parameters(), dist_model.parameters()):
            rp.grad = None
            dp.grad = None


if __name__ == "__main__":
    main()
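test_dist_adam.py is a standalone script rather than a unittest case: it expects to be spawned once per GPU with --local_rank set and the usual torch.distributed environment variables present, then initializes NCCL via init_process_group(init_method='env://'). The commit does not show how the script is launched, so the sketch below only lists the per-process environment that the 'env://' init method reads, with illustrative values for a single-node, two-GPU run.

import os

# A launcher such as torch.distributed.launch normally sets these (plus the
# --local_rank argument) for each worker process; the values are examples.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "2")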
tests/L0/run_optimizers/test_adam.py → tests/L0/run_optimizers/test_fused_optimizer.py  (renamed)  (+131 −22)

@@ -4,10 +4,11 @@ import random
 import torch
 import apex
+from itertools import product
 from apex.testing.common_utils import skipIfRocm

-class TestFusedAdam(unittest.TestCase):
+class TestFusedOptimizer(unittest.TestCase):
     def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
         self.max_abs_diff = max_abs_diff
         self.max_rel_diff = max_rel_diff

@@ -17,7 +18,7 @@ class TestFusedAdam(unittest.TestCase):
     def tearDown(self):
         pass

-    def gen_param_optim(self, tensors, adam_option, apex_only=False):
+    def gen_param_optim(self, tensors, options, apex_only=False):
         ref_param = []
         tst_param = []
         for tensor in tensors:

@@ -28,10 +29,10 @@ class TestFusedAdam(unittest.TestCase):
             tst_param.append(torch.nn.Parameter(tensor.clone()))

         if apex_only:
-            ref_optim = apex.optimizers.FusedAdam(ref_param, **adam_option)
+            ref_optim = self.fused_optim(ref_param, **options)
         else:
-            ref_optim = torch.optim.Adam(ref_param, **adam_option)
-        tst_optim = apex.optimizers.FusedAdam(tst_param, **adam_option)
+            ref_optim = self.ref_optim(ref_param, **options)
+        tst_optim = self.fused_optim(tst_param, **options)

         return (ref_param, tst_param, ref_optim, tst_optim)

@@ -60,25 +61,32 @@ class TestFusedAdam(unittest.TestCase):
         return max_abs_diff, max_rel_diff

-    def gen_single_type_test(self, param_type=torch.float, apex_only=False):
+    def gen_single_type_test(self, param_type=torch.float, apex_only=False, device='cuda'):
         nelem = 278011
-        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':0, 'amsgrad':False}

-        tensor = torch.rand(nelem, dtype=param_type, device='cuda')
+        tensor = torch.rand(nelem, dtype=param_type, device=device)
         ref_param, tst_param, ref_optim, tst_optim = \
-            self.gen_param_optim([tensor], adam_option, apex_only=apex_only)
+            self.gen_param_optim([tensor], self.options, apex_only=apex_only)

         for i in range(self.iters):
             self.gen_grad(ref_param, tst_param, apex_only=apex_only)
             ref_optim.step()
             tst_optim.step()
             max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param, apex_only=apex_only)
             self.assertLessEqual(max_abs_diff, self.max_abs_diff)
             if not apex_only:
                 self.assertLessEqual(max_rel_diff, self.max_rel_diff)

+
+class TestFusedAdam(TestFusedOptimizer):
+
+    def __init__(self, *args, **kwargs):
+        super(TestFusedAdam, self).__init__(*args, **kwargs)
+        self.options = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
+            'weight_decay':0, 'amsgrad':False}
+        self.ref_optim = torch.optim.Adam
+        self.fused_optim = apex.optimizers.FusedAdam
+
     @skipIfRocm
     def test_float(self):
         self.gen_single_type_test(param_type=torch.float)

@@ -95,17 +103,23 @@ class TestFusedAdam(unittest.TestCase):
         self.max_abs_diff = 1e-2
         self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)

+    @skipIfRocm
+    @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
+    def test_multi_device(self):
+        devices = ("cuda:0", "cuda:1")
+        for current_dev, tensor_dev in product(devices, devices):
+            with torch.cuda.device(current_dev):
+                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
+
     @unittest.skip('Disable until 8/1/2019 adam/adamw upstream picked')
     def test_multi_params(self):
         sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
-        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':0, 'amsgrad':False}

         tensors = []
         for size in sizes:
             tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
         ref_param, tst_param, ref_optim, tst_optim = \
-            self.gen_param_optim(tensors, adam_option)
+            self.gen_param_optim(tensors, self.options)

         for i in range(self.iters):
             self.gen_grad(ref_param, tst_param)

@@ -118,12 +132,9 @@ class TestFusedAdam(unittest.TestCase):
     @unittest.skip('No longer support fuse scaling')
     def test_scale(self):
         nelem = 278011
-        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':0, 'amsgrad':False}

         tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
         ref_param, tst_param, ref_optim, tst_optim = \
-            self.gen_param_optim([tensor], adam_option)
+            self.gen_param_optim([tensor], self.options)

         for i in range(self.iters):
             scale = random.random() * 1000

@@ -138,12 +149,10 @@ class TestFusedAdam(unittest.TestCase):
     @unittest.skip('No longer support output fp16 param')
     def test_fp16_output(self):
         nelem = 278011
-        adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':0, 'amsgrad':False}

         tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
         ref_param, tst_param, ref_optim, tst_optim = \
-            self.gen_param_optim([tensor], adam_option)
+            self.gen_param_optim([tensor], self.options)

         fp16_param = torch.nn.Parameter(tensor.clone().half())

@@ -180,6 +189,106 @@ class TestFusedAdam(unittest.TestCase):
         self.assertLessEqual(max_rel_diff, self.max_rel_diff)

+
+class TestFusedAdagrad(TestFusedOptimizer):
+    def __init__(self, *args, **kwargs):
+        super(TestFusedAdagrad, self).__init__(*args, **kwargs)
+        self.options = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}
+        self.ref_optim = torch.optim.Adagrad
+        self.fused_optim = apex.optimizers.FusedAdagrad
+
+    @skipIfRocm
+    def test_float(self):
+        self.gen_single_type_test(param_type=torch.float)
+
+    @unittest.skip("PyTorch optimizer is not numerically correct for fp16")
+    def test_half(self):
+        self.gen_single_type_test(param_type=torch.float16)
+
+    @skipIfRocm
+    @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
+    def test_multi_device(self):
+        devices = ("cuda:0", "cuda:1")
+        for current_dev, tensor_dev in product(devices, devices):
+            with torch.cuda.device(current_dev):
+                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
+
+    @skipIfRocm
+    def test_multi_params(self):
+        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
+        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
+
+        tensors = []
+        for size in sizes:
+            tensors.append(torch.rand(size, dtype=torch.float, device="cuda"))
+        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
+            tensors, adagrad_option
+        )
+
+        for _ in range(self.iters):
+            self.gen_grad(ref_param, tst_param)
+            ref_optim.step()
+            tst_optim.step()
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+    @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
+    def test_multi_params_different_devices_throws(self):
+        sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
+        adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
+
+        tensors = []
+        for i, size in enumerate(sizes):
+            tensors.append(torch.rand(size, dtype=torch.float, device="cuda:"+str(i % 2)))
+        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
+            tensors, adagrad_option
+        )
+        self.gen_grad(ref_param, tst_param)
+        with self.assertRaisesRegex(RuntimeError, "not on the same device"):
+            tst_optim.step()
+
+    def test_adagrad_option(self):
+        nelem = 1
+        adagrad_option = {"lr": 0.01, "eps": 3e-06, "weight_decay": 0}
+
+        tensor = torch.rand(nelem, dtype=torch.float, device="cuda")
+        ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
+            [tensor], adagrad_option
+        )
+
+        for _ in range(self.iters):
+            self.gen_grad(ref_param, tst_param)
+            ref_optim.step()
+            tst_optim.step()
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+
+            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
+            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+
+
+class TestFusedSGD(TestFusedOptimizer):
+    def __init__(self, *args, **kwargs):
+        super(TestFusedSGD, self).__init__(*args, **kwargs)
+        self.options = {"lr": .25, "momentum": .125}
+        self.ref_optim = torch.optim.SGD
+        self.fused_optim = apex.optimizers.FusedSGD
+
+    def test_float(self):
+        self.gen_single_type_test(param_type=torch.float)
+
+    def test_half(self):
+        self.gen_single_type_test(param_type=torch.float16)
+
+    @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
+    def test_multi_device(self):
+        devices = ("cuda:0", "cuda:1")
+        for current_dev, tensor_dev in product(devices, devices):
+            with torch.cuda.device(current_dev):
+                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
+
 if __name__ == '__main__':
     script_path = os.path.dirname(os.path.realpath(__file__))
     unittest.main()
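The rename turns the old Adam-only test file into a shared harness: TestFusedOptimizer owns the compare-two-optimizers machinery, and each subclass (TestFusedAdam, TestFusedAdagrad, TestFusedSGD) only supplies self.options, self.ref_optim and self.fused_optim. The standalone sketch below shows what that harness does for the SGD pair, using the same settings as TestFusedSGD; it assumes apex is installed with its CUDA extensions and a GPU is available.

import torch
import apex

# Run the upstream optimizer and the apex fused optimizer on identical
# parameters and gradients, then compare the results, as the harness does.
options = {"lr": 0.25, "momentum": 0.125}                # TestFusedSGD settings
tensor = torch.rand(1024, device="cuda")

ref_p = torch.nn.Parameter(tensor.clone())
tst_p = torch.nn.Parameter(tensor.clone())
ref_opt = torch.optim.SGD([ref_p], **options)            # plays self.ref_optim
tst_opt = apex.optimizers.FusedSGD([tst_p], **options)   # plays self.fused_optim

for _ in range(7):
    tst_p.grad = torch.rand_like(tst_p)
    ref_p.grad = tst_p.grad
    ref_opt.step()
    tst_opt.step()

print((ref_p - tst_p).abs().max().item())                # expected to stay small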
tests/L0/run_optimizers/test_lamb.py  (+22 −10)

@@ -6,6 +6,7 @@ from torch.optim import Optimizer
 import apex
 from apex.multi_tensor_apply import multi_tensor_applier
 from apex.testing.common_utils import skipIfRocm
+from itertools import product

 class RefLAMB(Optimizer):
     r"""Implements Lamb algorithm.

@@ -41,7 +42,7 @@ class RefLAMB(Optimizer):
             import amp_C
             self.multi_tensor_l2norm=amp_C.multi_tensor_l2norm
             # Skip buffer
-            self._dummy_overflow_buf = torch.cuda.IntTensor([0])
+            self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device=self.param_groups[0]["params"][0].device)
             self.multi_tensor_lamb = amp_C.multi_tensor_lamb
         else:
             raise RuntimeError('apex.optimizers.FusedLAMB requires cuda extensions')

@@ -69,7 +70,8 @@ class RefLAMB(Optimizer):
         else:
             raise RuntimeError('FusedLAMB only support fp16 and fp32.')

-        g_norm_32, g_norm_16 = torch.zeros(1, device='cuda'), torch.zeros(1, device='cuda')
+        device = self.param_groups[0]["params"][0].device
+        g_norm_32, g_norm_16 = torch.zeros(1, device=device), torch.zeros(1, device=device)
         # compute grad norm for two lists
         if len(g_all_32) > 0:
             g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm,

@@ -85,7 +87,7 @@ class RefLAMB(Optimizer):
                                              self._dummy_overflow_buf,
                                              [[g_norm_32, g_norm_16]],
                                              False)[0]

         max_grad_norm = 1.0
         clipped_ratio = max_grad_norm / max(global_grad_norm, max_grad_norm)

@@ -94,7 +96,7 @@ class RefLAMB(Optimizer):
             if p.grad is None:
                 continue
             p.grad.data *= clipped_ratio
             grad = p.grad.data
             if grad.is_sparse:
                 raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.')

@@ -137,7 +139,7 @@ class RefLAMB(Optimizer):
                 state['g_norm'] = g_norm
                 state['trust_ratio'] = trust_ratio

                 step_size = group['lr']
                 p.data.add_(update, alpha=-step_size*trust_ratio)

@@ -189,11 +191,11 @@ class TestFusedLAMB(unittest.TestCase):
         return max_abs_diff, max_rel_diff

-    def gen_single_type_test(self, param_type=torch.float):
+    def gen_single_type_test(self, param_type=torch.float, device="cuda"):
         nelem = 278011
-        tensor = torch.rand(nelem, dtype=param_type, device='cuda')
+        tensor = torch.rand(nelem, dtype=param_type, device=device)
         weight_decay = [0, 0.01]

         for wd in weight_decay:
             lamb_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':wd}
             ref_param, tst_param, ref_optim, tst_optim = \

@@ -202,7 +204,9 @@ class TestFusedLAMB(unittest.TestCase):
             for i in range(self.iters):
                 self.gen_grad(ref_param, tst_param)
                 ref_optim.step()
+                torch.cuda.synchronize()
                 tst_optim.step()
+                torch.cuda.synchronize()
                 max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)

                 self.assertLessEqual(max_abs_diff, self.max_abs_diff)

@@ -216,11 +220,19 @@ class TestFusedLAMB(unittest.TestCase):
     def test_half(self):
         self.gen_single_type_test(param_type=torch.float16)

+    @skipIfRocm
+    @unittest.skipIf(torch.cuda.device_count()<2, "more than 1 GPU required")
+    def test_multi_device(self):
+        devices = ("cuda:0", "cuda:1")
+        for current_dev, tensor_dev in product(devices, devices):
+            with torch.cuda.device(current_dev):
+                self.gen_single_type_test(param_type=torch.float, device=tensor_dev)
+
     @skipIfRocm
     def test_multi_params(self):
         sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
         weight_decay = [0, 0.01]

         for wd in weight_decay:
             lamb_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':wd}
             tensors = []

@@ -242,7 +254,7 @@ class TestFusedLAMB(unittest.TestCase):
         nelem = 1
         tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
         weight_decay = [0, 0.01]

         for wd in weight_decay:
             lamb_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06, 'weight_decay':wd}
             ref_param, tst_param, ref_optim, tst_optim = \
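The RefLAMB changes stop hard-coding device='cuda' (which always resolves to cuda:0) and instead derive the device from the optimizer's own parameters, which is what lets the new test_multi_device case place parameters on cuda:1. A small sketch of that pattern is below; it assumes a second GPU is present and uses stand-in variable names rather than the optimizer's internals.

import torch

# Derive buffer placement from the parameters being optimized, as RefLAMB now
# does, instead of hard-coding device='cuda'.
params = [torch.nn.Parameter(torch.zeros(16, device="cuda:1"))]
device = params[0].device

overflow_buf = torch.tensor([0], dtype=torch.int, device=device)
g_norm_32 = torch.zeros(1, device=device)
g_norm_16 = torch.zeros(1, device=device)
print(overflow_buf.device, g_norm_32.device)   # both follow the parameters: cuda:1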
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment