Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
apex
Commits
85b56d01
Unverified
Commit
85b56d01
authored
Jan 19, 2021
by
Jeff Daily
Committed by
GitHub
Jan 19, 2021
Browse files
Merge pull request #43 from ROCmSoftwarePlatform/IFU-2021-01-18
IFU-2021-01-18
parents
d061bf20
13c8d152
Changes
31
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
494 additions
and
270 deletions
+494
-270
apex/parallel/optimized_sync_batchnorm_kernel.py
apex/parallel/optimized_sync_batchnorm_kernel.py
+18
-18
csrc/multi_tensor_apply.cuh
csrc/multi_tensor_apply.cuh
+10
-7
csrc/multi_tensor_l2norm_kernel.cu
csrc/multi_tensor_l2norm_kernel.cu
+3
-2
csrc/multi_tensor_sgd_kernel.cu
csrc/multi_tensor_sgd_kernel.cu
+2
-0
csrc/welford.cu
csrc/welford.cu
+7
-3
examples/imagenet/main_amp.py
examples/imagenet/main_amp.py
+2
-1
setup.py
setup.py
+116
-73
tests/L0/run_optimizers/test_adagrad.py
tests/L0/run_optimizers/test_adagrad.py
+0
-134
tests/L0/run_optimizers/test_dist_adam.py
tests/L0/run_optimizers/test_dist_adam.py
+183
-0
tests/L0/run_optimizers/test_fused_optimizer.py
tests/L0/run_optimizers/test_fused_optimizer.py
+131
-22
tests/L0/run_optimizers/test_lamb.py
tests/L0/run_optimizers/test_lamb.py
+22
-10
No files found.
apex/parallel/optimized_sync_batchnorm_kernel.py
View file @
85b56d01
...
...
@@ -21,33 +21,31 @@ class SyncBatchnormFunction(Function):
if
channel_last
:
count
=
int
(
input
.
numel
()
/
input
.
size
(
-
1
))
mean
,
var_biased
=
syncbn
.
welford_mean_var_c_last
(
input
)
num_channels
=
input
.
size
(
-
1
)
else
:
count
=
int
(
input
.
numel
()
/
input
.
size
(
1
))
mean
,
var_biased
=
syncbn
.
welford_mean_var
(
input
)
num_channels
=
input
.
size
(
1
)
if
torch
.
distributed
.
is_initialized
():
if
not
process_group
:
process_group
=
torch
.
distributed
.
group
.
WORLD
device
=
mean
.
device
world_size
=
torch
.
distributed
.
get_world_size
(
process_group
)
mean_all
=
torch
.
empty
(
world_size
,
mean
.
size
(
0
),
dtype
=
mean
.
dtype
,
device
=
device
)
var_all
=
torch
.
empty
(
world_size
,
var_biased
.
size
(
0
),
dtype
=
var_biased
.
dtype
,
device
=
device
)
count_all
=
torch
.
cuda
.
IntTensor
(
world_size
,
device
=
device
)
mean_l
=
[
mean_all
.
narrow
(
0
,
i
,
1
)
for
i
in
range
(
world_size
)]
var_l
=
[
var_all
.
narrow
(
0
,
i
,
1
)
for
i
in
range
(
world_size
)]
count_l
=
[
count_all
.
narrow
(
0
,
i
,
1
)
for
i
in
range
(
world_size
)]
torch
.
distributed
.
all_gather
(
mean_l
,
mean
,
process_group
)
torch
.
distributed
.
all_gather
(
var_l
,
var_biased
,
process_group
)
torch
.
distributed
.
all_gather
(
count_l
,
torch
.
cuda
.
IntTensor
([
count
],
device
=
device
),
process_group
)
mean
,
var
,
inv_std
=
syncbn
.
welford_parallel
(
mean_all
,
var_all
,
count_all
,
eps
)
count_t
=
torch
.
empty
(
1
,
dtype
=
mean
.
dtype
,
device
=
mean
.
device
).
fill_
(
count
)
combined
=
torch
.
cat
([
mean
.
view
(
-
1
),
var_biased
.
view
(
-
1
),
count_t
],
dim
=
0
)
combined_list
=
[
torch
.
empty_like
(
combined
)
for
k
in
range
(
world_size
)]
torch
.
distributed
.
all_gather
(
combined_list
,
combined
,
process_group
)
combined
=
torch
.
stack
(
combined_list
,
dim
=
0
)
mean_all
,
invstd_all
,
count_all
=
torch
.
split
(
combined
,
num_channels
,
dim
=
1
)
count_all
=
count_all
.
view
(
-
1
)
mean
,
var
,
inv_std
=
syncbn
.
welford_parallel
(
mean_all
,
invstd_all
,
count_all
.
to
(
torch
.
int32
),
eps
)
else
:
device
=
mean
.
device
count_all
=
torch
.
cuda
.
IntTensor
([
count
],
device
=
device
)
inv_std
=
1.0
/
torch
.
sqrt
(
var_biased
+
eps
)
var
=
var_biased
*
(
count
)
/
(
count
-
1
)
var
=
var_biased
*
(
count
)
/
(
count
-
1
)
if
count
==
1
and
world_size
<
2
:
raise
ValueError
(
'Expected more than 1 value per channel when training, got input size{}'
.
format
(
input
.
size
()))
...
...
@@ -60,7 +58,7 @@ class SyncBatchnormFunction(Function):
mean
=
running_mean
.
data
inv_std
=
1.0
/
torch
.
sqrt
(
running_variance
.
data
+
eps
)
ctx
.
save_for_backward
(
input
,
weight
,
mean
,
inv_std
,
z
,
bias
,
count_all
)
ctx
.
save_for_backward
(
input
,
weight
,
mean
,
inv_std
,
z
,
bias
,
count_all
.
to
(
torch
.
int32
)
)
ctx
.
process_group
=
process_group
ctx
.
channel_last
=
channel_last
ctx
.
world_size
=
world_size
...
...
@@ -101,10 +99,12 @@ class SyncBatchnormFunction(Function):
if
ctx
.
needs_input_grad
[
0
]:
if
torch
.
distributed
.
is_initialized
():
num_channels
=
sum_dy
.
shape
[
0
]
combined
=
torch
.
cat
([
sum_dy
,
sum_dy_xmu
],
dim
=
0
)
torch
.
distributed
.
all_reduce
(
sum_dy
,
ReduceOp
.
SUM
,
process_group
)
torch
.
distributed
.
all_reduce
(
sum_dy_xmu
,
ReduceOp
.
SUM
,
process_group
)
combined
,
torch
.
distributed
.
ReduceOp
.
SUM
,
process_group
,
async_op
=
False
)
sum_dy
,
sum_dy_xmu
=
torch
.
split
(
combined
,
num_channels
)
if
channel_last
:
grad_input
=
syncbn
.
batchnorm_backward_c_last
(
grad_output
,
saved_input
,
mean
,
inv_std
,
weight
,
sum_dy
,
sum_dy_xmu
,
count
)
else
:
...
...
csrc/multi_tensor_apply.cuh
View file @
85b56d01
...
...
@@ -2,6 +2,7 @@
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <THC/THC.h>
#include "compat.h"
...
...
@@ -35,7 +36,7 @@ __global__ void multi_tensor_apply_kernel(
ArgTypes
...
args
)
{
// Hand the chunk information to the user-supplied functor to process however it likes.
callable
(
chunk_size
,
noop_flag
,
tl
,
args
...);
callable
(
chunk_size
,
noop_flag
,
tl
,
args
...);
}
template
<
int
depth
,
typename
T
,
typename
...
ArgTypes
>
...
...
@@ -50,8 +51,9 @@ void multi_tensor_apply(
TORCH_CHECK
(
tensor_lists
.
size
()
==
depth
,
"tensor_lists.size() != depth"
);
int
len0
=
tensor_lists
[
0
].
size
();
TORCH_CHECK
(
len0
>
0
,
"tensor_lists[0].size() is not > 0"
);
for
(
int
l
=
0
;
l
<
tensor_lists
.
size
();
l
++
)
// No range-based for because I need indices
auto
ref_device
=
tensor_lists
[
0
][
0
].
device
();
TORCH_CHECK
(
ref_device
.
type
()
==
at
::
kCUDA
,
"expected input to be on cuda"
);
for
(
int
l
=
0
;
l
<
tensor_lists
.
size
();
l
++
)
// No range-based for because I need indices
{
TORCH_CHECK
(
tensor_lists
[
l
].
size
()
==
len0
,
"Size mismatch among tensor lists"
);
for
(
int
t
=
0
;
t
<
tensor_lists
[
l
].
size
();
t
++
)
...
...
@@ -62,7 +64,7 @@ void multi_tensor_apply(
contiguous_memory
=
(
contiguous_memory
||
tensor_lists
[
l
][
t
].
is_contiguous
(
at
::
MemoryFormat
::
ChannelsLast
));
#endif
TORCH_CHECK
(
contiguous_memory
,
"A tensor was not contiguous."
);
TORCH_CHECK
(
tensor_lists
[
l
][
t
].
is_cuda
(),
"A tensor was not cuda.
"
);
TORCH_CHECK
(
tensor_lists
[
l
][
t
].
device
()
==
ref_device
,
"A tensor was not on the same device as the first tensor
"
);
TORCH_CHECK
(
tensor_lists
[
l
][
t
].
numel
()
==
tensor_lists
[
0
][
t
].
numel
(),
"Size mismatch"
);
}
}
...
...
@@ -71,8 +73,9 @@ void multi_tensor_apply(
TensorListMetadata
<
depth
>
tl
;
const
at
::
cuda
::
OptionalCUDAGuard
device_guard
(
device_of
(
tensor_lists
[
0
][
0
]));
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
tl
.
start_tensor_this_launch
=
0
;
int
loc_block_info
=
0
;
int
loc_tensor_info
=
0
;
...
...
@@ -98,7 +101,7 @@ void multi_tensor_apply(
tl
.
block_to_tensor
[
loc_block_info
]
=
loc_tensor_info
-
1
;
tl
.
block_to_chunk
[
loc_block_info
]
=
chunk
;
loc_block_info
++
;
bool
tensors_full
=
(
loc_tensor_info
==
depth_to_max_tensors
[
depth
-
1
]
&&
chunk
==
chunks_this_tensor
-
1
);
bool
blocks_full
=
(
loc_block_info
==
depth_to_max_blocks
[
depth
-
1
]);
...
...
@@ -124,7 +127,7 @@ void multi_tensor_apply(
if
(
chunk
==
chunks_this_tensor
-
1
)
{
// std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << std::endl;
loc_tensor_info
=
0
;
loc_tensor_info
=
0
;
tl
.
start_tensor_this_launch
=
t
+
1
;
}
else
...
...
csrc/multi_tensor_l2norm_kernel.cu
View file @
85b56d01
...
...
@@ -2,6 +2,7 @@
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
// Another possibility:
// #include <torch/all.h>
...
...
@@ -343,13 +344,13 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
max_chunks_per_tensor
);)
AT_CUDA_CHECK
(
cudaGetLastError
());
// AT_CUDA_CHECK(cudaDeviceSynchronize());
// This involves one more small kernel launches, but will be negligible end to end.
// I could get rid of these by hacking the functor + multi tensor harness with persistence
// logic, but keeping it simple for now
auto
ret
=
at
::
empty
({
1
},
output
.
options
());
const
at
::
cuda
::
OptionalCUDAGuard
device_guard
(
device_of
(
output
));
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
();
cleanup
<<<
per_tensor
?
ntensors
:
1
,
512
,
0
,
stream
>>>
(
output
.
DATA_PTR
<
float
>
(),
...
...
@@ -377,7 +378,7 @@ void multi_tensor_norm_out_cuda(
const
int
norm_type
)
{
auto
float_options
=
tensor_lists
[
0
][
0
].
options
().
dtype
(
at
::
kFloat
);
TORCH_CHECK
(
tensor_lists
[
0
][
0
].
device
()
==
noop_flag
.
device
(),
"noop flag should be on the same device as tensors"
);
// we don't need global thus uses empty here
auto
output
=
at
::
empty
({
320
},
float_options
);
...
...
csrc/multi_tensor_sgd_kernel.cu
View file @
85b56d01
...
...
@@ -160,6 +160,8 @@ void multi_tensor_sgd_cuda(
TORCH_CHECK
(
tensor_lists
[
3
][
i
].
scalar_type
()
==
at
::
ScalarType
::
Half
,
"Additional output tensors should always be fp16."
);
TORCH_CHECK
(
noop_flag
.
device
()
==
tensor_lists
[
0
][
0
].
device
(),
"expected noop flag to be on the same device as tensors"
);
// We have 3 possibilities to handle here, in terms of
// grad_type, param_type, momentum_type, requires_fp16_copy
// 1. fp16, fp16, fp16, No
...
...
csrc/welford.cu
View file @
85b56d01
...
...
@@ -1164,6 +1164,10 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
at
::
Tensor
inv_std
=
at
::
empty_like
(
out_var
);
at
::
Tensor
out_mean
=
at
::
empty_like
(
out_var
);
at
::
Tensor
mean_feature_nodes_
=
mean_feature_nodes
.
contiguous
();
at
::
Tensor
var_biased_
=
var_biased
.
contiguous
();
at
::
Tensor
numel_
=
numel
.
contiguous
();
// TODO(jie): tile this for memory coalescing!
const
int
block
=
std
::
min
(
h_last_pow2
(
feature_size
),
MAX_BLOCK_SIZE
);
const
int
grid
=
std
::
max
<
int
>
(
1
,
feature_size
/
block
);
...
...
@@ -1174,9 +1178,9 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
using
namespace
at
;
DISPATCH_FLOAT_AND_HALF
(
mean_feature_nodes
.
scalar_type
(),
0
,
"welford_parallel_kernel"
,
welford_kernel_parallel
<
scalar_t_0
><<<
grid
,
block
,
0
,
stream
>>>
(
mean_feature_nodes
.
DATA_PTR
<
scalar_t_0
>
(),
var_biased
.
DATA_PTR
<
scalar_t_0
>
(),
numel
.
DATA_PTR
<
int
>
(),
mean_feature_nodes
_
.
DATA_PTR
<
scalar_t_0
>
(),
var_biased
_
.
DATA_PTR
<
scalar_t_0
>
(),
numel
_
.
DATA_PTR
<
int
>
(),
out_mean
.
DATA_PTR
<
scalar_t_0
>
(),
out_var
.
DATA_PTR
<
scalar_t_0
>
(),
inv_std
.
DATA_PTR
<
scalar_t_0
>
(),
...
...
examples/imagenet/main_amp.py
View file @
85b56d01
...
...
@@ -182,6 +182,7 @@ def main():
print
(
"=> loading checkpoint '{}'"
.
format
(
args
.
resume
))
checkpoint
=
torch
.
load
(
args
.
resume
,
map_location
=
lambda
storage
,
loc
:
storage
.
cuda
(
args
.
gpu
))
args
.
start_epoch
=
checkpoint
[
'epoch'
]
global
best_prec1
best_prec1
=
checkpoint
[
'best_prec1'
]
model
.
load_state_dict
(
checkpoint
[
'state_dict'
])
optimizer
.
load_state_dict
(
checkpoint
[
'optimizer'
])
...
...
@@ -527,7 +528,7 @@ def accuracy(output, target, topk=(1,)):
res
=
[]
for
k
in
topk
:
correct_k
=
correct
[:
k
].
view
(
-
1
).
float
().
sum
(
0
,
keepdim
=
True
)
correct_k
=
correct
[:
k
].
reshape
(
-
1
).
float
().
sum
(
0
,
keepdim
=
True
)
res
.
append
(
correct_k
.
mul_
(
100.0
/
batch_size
))
return
res
...
...
setup.py
View file @
85b56d01
This diff is collapsed.
Click to expand it.
tests/L0/run_optimizers/test_adagrad.py
deleted
100644 → 0
View file @
d061bf20
import
unittest
import
apex
import
torch
from
apex.testing.common_utils
import
skipIfRocm
class
TestFusedAdagrad
(
unittest
.
TestCase
):
def
setUp
(
self
,
max_abs_diff
=
1e-6
,
max_rel_diff
=
1
,
iters
=
7
):
self
.
max_abs_diff
=
max_abs_diff
self
.
max_rel_diff
=
max_rel_diff
self
.
iters
=
iters
torch
.
cuda
.
manual_seed
(
9876
)
def
tearDown
(
self
):
pass
def
gen_param_optim
(
self
,
tensors
,
adagrad_option
,
apex_only
=
False
):
ref_param
=
[]
tst_param
=
[]
for
tensor
in
tensors
:
if
apex_only
:
ref_param
.
append
(
torch
.
nn
.
Parameter
(
tensor
.
clone
().
float
()))
else
:
ref_param
.
append
(
torch
.
nn
.
Parameter
(
tensor
.
clone
()))
tst_param
.
append
(
torch
.
nn
.
Parameter
(
tensor
.
clone
()))
if
apex_only
:
ref_optim
=
apex
.
optimizers
.
FusedAdagrad
(
ref_param
,
**
adagrad_option
)
else
:
ref_optim
=
torch
.
optim
.
Adagrad
(
ref_param
,
**
adagrad_option
)
tst_optim
=
apex
.
optimizers
.
FusedAdagrad
(
tst_param
,
**
adagrad_option
)
return
(
ref_param
,
tst_param
,
ref_optim
,
tst_optim
)
def
gen_grad
(
self
,
ref_param
,
tst_param
,
apex_only
=
False
):
for
p_ref
,
p_tst
in
zip
(
ref_param
,
tst_param
):
p_tst
.
grad
=
torch
.
rand_like
(
p_tst
)
p_ref
.
grad
=
p_tst
.
grad
.
detach
().
float
()
if
apex_only
else
p_tst
.
grad
def
gen_mixed_grad
(
self
,
ref_param
,
tst_param
,
scale
=
1.0
):
half_grads
=
[]
for
p_ref
,
_
in
zip
(
ref_param
,
tst_param
):
half_grads
.
append
(
torch
.
rand_like
(
p_ref
).
half
())
p_ref
.
grad
=
half_grads
[
-
1
].
float
()
/
scale
return
half_grads
def
get_max_diff
(
self
,
ref_param
,
tst_param
,
apex_only
=
False
):
max_abs_diff
=
max_rel_diff
=
0
for
p_ref
,
p_tst
in
zip
(
ref_param
,
tst_param
):
if
apex_only
:
p_tst
=
p_tst
.
float
()
max_abs_diff_p
=
(
p_ref
-
p_tst
).
abs
().
max
().
item
()
max_rel_diff_p
=
((
p_ref
-
p_tst
)
/
p_ref
).
abs
().
max
().
item
()
if
max_abs_diff_p
>
max_abs_diff
:
max_abs_diff
=
max_abs_diff_p
if
max_rel_diff_p
>
max_rel_diff
:
max_rel_diff
=
max_rel_diff_p
return
max_abs_diff
,
max_rel_diff
def
gen_single_type_test
(
self
,
param_type
=
torch
.
float
,
apex_only
=
False
):
nelem
=
278011
adagrad_option
=
{
"lr"
:
5e-4
,
"eps"
:
1e-08
,
"weight_decay"
:
1.0e-5
}
tensor
=
torch
.
rand
(
nelem
,
dtype
=
param_type
,
device
=
"cuda"
)
ref_param
,
tst_param
,
ref_optim
,
tst_optim
=
self
.
gen_param_optim
(
[
tensor
],
adagrad_option
,
apex_only
=
apex_only
)
for
_
in
range
(
self
.
iters
):
self
.
gen_grad
(
ref_param
,
tst_param
,
apex_only
=
apex_only
)
ref_optim
.
step
()
tst_optim
.
step
()
max_abs_diff
,
max_rel_diff
=
self
.
get_max_diff
(
ref_param
,
tst_param
,
apex_only
=
apex_only
)
self
.
assertLessEqual
(
max_abs_diff
,
self
.
max_abs_diff
)
if
not
apex_only
:
self
.
assertLessEqual
(
max_rel_diff
,
self
.
max_rel_diff
)
@
skipIfRocm
def
test_float
(
self
):
self
.
gen_single_type_test
(
param_type
=
torch
.
float
)
@
unittest
.
skip
(
"PyTorch optimizer is not numerically correct for fp16"
)
def
test_half
(
self
):
self
.
gen_single_type_test
(
param_type
=
torch
.
float16
)
# Compares bfloat16 computation against float32 as gold standard.
# Uses apex optimizers(controlled by apex_only flag) for both types.
# Doesn't use upstream optimizer like other tests as they seem to be
# numerically unstable for half types(see skip note for test above).
@
skipIfRocm
def
test_bfloat16
(
self
):
self
.
max_abs_diff
=
1e-2
self
.
gen_single_type_test
(
param_type
=
torch
.
bfloat16
,
apex_only
=
True
)
@
skipIfRocm
def
test_multi_params
(
self
):
sizes
=
[[
4096
,
1024
],
[
4096
],
[
4096
,
2048
],
[
32320
,
1024
],
[
1
]]
adagrad_option
=
{
"lr"
:
5e-4
,
"eps"
:
1e-08
,
"weight_decay"
:
0
}
tensors
=
[]
for
size
in
sizes
:
tensors
.
append
(
torch
.
rand
(
size
,
dtype
=
torch
.
float
,
device
=
"cuda"
))
ref_param
,
tst_param
,
ref_optim
,
tst_optim
=
self
.
gen_param_optim
(
tensors
,
adagrad_option
)
for
_
in
range
(
self
.
iters
):
self
.
gen_grad
(
ref_param
,
tst_param
)
ref_optim
.
step
()
tst_optim
.
step
()
max_abs_diff
,
max_rel_diff
=
self
.
get_max_diff
(
ref_param
,
tst_param
)
self
.
assertLessEqual
(
max_abs_diff
,
self
.
max_abs_diff
)
self
.
assertLessEqual
(
max_rel_diff
,
self
.
max_rel_diff
)
def
test_adagrad_option
(
self
):
nelem
=
1
adagrad_option
=
{
"lr"
:
0.01
,
"eps"
:
3e-06
,
"weight_decay"
:
0
}
tensor
=
torch
.
rand
(
nelem
,
dtype
=
torch
.
float
,
device
=
"cuda"
)
ref_param
,
tst_param
,
ref_optim
,
tst_optim
=
self
.
gen_param_optim
(
[
tensor
],
adagrad_option
)
for
_
in
range
(
self
.
iters
):
self
.
gen_grad
(
ref_param
,
tst_param
)
ref_optim
.
step
()
tst_optim
.
step
()
max_abs_diff
,
max_rel_diff
=
self
.
get_max_diff
(
ref_param
,
tst_param
)
self
.
assertLessEqual
(
max_abs_diff
,
self
.
max_abs_diff
)
self
.
assertLessEqual
(
max_rel_diff
,
self
.
max_rel_diff
)
tests/L0/run_optimizers/test_dist_adam.py
0 → 100644
View file @
85b56d01
import
argparse
import
random
import
sys
import
torch
from
torch.nn.parallel
import
DistributedDataParallel
as
DDP
from
apex
import
amp
from
apex.optimizers
import
FusedAdam
from
apex.contrib.optimizers.distributed_fused_adam
import
DistributedFusedAdam
class
TestModel
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
args
):
super
(
TestModel
,
self
).
__init__
()
self
.
linear
=
torch
.
nn
.
Sequential
(
*
[
torch
.
nn
.
Linear
(
args
.
dim
,
args
.
dim
,
bias
=
args
.
bias
)
for
_
in
range
(
args
.
layers
)])
def
forward
(
self
,
x
):
return
self
.
linear
(
x
)
def
setup
(
args
):
## Model
ref_model
=
TestModel
(
args
).
cuda
()
dist_model
=
TestModel
(
args
).
cuda
()
# Same weights
with
torch
.
no_grad
():
for
dp
,
rp
in
zip
(
dist_model
.
parameters
(),
ref_model
.
parameters
()):
dp
.
data
.
copy_
(
rp
.
data
)
dist_model
=
dist_model
.
half
()
## Optimizer
# same hyperparameters
ref_opt_args
=
{
'lr'
:
1e-3
,
'eps'
:
1e-6
,
'weight_decay'
:
0.01
}
ref_opt
=
FusedAdam
(
ref_model
.
parameters
(),
**
ref_opt_args
)
dist_opt_args
=
ref_opt_args
.
copy
()
dist_opt_args
.
update
(
{
'overlap_reductions'
:
False
}
)
dist_opt_args
.
update
(
{
'process_group_size'
:
args
.
n_gpu
}
)
dist_opt_args
.
update
(
{
'dwu_group_size'
:
args
.
dwu_group_size
}
)
dist_opt_args
.
update
(
{
'dwu_num_blocks'
:
1
}
)
dist_opt_args
.
update
(
{
'dwu_num_chunks'
:
1
}
)
dist_opt
=
DistributedFusedAdam
(
dist_model
.
parameters
(),
**
dist_opt_args
)
dist_opt
.
set_global_scale
(
1.
)
## amp-init
amp_args
=
{
'loss_scale'
:
'dynamic'
,
'opt_level'
:
'O2'
}
ref_model
,
ref_opt
=
amp
.
initialize
(
ref_model
,
ref_opt
,
**
amp_args
)
## DDP
ref_model
=
DDP
(
ref_model
,
device_ids
=
[
args
.
rank
])
with
torch
.
no_grad
():
for
dp
in
dist_model
.
parameters
():
torch
.
distributed
.
broadcast
(
dp
.
data
,
src
=
0
)
for
rp
in
ref_model
.
parameters
():
torch
.
distributed
.
broadcast
(
rp
.
data
,
src
=
0
)
torch
.
cuda
.
synchronize
()
torch
.
distributed
.
barrier
()
if
get_rank
()
==
0
:
print
(
f
'dist opt with
{
args
.
n_gpu
}
GPUs'
)
return
ref_model
,
ref_opt
,
dist_model
,
dist_opt
def
parse_args
():
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
'--local_rank'
,
type
=
int
,
default
=-
1
)
parser
.
add_argument
(
'--steps'
,
type
=
int
,
default
=
20
)
parser
.
add_argument
(
'--batch'
,
type
=
int
,
default
=
32
)
parser
.
add_argument
(
'--dim'
,
type
=
int
,
default
=
4
)
parser
.
add_argument
(
'--layers'
,
type
=
int
,
default
=
2
)
parser
.
add_argument
(
'--bias'
,
action
=
'store_true'
)
parser
.
add_argument
(
'--atol'
,
type
=
float
,
default
=
1e-3
)
parser
.
add_argument
(
'--rtol'
,
type
=
float
,
default
=
1
)
parser
.
add_argument
(
'--dwu_group_size'
,
type
=
float
,
default
=
1
)
args
=
parser
.
parse_args
()
return
args
def
setup_env
(
args
):
torch
.
cuda
.
set_device
(
args
.
local_rank
)
torch
.
distributed
.
init_process_group
(
backend
=
'nccl'
,
init_method
=
'env://'
)
args
.
rank
=
torch
.
distributed
.
get_rank
()
args
.
n_gpu
=
torch
.
distributed
.
get_world_size
()
seed
=
42
+
get_rank
()
random
.
seed
(
seed
)
torch
.
manual_seed
(
seed
)
return
args
def
get_rank
():
return
torch
.
distributed
.
get_rank
()
def
main
():
args
=
parse_args
()
args
=
setup_env
(
args
)
tol_args
=
{
'atol'
:
args
.
atol
,
'rtol'
:
args
.
rtol
}
torch
.
set_printoptions
(
precision
=
16
)
ref_model
,
ref_opt
,
dist_model
,
dist_opt
=
setup
(
args
)
# lazy_init not called yet, initialize stash
stash
=
ref_opt
.
_amp_stash
stash
.
all_fp16_params
,
stash
.
all_fp32_from_fp16_params
=
[],
[]
# make sure everything from _first_step_init_ is ready before training
# e.g. registering allreduce_hook
# so that gradients are copied/reduced when necessary
dist_opt
.
_init_everything
()
for
i
in
range
(
args
.
steps
):
x_ref
=
torch
.
randn
(
args
.
batch
,
args
.
dim
,
dtype
=
torch
.
half
).
cuda
().
requires_grad_
(
True
)
x_dist
=
x_ref
.
clone
().
detach
().
requires_grad_
(
True
)
if
get_rank
()
==
0
:
print
(
f
'[
{
i
}
] Checking input'
)
#print("x_ref:", x_ref.flatten()[:10])
#print("x_dist:", x_dist.flatten()[:10])
assert
(
torch
.
allclose
(
x_ref
,
x_dist
,
**
tol_args
))
y_ref
=
ref_model
(
x_ref
).
half
()
y_dist
=
dist_model
(
x_dist
)
if
get_rank
()
==
0
:
print
(
f
'[
{
i
}
] Checking output'
)
#print("y_ref:", y_ref.flatten()[:10])
#print("y_dist:", y_dist.flatten()[:10])
assert
(
torch
.
allclose
(
y_ref
,
y_dist
,
**
tol_args
))
dy
=
torch
.
randn_like
(
y_ref
)
y_ref
.
backward
(
dy
)
y_dist
.
backward
(
dy
)
if
get_rank
()
==
0
:
print
(
f
'[
{
i
}
] Checking gradients'
)
torch
.
distributed
.
barrier
()
torch
.
cuda
.
synchronize
()
assert
(
torch
.
allclose
(
x_ref
.
grad
,
x_dist
.
grad
,
**
tol_args
))
# gradient all-reduce within distributed optimizer
dist_opt
.
complete_reductions
()
if
get_rank
()
==
0
:
print
(
f
'[
{
i
}
] Stepping'
)
ref_opt
.
step
()
dist_opt
.
step
()
torch
.
cuda
.
synchronize
()
torch
.
distributed
.
barrier
()
print
(
'Checking new weights'
)
if
get_rank
()
==
0
:
print
(
"ref param:"
,
ref_model
.
module
.
linear
[
0
].
weight
)
print
(
"dist param:"
,
dist_model
.
linear
[
0
].
weight
)
for
i
,
(
rp
,
dp
)
in
enumerate
(
zip
(
ref_model
.
parameters
(),
dist_model
.
parameters
())):
if
not
torch
.
allclose
(
rp
,
dp
,
**
tol_args
):
if
get_rank
()
==
0
:
print
(
f
'Rank:
{
get_rank
()
}
, Param:
{
i
}
'
)
print
(
f
'ref:
{
rp
.
sum
().
item
()
}
, dist:
{
dp
.
sum
().
item
()
}
'
)
print
(
rp
)
print
(
dp
)
print
(
torch
.
abs
(
rp
-
dp
)
>
tol_args
[
'atol'
])
sys
.
exit
(
0
)
# zero grads
for
rp
,
dp
in
zip
(
ref_model
.
parameters
(),
dist_model
.
parameters
()):
rp
.
grad
=
None
dp
.
grad
=
None
if
__name__
==
"__main__"
:
main
()
tests/L0/run_optimizers/test_
adam
.py
→
tests/L0/run_optimizers/test_
fused_optimizer
.py
View file @
85b56d01
...
...
@@ -4,10 +4,11 @@ import random
import
torch
import
apex
from
itertools
import
product
from
apex.testing.common_utils
import
skipIfRocm
class
TestFused
Adam
(
unittest
.
TestCase
):
class
TestFused
Optimizer
(
unittest
.
TestCase
):
def
setUp
(
self
,
max_abs_diff
=
1e-3
,
max_rel_diff
=
1
,
iters
=
7
):
self
.
max_abs_diff
=
max_abs_diff
self
.
max_rel_diff
=
max_rel_diff
...
...
@@ -17,7 +18,7 @@ class TestFusedAdam(unittest.TestCase):
def
tearDown
(
self
):
pass
def
gen_param_optim
(
self
,
tensors
,
adam_
option
,
apex_only
=
False
):
def
gen_param_optim
(
self
,
tensors
,
option
s
,
apex_only
=
False
):
ref_param
=
[]
tst_param
=
[]
for
tensor
in
tensors
:
...
...
@@ -28,10 +29,10 @@ class TestFusedAdam(unittest.TestCase):
tst_param
.
append
(
torch
.
nn
.
Parameter
(
tensor
.
clone
()))
if
apex_only
:
ref_optim
=
apex
.
optimizers
.
FusedAda
m
(
ref_param
,
**
adam_
option
)
ref_optim
=
self
.
fused_opti
m
(
ref_param
,
**
option
s
)
else
:
ref_optim
=
torch
.
optim
.
Ada
m
(
ref_param
,
**
adam_
option
)
tst_optim
=
apex
.
optimizers
.
FusedAda
m
(
tst_param
,
**
adam_
option
)
ref_optim
=
self
.
ref_opti
m
(
ref_param
,
**
option
s
)
tst_optim
=
self
.
fused_opti
m
(
tst_param
,
**
option
s
)
return
(
ref_param
,
tst_param
,
ref_optim
,
tst_optim
)
...
...
@@ -60,25 +61,32 @@ class TestFusedAdam(unittest.TestCase):
return
max_abs_diff
,
max_rel_diff
def
gen_single_type_test
(
self
,
param_type
=
torch
.
float
,
apex_only
=
False
):
def
gen_single_type_test
(
self
,
param_type
=
torch
.
float
,
apex_only
=
False
,
device
=
'cuda'
):
nelem
=
278011
adam_option
=
{
'lr'
:
5e-4
,
'betas'
:(
0.9
,
0.999
),
'eps'
:
1e-08
,
'weight_decay'
:
0
,
'amsgrad'
:
False
}
tensor
=
torch
.
rand
(
nelem
,
dtype
=
param_type
,
device
=
'cuda'
)
tensor
=
torch
.
rand
(
nelem
,
dtype
=
param_type
,
device
=
device
)
ref_param
,
tst_param
,
ref_optim
,
tst_optim
=
\
self
.
gen_param_optim
([
tensor
],
adam_
option
,
apex_only
=
apex_only
)
self
.
gen_param_optim
([
tensor
],
self
.
option
s
,
apex_only
=
apex_only
)
for
i
in
range
(
self
.
iters
):
self
.
gen_grad
(
ref_param
,
tst_param
,
apex_only
=
apex_only
)
ref_optim
.
step
()
tst_optim
.
step
()
max_abs_diff
,
max_rel_diff
=
self
.
get_max_diff
(
ref_param
,
tst_param
,
apex_only
=
apex_only
)
self
.
assertLessEqual
(
max_abs_diff
,
self
.
max_abs_diff
)
if
not
apex_only
:
self
.
assertLessEqual
(
max_rel_diff
,
self
.
max_rel_diff
)
class
TestFusedAdam
(
TestFusedOptimizer
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
(
TestFusedAdam
,
self
).
__init__
(
*
args
,
**
kwargs
)
self
.
options
=
{
'lr'
:
5e-4
,
'betas'
:(
0.9
,
0.999
),
'eps'
:
1e-08
,
'weight_decay'
:
0
,
'amsgrad'
:
False
}
self
.
ref_optim
=
torch
.
optim
.
Adam
self
.
fused_optim
=
apex
.
optimizers
.
FusedAdam
@
skipIfRocm
def
test_float
(
self
):
self
.
gen_single_type_test
(
param_type
=
torch
.
float
)
...
...
@@ -95,17 +103,23 @@ class TestFusedAdam(unittest.TestCase):
self
.
max_abs_diff
=
1e-2
self
.
gen_single_type_test
(
param_type
=
torch
.
bfloat16
,
apex_only
=
True
)
@
skipIfRocm
@
unittest
.
skipIf
(
torch
.
cuda
.
device_count
()
<
2
,
"more than 1 GPU required"
)
def
test_multi_device
(
self
):
devices
=
(
"cuda:0"
,
"cuda:1"
)
for
current_dev
,
tensor_dev
in
product
(
devices
,
devices
):
with
torch
.
cuda
.
device
(
current_dev
):
self
.
gen_single_type_test
(
param_type
=
torch
.
float
,
device
=
tensor_dev
)
@
unittest
.
skip
(
'Disable until 8/1/2019 adam/adamw upstream picked'
)
def
test_multi_params
(
self
):
sizes
=
[[
4096
,
1024
],
[
4096
],
[
4096
,
2048
],
[
32320
,
1024
],
[
1
]]
adam_option
=
{
'lr'
:
5e-4
,
'betas'
:(
0.9
,
0.999
),
'eps'
:
1e-08
,
'weight_decay'
:
0
,
'amsgrad'
:
False
}
tensors
=
[]
for
size
in
sizes
:
tensors
.
append
(
torch
.
rand
(
size
,
dtype
=
torch
.
float
,
device
=
'cuda'
))
ref_param
,
tst_param
,
ref_optim
,
tst_optim
=
\
self
.
gen_param_optim
(
tensors
,
adam_
option
)
self
.
gen_param_optim
(
tensors
,
self
.
option
s
)
for
i
in
range
(
self
.
iters
):
self
.
gen_grad
(
ref_param
,
tst_param
)
...
...
@@ -118,12 +132,9 @@ class TestFusedAdam(unittest.TestCase):
@
unittest
.
skip
(
'No longer support fuse scaling'
)
def
test_scale
(
self
):
nelem
=
278011
adam_option
=
{
'lr'
:
5e-4
,
'betas'
:(
0.9
,
0.999
),
'eps'
:
1e-08
,
'weight_decay'
:
0
,
'amsgrad'
:
False
}
tensor
=
torch
.
rand
(
nelem
,
dtype
=
torch
.
float
,
device
=
'cuda'
)
ref_param
,
tst_param
,
ref_optim
,
tst_optim
=
\
self
.
gen_param_optim
([
tensor
],
adam_
option
)
self
.
gen_param_optim
([
tensor
],
self
.
option
s
)
for
i
in
range
(
self
.
iters
):
scale
=
random
.
random
()
*
1000
...
...
@@ -138,12 +149,10 @@ class TestFusedAdam(unittest.TestCase):
@
unittest
.
skip
(
'No longer support output fp16 param'
)
def
test_fp16_output
(
self
):
nelem
=
278011
adam_option
=
{
'lr'
:
5e-4
,
'betas'
:(
0.9
,
0.999
),
'eps'
:
1e-08
,
'weight_decay'
:
0
,
'amsgrad'
:
False
}
tensor
=
torch
.
rand
(
nelem
,
dtype
=
torch
.
float
,
device
=
'cuda'
)
ref_param
,
tst_param
,
ref_optim
,
tst_optim
=
\
self
.
gen_param_optim
([
tensor
],
adam_
option
)
self
.
gen_param_optim
([
tensor
],
self
.
option
s
)
fp16_param
=
torch
.
nn
.
Parameter
(
tensor
.
clone
().
half
())
...
...
@@ -180,6 +189,106 @@ class TestFusedAdam(unittest.TestCase):
self
.
assertLessEqual
(
max_rel_diff
,
self
.
max_rel_diff
)
class
TestFusedAdagrad
(
TestFusedOptimizer
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
(
TestFusedAdagrad
,
self
).
__init__
(
*
args
,
**
kwargs
)
self
.
options
=
{
"lr"
:
5e-4
,
"eps"
:
1e-08
,
"weight_decay"
:
1.0e-5
}
self
.
ref_optim
=
torch
.
optim
.
Adagrad
self
.
fused_optim
=
apex
.
optimizers
.
FusedAdagrad
@
skipIfRocm
def
test_float
(
self
):
self
.
gen_single_type_test
(
param_type
=
torch
.
float
)
@
unittest
.
skip
(
"PyTorch optimizer is not numerically correct for fp16"
)
def
test_half
(
self
):
self
.
gen_single_type_test
(
param_type
=
torch
.
float16
)
@
skipIfRocm
@
unittest
.
skipIf
(
torch
.
cuda
.
device_count
()
<
2
,
"more than 1 GPU required"
)
def
test_multi_device
(
self
):
devices
=
(
"cuda:0"
,
"cuda:1"
)
for
current_dev
,
tensor_dev
in
product
(
devices
,
devices
):
with
torch
.
cuda
.
device
(
current_dev
):
self
.
gen_single_type_test
(
param_type
=
torch
.
float
,
device
=
tensor_dev
)
@
skipIfRocm
def
test_multi_params
(
self
):
sizes
=
[[
4096
,
1024
],
[
4096
],
[
4096
,
2048
],
[
32320
,
1024
],
[
1
]]
adagrad_option
=
{
"lr"
:
5e-4
,
"eps"
:
1e-08
,
"weight_decay"
:
0
}
tensors
=
[]
for
size
in
sizes
:
tensors
.
append
(
torch
.
rand
(
size
,
dtype
=
torch
.
float
,
device
=
"cuda"
))
ref_param
,
tst_param
,
ref_optim
,
tst_optim
=
self
.
gen_param_optim
(
tensors
,
adagrad_option
)
for
_
in
range
(
self
.
iters
):
self
.
gen_grad
(
ref_param
,
tst_param
)
ref_optim
.
step
()
tst_optim
.
step
()
max_abs_diff
,
max_rel_diff
=
self
.
get_max_diff
(
ref_param
,
tst_param
)
self
.
assertLessEqual
(
max_abs_diff
,
self
.
max_abs_diff
)
self
.
assertLessEqual
(
max_rel_diff
,
self
.
max_rel_diff
)
@
unittest
.
skipIf
(
torch
.
cuda
.
device_count
()
<
2
,
"more than 1 GPU required"
)
def
test_multi_params_different_devices_throws
(
self
):
sizes
=
[[
4096
,
1024
],
[
4096
],
[
4096
,
2048
],
[
32320
,
1024
],
[
1
]]
adagrad_option
=
{
"lr"
:
5e-4
,
"eps"
:
1e-08
,
"weight_decay"
:
0
}
tensors
=
[]
for
i
,
size
in
enumerate
(
sizes
):
tensors
.
append
(
torch
.
rand
(
size
,
dtype
=
torch
.
float
,
device
=
"cuda:"
+
str
(
i
%
2
)))
ref_param
,
tst_param
,
ref_optim
,
tst_optim
=
self
.
gen_param_optim
(
tensors
,
adagrad_option
)
self
.
gen_grad
(
ref_param
,
tst_param
)
with
self
.
assertRaisesRegex
(
RuntimeError
,
"not on the same device"
):
tst_optim
.
step
()
def
test_adagrad_option
(
self
):
nelem
=
1
adagrad_option
=
{
"lr"
:
0.01
,
"eps"
:
3e-06
,
"weight_decay"
:
0
}
tensor
=
torch
.
rand
(
nelem
,
dtype
=
torch
.
float
,
device
=
"cuda"
)
ref_param
,
tst_param
,
ref_optim
,
tst_optim
=
self
.
gen_param_optim
(
[
tensor
],
adagrad_option
)
for
_
in
range
(
self
.
iters
):
self
.
gen_grad
(
ref_param
,
tst_param
)
ref_optim
.
step
()
tst_optim
.
step
()
max_abs_diff
,
max_rel_diff
=
self
.
get_max_diff
(
ref_param
,
tst_param
)
self
.
assertLessEqual
(
max_abs_diff
,
self
.
max_abs_diff
)
self
.
assertLessEqual
(
max_rel_diff
,
self
.
max_rel_diff
)
class
TestFusedSGD
(
TestFusedOptimizer
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
(
TestFusedSGD
,
self
).
__init__
(
*
args
,
**
kwargs
)
self
.
options
=
{
"lr"
:
.
25
,
"momentum"
:
.
125
}
self
.
ref_optim
=
torch
.
optim
.
SGD
self
.
fused_optim
=
apex
.
optimizers
.
FusedSGD
def
test_float
(
self
):
self
.
gen_single_type_test
(
param_type
=
torch
.
float
)
def
test_half
(
self
):
self
.
gen_single_type_test
(
param_type
=
torch
.
float16
)
@
unittest
.
skipIf
(
torch
.
cuda
.
device_count
()
<
2
,
"more than 1 GPU required"
)
def
test_multi_device
(
self
):
devices
=
(
"cuda:0"
,
"cuda:1"
)
for
current_dev
,
tensor_dev
in
product
(
devices
,
devices
):
with
torch
.
cuda
.
device
(
current_dev
):
self
.
gen_single_type_test
(
param_type
=
torch
.
float
,
device
=
tensor_dev
)
if
__name__
==
'__main__'
:
script_path
=
os
.
path
.
dirname
(
os
.
path
.
realpath
(
__file__
))
unittest
.
main
()
tests/L0/run_optimizers/test_lamb.py
View file @
85b56d01
...
...
@@ -6,6 +6,7 @@ from torch.optim import Optimizer
import
apex
from
apex.multi_tensor_apply
import
multi_tensor_applier
from
apex.testing.common_utils
import
skipIfRocm
from
itertools
import
product
class
RefLAMB
(
Optimizer
):
r
"""Implements Lamb algorithm.
...
...
@@ -41,7 +42,7 @@ class RefLAMB(Optimizer):
import
amp_C
self
.
multi_tensor_l2norm
=
amp_C
.
multi_tensor_l2norm
# Skip buffer
self
.
_dummy_overflow_buf
=
torch
.
cuda
.
IntTensor
([
0
]
)
self
.
_dummy_overflow_buf
=
torch
.
tensor
([
0
],
dtype
=
torch
.
int
,
device
=
self
.
param_groups
[
0
][
"params"
][
0
].
device
)
self
.
multi_tensor_lamb
=
amp_C
.
multi_tensor_lamb
else
:
raise
RuntimeError
(
'apex.optimizers.FusedLAMB requires cuda extensions'
)
...
...
@@ -69,7 +70,8 @@ class RefLAMB(Optimizer):
else
:
raise
RuntimeError
(
'FusedLAMB only support fp16 and fp32.'
)
g_norm_32
,
g_norm_16
=
torch
.
zeros
(
1
,
device
=
'cuda'
),
torch
.
zeros
(
1
,
device
=
'cuda'
)
device
=
self
.
param_groups
[
0
][
"params"
][
0
].
device
g_norm_32
,
g_norm_16
=
torch
.
zeros
(
1
,
device
=
device
),
torch
.
zeros
(
1
,
device
=
device
)
# compute grad norm for two lists
if
len
(
g_all_32
)
>
0
:
g_norm_32
=
multi_tensor_applier
(
self
.
multi_tensor_l2norm
,
...
...
@@ -85,7 +87,7 @@ class RefLAMB(Optimizer):
self
.
_dummy_overflow_buf
,
[[
g_norm_32
,
g_norm_16
]],
False
)[
0
]
max_grad_norm
=
1.0
clipped_ratio
=
max_grad_norm
/
max
(
global_grad_norm
,
max_grad_norm
)
...
...
@@ -94,7 +96,7 @@ class RefLAMB(Optimizer):
if
p
.
grad
is
None
:
continue
p
.
grad
.
data
*=
clipped_ratio
grad
=
p
.
grad
.
data
grad
=
p
.
grad
.
data
if
grad
.
is_sparse
:
raise
RuntimeError
(
'Lamb does not support sparse gradients, consider SparseAdam instad.'
)
...
...
@@ -137,7 +139,7 @@ class RefLAMB(Optimizer):
state
[
'g_norm'
]
=
g_norm
state
[
'trust_ratio'
]
=
trust_ratio
step_size
=
group
[
'lr'
]
step_size
=
group
[
'lr'
]
p
.
data
.
add_
(
update
,
alpha
=-
step_size
*
trust_ratio
)
...
...
@@ -189,11 +191,11 @@ class TestFusedLAMB(unittest.TestCase):
return
max_abs_diff
,
max_rel_diff
def
gen_single_type_test
(
self
,
param_type
=
torch
.
float
):
def
gen_single_type_test
(
self
,
param_type
=
torch
.
float
,
device
=
"cuda"
):
nelem
=
278011
tensor
=
torch
.
rand
(
nelem
,
dtype
=
param_type
,
device
=
'cuda'
)
tensor
=
torch
.
rand
(
nelem
,
dtype
=
param_type
,
device
=
device
)
weight_decay
=
[
0
,
0.01
]
for
wd
in
weight_decay
:
lamb_option
=
{
'lr'
:
5e-4
,
'betas'
:(
0.9
,
0.999
),
'eps'
:
1e-08
,
'weight_decay'
:
wd
}
ref_param
,
tst_param
,
ref_optim
,
tst_optim
=
\
...
...
@@ -202,7 +204,9 @@ class TestFusedLAMB(unittest.TestCase):
for
i
in
range
(
self
.
iters
):
self
.
gen_grad
(
ref_param
,
tst_param
)
ref_optim
.
step
()
torch
.
cuda
.
synchronize
()
tst_optim
.
step
()
torch
.
cuda
.
synchronize
()
max_abs_diff
,
max_rel_diff
=
self
.
get_max_diff
(
ref_param
,
tst_param
)
self
.
assertLessEqual
(
max_abs_diff
,
self
.
max_abs_diff
)
...
...
@@ -216,11 +220,19 @@ class TestFusedLAMB(unittest.TestCase):
def
test_half
(
self
):
self
.
gen_single_type_test
(
param_type
=
torch
.
float16
)
@skipIfRocm
@unittest.skipIf(torch.cuda.device_count() < 2, "more than 1 GPU required")
def test_multi_device(self):
    """Exercise FusedLAMB with the parameter tensor on each GPU while each
    GPU in turn is the current device, covering all four pairings.
    """
    gpus = ("cuda:0", "cuda:1")
    for active_dev, storage_dev in product(gpus, gpus):
        with torch.cuda.device(active_dev):
            self.gen_single_type_test(
                param_type=torch.float, device=storage_dev
            )
@
skipIfRocm
def
test_multi_params
(
self
):
sizes
=
[[
4096
,
1024
],
[
4096
],
[
4096
,
2048
],
[
32320
,
1024
],
[
1
]]
weight_decay
=
[
0
,
0.01
]
for
wd
in
weight_decay
:
lamb_option
=
{
'lr'
:
5e-4
,
'betas'
:(
0.9
,
0.999
),
'eps'
:
1e-08
,
'weight_decay'
:
wd
}
tensors
=
[]
...
...
@@ -242,7 +254,7 @@ class TestFusedLAMB(unittest.TestCase):
nelem
=
1
tensor
=
torch
.
rand
(
nelem
,
dtype
=
torch
.
float
,
device
=
'cuda'
)
weight_decay
=
[
0
,
0.01
]
for
wd
in
weight_decay
:
lamb_option
=
{
'lr'
:
0.01
,
'betas'
:(
0.6
,
0.9
),
'eps'
:
3e-06
,
'weight_decay'
:
wd
}
ref_param
,
tst_param
,
ref_optim
,
tst_optim
=
\
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment