OpenDAS / apex / Commits / 85549903

Commit 85549903 authored May 26, 2020 by rohithkrn

enable bfloat16 for optimizers

parent 5cfdc014
Showing 8 changed files with 69 additions and 35 deletions
apex/optimizers/fused_adagrad.py         +2  -2
apex/optimizers/fused_adam.py            +2  -2
apex/optimizers/fused_lamb.py            +2  -2
apex/optimizers/fused_novograd.py        +2  -2
csrc/layer_norm_cuda_kernel.cu           +2  -2
csrc/multi_tensor_adagrad.cu             +1  -1
tests/L0/run_optimizers/test_adagrad.py  +29 -12
tests/L0/run_optimizers/test_adam.py     +29 -12
apex/optimizers/fused_adagrad.py

@@ -91,7 +91,7 @@ class FusedAdagrad(torch.optim.Optimizer):
                 if len(state) == 0:
                     # Exponential moving average of gradient values
                     state['sum'] = torch.zeros_like(p.data)
-                if p.dtype == torch.float16:
+                if p.dtype in {torch.float16, torch.bfloat16}:
                     g_16.append(p.grad.data)
                     p_16.append(p.data)
                     h_16.append(state['sum'])
@@ -100,7 +100,7 @@ class FusedAdagrad(torch.optim.Optimizer):
                     p_32.append(p.data)
                     h_32.append(state['sum'])
                 else:
-                    raise RuntimeError('FusedAdagrad only support fp16 and fp32.')
+                    raise RuntimeError('FusedAdagrad only support fp16, bfloat16 and fp32.')
 
             if(len(g_16) > 0):
                 multi_tensor_applier(self.multi_tensor_adagrad,
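Note: the same one-line pattern recurs in each fused optimizer below. Parameters are bucketed into 16-bit and 32-bit lists before being handed to multi_tensor_applier, and the 16-bit bucket now accepts torch.bfloat16 alongside torch.float16. A minimal standalone sketch of that bucketing logic (the helper name bucket_by_dtype is illustrative, not part of apex):

import torch

def bucket_by_dtype(params):
    # Mirrors the dtype check this commit widens: bf16 now shares the 16-bit path.
    bucket_16, bucket_32 = [], []
    for p in params:
        if p.dtype in {torch.float16, torch.bfloat16}:   # previously: p.dtype == torch.float16
            bucket_16.append(p)
        elif p.dtype == torch.float32:
            bucket_32.append(p)
        else:
            raise RuntimeError('only fp16, bfloat16 and fp32 are supported')
    return bucket_16, bucket_32

# bf16 tensors now land in the 16-bit bucket instead of raising.
b16, b32 = bucket_by_dtype([torch.zeros(4, dtype=torch.bfloat16),
                            torch.zeros(4, dtype=torch.float32)])
assert len(b16) == 1 and len(b32) == 1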
apex/optimizers/fused_adam.py

@@ -130,7 +130,7 @@ class FusedAdam(torch.optim.Optimizer):
                     # Exponential moving average of squared gradient values
                     state['exp_avg_sq'] = torch.zeros_like(p.data)
 
-                if p.dtype == torch.float16:
+                if p.dtype in {torch.float16, torch.bfloat16}:
                     g_16.append(p.grad.data)
                     p_16.append(p.data)
                     m_16.append(state['exp_avg'])
@@ -141,7 +141,7 @@ class FusedAdam(torch.optim.Optimizer):
                     m_32.append(state['exp_avg'])
                     v_32.append(state['exp_avg_sq'])
                 else:
-                    raise RuntimeError('FusedAdam only support fp16 and fp32.')
+                    raise RuntimeError('FusedAdam only support fp16, bfloat16 and fp32.')
 
             if(len(g_16) > 0):
                 multi_tensor_applier(self.multi_tensor_adam,
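With the dtype check relaxed, FusedAdam can be constructed directly over bfloat16 parameters. A usage sketch, assuming apex is built with its CUDA extensions from this commit and a CUDA device is available (the toy model below is illustrative, not part of the change):

import torch
import apex

model = torch.nn.Linear(1024, 1024).cuda().to(torch.bfloat16)   # bf16 weights on GPU
optimizer = apex.optimizers.FusedAdam(model.parameters(), lr=5e-4)

x = torch.randn(8, 1024, device='cuda', dtype=torch.bfloat16)
loss = model(x).float().sum()
loss.backward()
optimizer.step()   # bf16 params and grads now take the fused 16-bit path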
apex/optimizers/fused_lamb.py

@@ -130,7 +130,7 @@ class FusedLAMB(torch.optim.Optimizer):
                     # Exponential moving average of gradient values
                     state['exp_avg_sq'] = torch.zeros_like(p.data)
 
-                if p.dtype == torch.float16:
+                if p.dtype in {torch.float16, torch.bfloat16}:
                     g_16.append(p.grad.data)
                     p_16.append(p.data)
                     m_16.append(state['exp_avg'])
@@ -141,7 +141,7 @@ class FusedLAMB(torch.optim.Optimizer):
                     m_32.append(state['exp_avg'])
                     v_32.append(state['exp_avg_sq'])
                 else:
-                    raise RuntimeError('FusedLAMB only support fp16 and fp32.')
+                    raise RuntimeError('FusedLAMB only support fp16, bfloat16 and fp32.')
 
             if(len(g_16) > 0):
                 multi_tensor_applier(self.multi_tensor_lamb,
apex/optimizers/fused_novograd.py

@@ -142,7 +142,7 @@ class FusedNovoGrad(torch.optim.Optimizer):
                     # Exponential moving average of gradient values
                     state['exp_avg'] = torch.zeros_like(p.data)
 
-                if p.dtype == torch.float16:
+                if p.dtype in {torch.float16, torch.bfloat16}:
                     g_16.append(p.grad.data)
                     p_16.append(p.data)
                     m_16.append(state['exp_avg'])
@@ -151,7 +151,7 @@ class FusedNovoGrad(torch.optim.Optimizer):
                     p_32.append(p.data)
                     m_32.append(state['exp_avg'])
                 else:
-                    raise RuntimeError('FusedNovoGrad only support fp16 and fp32.')
+                    raise RuntimeError('FusedNovoGrad only support fp16, bfloat16 and fp32.')
 
             # we store per weight norm as one tensor for one group/precision combination
             # different from optim.Adam, we store norm here(not ^2) so we can unify calculation for norm types
csrc/layer_norm_cuda_kernel.cu

@@ -690,7 +690,7 @@ void cuda_layer_norm(
     double epsilon)
 {
     using namespace at;
-    DISPATCH_DOUBLE_FLOAT_AND_HALF(input->scalar_type(), 0, "layer_norm_cuda_kernel",
+    DISPATCH_DOUBLE_FLOAT_AND_HALF_AND_BFLOAT16(input->scalar_type(), 0, "layer_norm_cuda_kernel",
         using accscalar_t = at::acc_type<scalar_t_0, true>;
         HostApplyLayerNorm(
             output->DATA_PTR<scalar_t_0>(),
@@ -793,7 +793,7 @@ void cuda_layer_norm_gradient(
     at::Tensor* grad_beta)
 {
     using namespace at;
-    DISPATCH_FLOAT_AND_HALF(input->scalar_type(), 0, "cuComputeGradInput",
+    DISPATCH_FLOAT_AND_HALF_AND_BFLOAT16(input->scalar_type(), 0, "cuComputeGradInput",
         using accscalar_t = at::acc_type<scalar_t_0, true>;
         HostLayerNormGradient(
             dout->DATA_PTR<scalar_t_0>(),
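The two macro changes above compile an additional at::BFloat16 instantiation of the fused layer-norm forward and backward kernels. At the Python level this should let apex.normalization.FusedLayerNorm run on bfloat16 tensors; a sketch under that assumption (not part of the diff), with the module cast to bf16 so input, gamma and beta share one dtype:

import torch
from apex.normalization import FusedLayerNorm

ln = FusedLayerNorm(1024).cuda().to(torch.bfloat16)
x = torch.randn(8, 1024, device='cuda', dtype=torch.bfloat16, requires_grad=True)
y = ln(x)                      # forward: new bfloat16 dispatch in cuda_layer_norm
y.float().sum().backward()     # backward: new bfloat16 dispatch in cuda_layer_norm_gradient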
csrc/multi_tensor_adagrad.cu

@@ -90,7 +90,7 @@ void multi_tensor_adagrad_cuda(
   using namespace at;
 
   // Assume single type across p,g,h now
-  DISPATCH_DOUBLE_FLOAT_AND_HALF(
+  DISPATCH_DOUBLE_FLOAT_AND_HALF_AND_BFLOAT16(
       tensor_lists[0][0].scalar_type(), 0, "adagrad",
       multi_tensor_apply<3>(BLOCK_SIZE, chunk_size, noop_flag, tensor_lists,
                             AdagradFunctor<scalar_t_0>(), epsilon, lr,
tests/L0/run_optimizers/test_adagrad.py

@@ -14,22 +14,28 @@ class TestFusedAdagrad(unittest.TestCase):
     def tearDown(self):
         pass
 
-    def gen_param_optim(self, tensors, adagrad_option):
+    def gen_param_optim(self, tensors, adagrad_option, apex_only=False):
         ref_param = []
         tst_param = []
         for tensor in tensors:
-            ref_param.append(torch.nn.Parameter(tensor.clone()))
+            if apex_only:
+                ref_param.append(torch.nn.Parameter(tensor.clone().float()))
+            else:
+                ref_param.append(torch.nn.Parameter(tensor.clone()))
             tst_param.append(torch.nn.Parameter(tensor.clone()))
 
-        ref_optim = torch.optim.Adagrad(ref_param, **adagrad_option)
+        if apex_only:
+            ref_optim = apex.optimizers.FusedAdagrad(ref_param, **adagrad_option)
+        else:
+            ref_optim = torch.optim.Adagrad(ref_param, **adagrad_option)
         tst_optim = apex.optimizers.FusedAdagrad(tst_param, **adagrad_option)
 
         return (ref_param, tst_param, ref_optim, tst_optim)
 
-    def gen_grad(self, ref_param, tst_param):
+    def gen_grad(self, ref_param, tst_param, apex_only=False):
         for p_ref, p_tst in zip(ref_param, tst_param):
-            p_ref.grad = torch.rand_like(p_ref)
-            p_tst.grad = p_ref.grad
+            p_tst.grad = torch.rand_like(p_tst)
+            p_ref.grad = p_tst.grad.detach().float() if apex_only else p_tst.grad
 
     def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
         half_grads = []
@@ -38,9 +44,11 @@ class TestFusedAdagrad(unittest.TestCase):
             p_ref.grad = half_grads[-1].float() / scale
         return half_grads
 
-    def get_max_diff(self, ref_param, tst_param):
+    def get_max_diff(self, ref_param, tst_param, apex_only=False):
         max_abs_diff = max_rel_diff = 0
         for p_ref, p_tst in zip(ref_param, tst_param):
+            if apex_only:
+                p_tst = p_tst.float()
             max_abs_diff_p = (p_ref - p_tst).abs().max().item()
             max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()
@@ -51,22 +59,23 @@ class TestFusedAdagrad(unittest.TestCase):
         return max_abs_diff, max_rel_diff
 
-    def gen_single_type_test(self, param_type=torch.float):
+    def gen_single_type_test(self, param_type=torch.float, apex_only=False):
         nelem = 278011
         adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 1.0e-5}
 
         tensor = torch.rand(nelem, dtype=param_type, device="cuda")
         ref_param, tst_param, ref_optim, tst_optim = self.gen_param_optim(
-            [tensor], adagrad_option
+            [tensor], adagrad_option, apex_only=apex_only
         )
 
         for _ in range(self.iters):
-            self.gen_grad(ref_param, tst_param)
+            self.gen_grad(ref_param, tst_param, apex_only=apex_only)
             ref_optim.step()
             tst_optim.step()
-            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param, apex_only=apex_only)
             self.assertLessEqual(max_abs_diff, self.max_abs_diff)
-            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+            if not apex_only:
+                self.assertLessEqual(max_rel_diff, self.max_rel_diff)
 
     def test_float(self):
@@ -76,6 +85,14 @@ class TestFusedAdagrad(unittest.TestCase):
     def test_half(self):
         self.gen_single_type_test(param_type=torch.float16)
 
+    # Compares bfloat16 computation against float32 as gold standard.
+    # Uses apex optimizers(controlled by apex_only flag) for both types.
+    # Doesn't use upstream optimizer like other tests as they seem to be
+    # numerically unstable for half types(see skip note for test above).
+    def test_bfloat16(self):
+        self.max_abs_diff = 1e-2
+        self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)
+
     def test_multi_params(self):
         sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
         adagrad_option = {"lr": 5e-4, "eps": 1e-08, "weight_decay": 0}
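The apex_only path added above keeps an fp32 shadow of the bfloat16 parameters, drives both copies through FusedAdagrad with identical gradients, and checks only the absolute difference (the relative check is skipped because bfloat16 carries roughly 7 bits of mantissa). A condensed standalone sketch of that comparison, assuming a CUDA build of apex; names such as shadow and the loop count are illustrative:

import torch
import apex

param = torch.nn.Parameter(torch.rand(278011, device='cuda', dtype=torch.bfloat16))
shadow = torch.nn.Parameter(param.detach().clone().float())   # fp32 gold-standard copy

opt_bf16 = apex.optimizers.FusedAdagrad([param], lr=5e-4, eps=1e-8, weight_decay=1e-5)
opt_fp32 = apex.optimizers.FusedAdagrad([shadow], lr=5e-4, eps=1e-8, weight_decay=1e-5)

for _ in range(5):                                 # a few illustrative steps
    param.grad = torch.rand_like(param)
    shadow.grad = param.grad.detach().float()      # same gradient, cast to fp32
    opt_bf16.step()
    opt_fp32.step()
    diff = (shadow.detach() - param.detach().float()).abs().max().item()
    assert diff <= 1e-2                            # matches the test's max_abs_diff bound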
tests/L0/run_optimizers/test_adam.py

@@ -15,22 +15,28 @@ class TestFusedAdam(unittest.TestCase):
     def tearDown(self):
         pass
 
-    def gen_param_optim(self, tensors, adam_option):
+    def gen_param_optim(self, tensors, adam_option, apex_only=False):
         ref_param = []
         tst_param = []
         for tensor in tensors:
-            ref_param.append(torch.nn.Parameter(tensor.clone()))
+            if apex_only:
+                ref_param.append(torch.nn.Parameter(tensor.clone().float()))
+            else:
+                ref_param.append(torch.nn.Parameter(tensor.clone()))
            tst_param.append(torch.nn.Parameter(tensor.clone()))
 
-        ref_optim = torch.optim.Adam(ref_param, **adam_option)
+        if apex_only:
+            ref_optim = apex.optimizers.FusedAdam(ref_param, **adam_option)
+        else:
+            ref_optim = torch.optim.Adam(ref_param, **adam_option)
         tst_optim = apex.optimizers.FusedAdam(tst_param, **adam_option)
 
         return (ref_param, tst_param, ref_optim, tst_optim)
 
-    def gen_grad(self, ref_param, tst_param):
+    def gen_grad(self, ref_param, tst_param, apex_only=False):
         for p_ref, p_tst in zip(ref_param, tst_param):
-            p_ref.grad = torch.rand_like(p_ref)
-            p_tst.grad = p_ref.grad
+            p_tst.grad = torch.rand_like(p_tst)
+            p_ref.grad = p_tst.grad.detach().float() if apex_only else p_tst.grad
 
     def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
         half_grads = []
@@ -39,9 +45,11 @@ class TestFusedAdam(unittest.TestCase):
             p_ref.grad = half_grads[-1].float() / scale
         return half_grads
 
-    def get_max_diff(self, ref_param, tst_param):
+    def get_max_diff(self, ref_param, tst_param, apex_only=False):
         max_abs_diff = max_rel_diff = 0
         for p_ref, p_tst in zip(ref_param, tst_param):
+            if apex_only:
+                p_tst = p_tst.float()
             max_abs_diff_p = (p_ref - p_tst).abs().max().item()
             max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()
@@ -50,22 +58,23 @@ class TestFusedAdam(unittest.TestCase):
         return max_abs_diff, max_rel_diff
 
-    def gen_single_type_test(self, param_type=torch.float):
+    def gen_single_type_test(self, param_type=torch.float, apex_only=False):
         nelem = 278011
         adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08, 'weight_decay':0, 'amsgrad':False}
 
         tensor = torch.rand(nelem, dtype=param_type, device='cuda')
         ref_param, tst_param, ref_optim, tst_optim = \
-            self.gen_param_optim([tensor], adam_option)
+            self.gen_param_optim([tensor], adam_option, apex_only=apex_only)
 
         for i in range(self.iters):
-            self.gen_grad(ref_param, tst_param)
+            self.gen_grad(ref_param, tst_param, apex_only=apex_only)
             ref_optim.step()
             tst_optim.step()
-            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
+            max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param, apex_only=apex_only)
             self.assertLessEqual(max_abs_diff, self.max_abs_diff)
-            self.assertLessEqual(max_rel_diff, self.max_rel_diff)
+            if not apex_only:
+                self.assertLessEqual(max_rel_diff, self.max_rel_diff)
 
     def test_float(self):
@@ -74,6 +83,14 @@ class TestFusedAdam(unittest.TestCase):
     def test_half(self):
         self.gen_single_type_test(param_type=torch.float16)
 
+    # Compares bfloat16 computation against float32 as gold standard.
+    # Uses apex optimizers(controlled by apex_only flag) for both types.
+    # Doesn't use upstream optimizer like other tests as they seem to be
+    # numerically unstable for half types
+    def test_bfloat16(self):
+        self.max_abs_diff = 1e-2
+        self.gen_single_type_test(param_type=torch.bfloat16, apex_only=True)
+
     @unittest.skip('Disable until 8/1/2019 adam/adamw upstream picked')
     def test_multi_params(self):
         sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]