OpenDAS / apex
"fair_dev/testing/testing.py" did not exist on "7d7edf6d37576fb6eda65db6db43fda54a7f06ba"
Commit 4d9dcb57
Authored Dec 07, 2018 by Deyu Fu

    address comments

Parent: be42aad5
Showing 2 changed files with 38 additions and 12 deletions

  apex/optimizers/fp16_optimizer.py   +17 -3
  apex/optimizers/fused_adam.py       +21 -9
apex/optimizers/fp16_optimizer.py
 import torch
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+import ctypes
+lib = ctypes.cdll.LoadLibrary(None)
+lib.THCudaHalfTensor_normall.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
+lib.THCudaHalfTensor_normall.restype = ctypes.c_float
+
+def fused_norm(input):
+    if input.type() == 'torch.cuda.HalfTensor':
+        # 16384 is half 2 if you stare at it long enough
+        return lib.THCudaHalfTensor_normall(torch.cuda._state_cdata, input._cdata, 16384)
+    else:
+        return input.norm()
+
 class FP16_Optimizer(object):
     """
...
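A note on the magic constant in fused_norm above: the norm exponent is passed to the THC kernel as a raw half-precision bit pattern, and 16384 (0x4000) is the IEEE 754 binary16 encoding of 2.0, so this computes the 2-norm, consistent with the "16384 is half 2" comment in the diff. A quick standalone check (NumPy is used only for this illustration and is not part of the commit):

    import numpy as np

    # binary16 encoding of 2.0: sign 0, exponent field 10000 (16 = 15 + 1), mantissa 0
    # -> bit pattern 0100 0000 0000 0000 = 0x4000 = 16384
    bits = int(np.array(2.0, dtype=np.float16).view(np.uint16))
    print(hex(bits), bits)  # 0x4000 16384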
@@ -115,7 +128,8 @@ class FP16_Optimizer(object):
         Returns -1 if the most recently computed fp16 gradients overflowed
         """
         # TODO: currently using pre-1.0 api, and not most efficient with copy to cpu and sync
-        norm = float(torch.norm(fp16_grads_flat, p=norm_type))
+        # only support 2-norm now
+        norm = float(fused_norm(fp16_grads_flat))
         if norm == float('inf') or norm == -float('inf') or norm != norm:
             return -1
         else:
...
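For reference, the overflow check kept as context in this hunk treats inf, -inf, and NaN as an overflowed gradient norm; the final norm != norm clause works because NaN is the only float value that compares unequal to itself. A minimal sketch, with a hypothetical helper name that is not part of the diff:

    def grads_overflowed(norm):
        # inf / -inf from an overflowed reduction, or NaN (NaN != NaN)
        return norm == float('inf') or norm == -float('inf') or norm != norm

    print(grads_overflowed(3.7))           # False
    print(grads_overflowed(float('inf')))  # True
    print(grads_overflowed(float('nan')))  # True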
@@ -140,8 +154,8 @@ class FP16_Optimizer(object):
             return
         # norm is in fact norm*cur_scale
-        self.optimizer.step(grads_group=[[g] for g in grads_groups_flat],
-                            output_params_group=[[p] for p in self.fp16_groups_flat],
+        self.optimizer.step(grads=[[g] for g in grads_groups_flat],
+                            output_params=[[p] for p in self.fp16_groups_flat],
                             scale=self.cur_scale,
                             grad_norms=norm_groups)
...
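On the FP16_Optimizer side, the only change to this call is the keyword rename (grads_group / output_params_group -> grads / output_params); it still passes one single-element list per parameter group. A schematic of the call shape, using a stub optimizer and illustrative placeholder values rather than real tensors:

    class StubFusedAdam:
        # stand-in with the renamed step() keywords; just prints what it receives
        def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None):
            print(len(grads), "group(s), scale =", scale, ", grad_norms =", grad_norms)

    grads_groups_flat = ["flat_fp16_grads_group0"]   # placeholder for a flattened grad tensor
    fp16_groups_flat = ["flat_fp16_params_group0"]   # placeholder for flattened fp16 params

    StubFusedAdam().step(grads=[[g] for g in grads_groups_flat],
                         output_params=[[p] for p in fp16_groups_flat],
                         scale=65536.0,      # illustrative loss scale
                         grad_norms=[1.23])  # illustrative per-group norm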
apex/optimizers/fused_adam.py
...
@@ -65,7 +65,7 @@ class FusedAdam(torch.optim.Optimizer):
         super(FusedAdam, self).__init__(params, defaults)
         self.eps_mode = 0 if eps_inside_sqrt else 1
 
-    def step(self, closure=None, grads_group=None, output_params_group=None, scale=1., grad_norms=None):
+    def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None):
         """Performs a single optimization step.
 
         Arguments:
...
@@ -84,18 +84,30 @@ class FusedAdam(torch.optim.Optimizer):
         if closure is not None:
             loss = closure()
 
-        if grads_group is None:
+        if grads is None:
             grads_group = [None]*len(self.param_groups)
-        if output_params_group is None:
+        # backward compatibility
+        # assuming a list of parameter means single group
+        elif type(grads[0])!=list:
+            grads_group = [grads]
+        else:
+            grads_group = grads
+
+        if output_params is None:
             output_params_group = [None]*len(self.param_groups)
+        elif type(output_params[0])!=list:
+            output_params_group = [output_params]
+        else:
+            output_params_group = output_params
+
         if grad_norms is None:
             grad_norms = [None]*len(self.param_groups)
 
-        for group, grads, output_params, grad_norm in zip(self.param_groups, grads_group, output_params_group, grad_norms):
-            if grads is None:
-                grads = [None]*len(group['params'])
-            if output_params is None:
-                output_params = [None]*len(group['params'])
+        for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group, output_params_group, grad_norms):
+            if grads_this_group is None:
+                grads_this_group = [None]*len(group['params'])
+            if output_params_this_group is None:
+                output_params_this_group = [None]*len(group['params'])
 
             # compute combined scale factor for this group
             combined_scale = scale
...
@@ -105,7 +117,7 @@ class FusedAdam(torch.optim.Optimizer):
                 if clip > 1:
                     combined_scale = clip * scale
 
-            for p, grad, output_param in zip(group['params'], grads, output_params):
+            for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group):
                 #note: p.grad should not ever be set for correct operation of mixed precision optimizer that sometimes sends None gradients
                 if p.grad is None and grad is None:
                     continue
...
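Taken together, the fused_adam.py changes let step() accept either None, a flat list (interpreted as a single parameter group for backward compatibility), or a list of lists per group. The normalization branch added in this commit, extracted into a standalone sketch with hypothetical names (the optimizer itself keeps this logic inline):

    def to_groups(arg, param_groups):
        # mirrors the backward-compatibility branch added above (illustrative only)
        if arg is None:
            return [None] * len(param_groups)
        elif type(arg[0]) != list:
            # a flat list is assumed to mean a single group
            return [arg]
        else:
            return arg

    param_groups = [{"params": ["p0", "p1"]}]        # stand-in for optimizer param groups
    print(to_groups(None, param_groups))             # [None]
    print(to_groups(["g0", "g1"], param_groups))     # [['g0', 'g1']]
    print(to_groups([["g0", "g1"]], param_groups))   # [['g0', 'g1']]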