Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
apex
Commits
9c82241d
Commit
9c82241d
authored
Apr 30, 2020
by
Thor Johnsen
Browse files
Remove implicit memcpy of grad tensor in do_overlapped function
parent
5d1993cf
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
8 deletions
+12
-8
apex/contrib/optimizers/distributed_fused_adam.py
apex/contrib/optimizers/distributed_fused_adam.py
+12
-8
No files found.
apex/contrib/optimizers/distributed_fused_adam.py
View file @
9c82241d
...
...
@@ -99,6 +99,7 @@ class DistributedFusedAdam(torch.optim.Optimizer):
self
.
_param_state
=
None
self
.
_model_params
=
[]
self
.
_grads_info
=
[]
self
.
_grad_accs
=
[]
for
group
in
self
.
param_groups
:
self
.
_param_group
=
group
prev
=
None
...
...
@@ -114,9 +115,12 @@ class DistributedFusedAdam(torch.optim.Optimizer):
self
.
_param_state
=
state
p_grads_size
=
p
.
numel
()
def
wrapper
(
param
,
param_i
,
param_grads_size
,
param_offset
):
def
allreduce_hook
(
grad
):
self
.
_do_overlapped_reduction
(
param_i
,
param_grads_size
,
param_offset
,
grad
)
param
.
register_hook
(
allreduce_hook
)
param_tmp
=
param
.
expand_as
(
param
)
grad_acc
=
param_tmp
.
grad_fn
.
next_functions
[
0
][
0
]
def
allreduce_hook
(
*
unused
):
self
.
_do_overlapped_reduction
(
param_i
,
param_grads_size
,
param_offset
,
param
)
grad_acc
.
register_hook
(
allreduce_hook
)
self
.
_grad_accs
.
append
(
grad_acc
)
self
.
_grads_info
.
append
({
"param_grads_size"
:
p_grads_size
,
"param_offset"
:
p_offset
})
wrapper
(
p
,
p_i
,
p_grads_size
,
p_offset
)
p_offset
+=
p_grads_size
...
...
@@ -160,8 +164,8 @@ class DistributedFusedAdam(torch.optim.Optimizer):
self
.
_fp16_g
=
torch
.
zeros
([
self
.
_mega_shard_size
],
dtype
=
torch
.
float16
,
device
=
'cuda'
)
self
.
_individual_flat_grads
=
[]
for
p_i
,
grads_info
in
enumerate
(
self
.
_grads_info
):
self
.
_individual_flat_grads
.
append
(
self
.
_flat_grads
[
grads_info
[
"param_offset"
]:
grads_info
[
"param_offset"
]
+
grads_info
[
"param_grads_size"
]])
for
p_i
,
(
grads_info
,
p
)
in
enumerate
(
zip
(
self
.
_grads_info
,
self
.
_model_params
)
):
self
.
_individual_flat_grads
.
append
(
self
.
_flat_grads
[
grads_info
[
"param_offset"
]:
grads_info
[
"param_offset"
]
+
grads_info
[
"param_grads_size"
]]
.
view_as
(
p
)
)
def
_flat_split
(
p
):
def
__blockify
(
p
):
...
...
@@ -412,12 +416,12 @@ class DistributedFusedAdam(torch.optim.Optimizer):
scale
)
self
.
_grads
=
[]
def
_do_overlapped_reduction
(
self
,
param_i
,
param_grads_size
,
param_offset
,
grad
):
def
_do_overlapped_reduction
(
self
,
param_i
,
param_grads_size
,
param_offset
,
param
):
# handle overlapped reductions
if
self
.
_flat_mt
:
self
.
_grads
.
append
(
(
grad
.
view
(
-
1
)
,
self
.
_individual_flat_grads
[
param_i
])
)
self
.
_grads
.
append
(
(
param
.
grad
,
self
.
_individual_flat_grads
[
param_i
])
)
else
:
torch
.
div
(
grad
.
view
(
-
1
)
,
self
.
_world_size
if
self
.
_predivide
else
1.0
,
out
=
self
.
_flat_grads
[
param_offset
:
param_offset
+
param_grads_size
])
torch
.
div
(
param
.
grad
,
self
.
_world_size
if
self
.
_predivide
else
1.0
,
out
=
self
.
_individual_flat_grads
[
param_i
])
self
.
_grads_generated
[
param_i
]
=
True
if
not
self
.
_last_step
:
if
self
.
_overlap_reductions
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment