OpenDAS / Megatron-LM / Commits / 07916bf2

Commit 07916bf2
Authored Sep 27, 2022 by Jared Casper

Support gradient accumulation fusion in fp16.
Parent: 2366716f
Showing 2 changed files with 6 additions and 9 deletions:
megatron/arguments.py                      +0  -8
megatron/core/tensor_parallel/layers.py    +6  -1
megatron/arguments.py

@@ -168,14 +168,6 @@ def validate_args(args, defaults={}):
     if args.accumulate_allreduce_grads_in_fp32:
         assert args.DDP_impl == 'local'
         assert args.use_contiguous_buffers_in_local_ddp
-    else:
-        if args.gradient_accumulation_fusion:
-            args.gradient_accumulation_fusion = False
-            if args.rank == 0:
-                print('Gradient accumulation fusion to linear layer weight '
-                      'gradient computation is supported only with fp32 '
-                      'gradient accumulation. Setting gradient_accumulation_fusion '
-                      'to False', flush=True)

     # If we use the distributed optimizer, we need to have local DDP
     # and we should make sure use-contiguous-buffers-in-local-ddp is on.
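Since the fused kernel now accepts fp16 gradient buffers (see the layers.py change below), there is no longer a need to force gradient_accumulation_fusion off when gradients are not accumulated in fp32, so that fallback is deleted. A sketch of what remains of this block after the commit, based only on the context lines above (surrounding validation code omitted):

# Post-commit shape of this block in validate_args (sketch).
if args.accumulate_allreduce_grads_in_fp32:
    assert args.DDP_impl == 'local'
    assert args.use_contiguous_buffers_in_local_ddp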
megatron/core/tensor_parallel/layers.py

@@ -302,7 +302,12 @@ class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
         if ctx.gradient_accumulation_fusion:
-            fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, weight.main_grad)
+            if weight.main_grad.dtype == torch.float32:
+                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, weight.main_grad)
+            elif weight.main_grad.dtype == torch.float16:
+                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, weight.main_grad)
+            else:
+                raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
             grad_weight = None
         else:
             grad_weight = grad_output.t().matmul(total_input)
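The change dispatches on the dtype of the preallocated weight.main_grad buffer, calling the fp16 variant of the fused kernel when the buffer is half precision. A minimal sketch of the same pattern in plain PyTorch; the explicit matmul stands in for the fused_weight_gradient_mlp_cuda kernels, which are not assumed to be installed, and the tensor shapes are illustrative:

import torch

def accumulate_weight_grad(total_input, grad_output, weight):
    # Plain-PyTorch stand-in for the fused kernel: compute the weight gradient
    # and accumulate it into the preallocated main_grad buffer in that buffer's dtype.
    grad = grad_output.float().t().matmul(total_input.float())   # [out, in], same shape as weight
    if weight.main_grad.dtype == torch.float32:
        weight.main_grad.add_(grad)                              # fp32 accumulation buffer
    elif weight.main_grad.dtype == torch.float16:
        weight.main_grad.add_(grad.half())                       # fp16 accumulation buffer (new in this commit)
    else:
        raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")

# Usage sketch: an fp16 linear weight whose gradient accumulates into an fp16 buffer.
weight = torch.nn.Parameter(torch.randn(8, 16, dtype=torch.float16))
weight.main_grad = torch.zeros_like(weight)              # buffer normally allocated by the DDP wrapper
total_input = torch.randn(4, 16, dtype=torch.float16)    # layer input, [batch, in]
grad_output = torch.randn(4, 8, dtype=torch.float16)     # gradient w.r.t. output, [batch, out]
accumulate_weight_grad(total_input, grad_output, weight)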