OpenDAS / Megatron-LM · Commits

Commit e8fb052f, authored Aug 18, 2021 by mshoeybi
Parent: df6e3cd7

    made contiguous buffer in local ddp default
Showing 4 changed files with 21 additions and 25 deletions:

    megatron/arguments.py            +7  -11
    megatron/optimizer/__init__.py   +2  -2
    megatron/optimizer/optimizer.py  +10 -10
    megatron/training.py             +2  -2
megatron/arguments.py

@@ -148,16 +148,11 @@ def parse_args(extra_args_provider=None, defaults={},
         print('using {} for parameters ...'.format(args.params_dtype),
               flush=True)
 
-    # If we do accumulation and all-reduces in fp32, we need to have
-    # local DDP and we should set the use-contiguous-buffers-in-ddp.
+    # If we do accumulation and all-reduces in fp32, we need to have local DDP
+    # and we should make sure use-contiguous-buffers-in-local-ddp is not off.
     if args.accumulate_allreduce_grads_in_fp32:
         assert args.DDP_impl == 'local'
-        args.use_contiguous_buffers_in_ddp = True
-
-    # If we use a contiguous buffer to hold main grads, we need to have
-    # local DDP.
-    if args.use_contiguous_buffers_in_ddp:
-        assert args.DDP_impl == 'local'
+        assert args.use_contiguous_buffers_in_local_ddp
 
     if args.dataloader_type is None:
         args.dataloader_type = 'single'

@@ -584,9 +579,10 @@ def _add_distributed_args(parser):
                        choices=['local', 'torch'],
                        help='which DistributedDataParallel implementation '
                        'to use.')
-    group.add_argument('--use-contiguous-buffers-in-ddp', action='store_true',
-                       help='If set, use contiguous buffer in DDP. Note that '
-                       'this option only works woth local DDP.')
+    group.add_argument('--no-contiguous-buffers-in-local-ddp',
+                       action='store_false', help='If set, dont use '
+                       'contiguous buffer in local DDP.',
+                       dest='use_contiguous_buffers_in_local_ddp')
     group.add_argument('--no-scatter-gather-tensors-in-pipeline',
                        action='store_false',
                        help='Use scatter/gather to optimize communication of tensors in pipeline',
                        dest='scatter_gather_tensors_in_pipeline')
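The renamed flag also flips the default: the old --use-contiguous-buffers-in-ddp was opt-in (store_true, so False unless passed), while --no-contiguous-buffers-in-local-ddp is opt-out, since store_false writes into use_contiguous_buffers_in_local_ddp, which therefore defaults to True. A minimal standalone sketch of that argparse pattern (illustration only, not the Megatron parser itself):

    # Sketch of the opt-out flag pattern introduced above (standalone example;
    # the real parser lives in megatron/arguments.py).
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--no-contiguous-buffers-in-local-ddp',
                        action='store_false',
                        dest='use_contiguous_buffers_in_local_ddp',
                        help='If set, dont use contiguous buffer in local DDP.')

    print(parser.parse_args([]).use_contiguous_buffers_in_local_ddp)
    # -> True: contiguous buffers are now on by default
    print(parser.parse_args(['--no-contiguous-buffers-in-local-ddp'])
          .use_contiguous_buffers_in_local_ddp)
    # -> False: explicit opt-out restores the old behaviour

Runs that relied on the old default can pass --no-contiguous-buffers-in-local-ddp to turn the buffer back off.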
megatron/optimizer/__init__.py

@@ -100,7 +100,7 @@ def get_megatron_optimizer(model):
                                                  args.clip_grad,
                                                  args.log_num_zeros_in_grad,
                                                  params_have_main_grad,
-                                                 args.use_contiguous_buffers_in_ddp,
+                                                 args.use_contiguous_buffers_in_local_ddp,
                                                  args.bf16,
                                                  grad_scaler)

@@ -108,4 +108,4 @@ def get_megatron_optimizer(model):
     return FP32Optimizer(optimizer, args.clip_grad,
                          args.log_num_zeros_in_grad,
                          params_have_main_grad,
-                         args.use_contiguous_buffers_in_ddp)
+                         args.use_contiguous_buffers_in_local_ddp)
megatron/optimizer/optimizer.py

@@ -69,7 +69,7 @@ class MegatronOptimizer(ABC):
     def __init__(self, optimizer, clip_grad,
                  log_num_zeros_in_grad,
                  params_have_main_grad,
-                 use_contiguous_buffers_in_ddp):
+                 use_contiguous_buffers_in_local_ddp):
 
         """Input optimizer is the base optimizer for example Adam."""
         self.optimizer = optimizer

@@ -78,9 +78,9 @@ class MegatronOptimizer(ABC):
         self.clip_grad = clip_grad
         self.log_num_zeros_in_grad = log_num_zeros_in_grad
         self.params_have_main_grad = params_have_main_grad
-        self.use_contiguous_buffers_in_ddp = use_contiguous_buffers_in_ddp
+        self.use_contiguous_buffers_in_local_ddp = use_contiguous_buffers_in_local_ddp
 
-        if self.use_contiguous_buffers_in_ddp:
+        if self.use_contiguous_buffers_in_local_ddp:
             assert self.params_have_main_grad, \
                 "use of contiguous buffer requires that params have main grad"

@@ -193,12 +193,12 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
     """
 
     def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
-                 params_have_main_grad, use_contiguous_buffers_in_ddp,
+                 params_have_main_grad, use_contiguous_buffers_in_local_ddp,
                  bf16, grad_scaler):
 
         super(Float16OptimizerWithFloat16Params, self).__init__(
             optimizer, clip_grad, log_num_zeros_in_grad,
-            params_have_main_grad, use_contiguous_buffers_in_ddp)
+            params_have_main_grad, use_contiguous_buffers_in_local_ddp)
 
         self.bf16 = bf16
         self.grad_scaler = grad_scaler

@@ -323,7 +323,7 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
             # persist and therefore should not be deallocated.)
             model_param.grad = None
             if self.params_have_main_grad and \
-               not self.use_contiguous_buffers_in_ddp:
+               not self.use_contiguous_buffers_in_local_ddp:
                 model_param.main_grad = None
 
         # For fp32 grads, we need to reset the grads to main grad.

@@ -335,7 +335,7 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
                 # Safe to de-reference model's main_grad after copying.
                 # (If using contiguous buffers, main_grad's memory should
                 # persist and therefore should not be deallocated.)
-                if not self.use_contiguous_buffers_in_ddp:
+                if not self.use_contiguous_buffers_in_local_ddp:
                     model_param.main_grad = None
 
     def _unscale_main_grads_and_check_for_nan(self):

@@ -491,11 +491,11 @@ class FP32Optimizer(MegatronOptimizer):
     def __init__(self, optimizer, clip_grad,
                  log_num_zeros_in_grad,
                  params_have_main_grad,
-                 use_contiguous_buffers_in_ddp):
+                 use_contiguous_buffers_in_local_ddp):
 
         super(FP32Optimizer, self).__init__(
             optimizer, clip_grad, log_num_zeros_in_grad,
-            params_have_main_grad, use_contiguous_buffers_in_ddp)
+            params_have_main_grad, use_contiguous_buffers_in_local_ddp)
 
         self._scale = torch.cuda.FloatTensor([1.0])

@@ -525,7 +525,7 @@ class FP32Optimizer(MegatronOptimizer):
                 # Safe to de-reference model's main_grad after copying.
                 # (If using contiguous buffers, main_grad's memory should
                 # persist and therefore should not be deallocated.)
-                if not self.use_contiguous_buffers_in_ddp:
+                if not self.use_contiguous_buffers_in_local_ddp:
                     param.main_grad = None
 
         # Clip gradients.
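The guards above all follow the same rule: when the local-DDP contiguous buffer is in use, each parameter's main_grad is a view into one shared allocation that must persist across iterations, so it is never set to None; without the buffer, main_grad is a standalone tensor that can be dropped once its contents have been copied. A rough sketch of that ownership model, using hypothetical names (GradBuffer, make_view) that are not Megatron's actual classes:

    # Rough sketch of why main_grad is only de-referenced when the contiguous
    # buffer is NOT used. Names below (GradBuffer, make_view) are hypothetical.
    import torch

    class GradBuffer:
        """One flat allocation holding the grads of several parameters."""
        def __init__(self, numel, dtype=torch.float32):
            self.data = torch.zeros(numel, dtype=dtype)
            self.offset = 0

        def make_view(self, shape):
            n = int(torch.tensor(shape).prod())
            view = self.data[self.offset:self.offset + n].view(shape)
            self.offset += n
            return view

    buf = GradBuffer(12)
    main_grad_a = buf.make_view((3, 2))   # views share buf.data's storage
    main_grad_b = buf.make_view((6,))

    main_grad_a.add_(1.0)                 # accumulate into the shared buffer
    buf.data.zero_()                      # zeroing the buffer clears every view
    assert main_grad_a.abs().sum() == 0

    # Setting main_grad_a = None here would only drop the Python reference; the
    # storage belongs to the buffer and has to persist across iterations, which
    # is why the optimizer keeps main_grad when the contiguous buffer is on.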
megatron/training.py

@@ -253,7 +253,7 @@ def get_model(model_provider_func):
     if args.DDP_impl == 'local':
         model = [LocalDDP(model_module,
                           args.accumulate_allreduce_grads_in_fp32,
-                          args.use_contiguous_buffers_in_ddp)
+                          args.use_contiguous_buffers_in_local_ddp)
                  for model_module in model]
         return model

@@ -351,7 +351,7 @@ def train_step(forward_step_func, data_iterator,
     timers = get_timers()
 
     # Set grad to zero.
-    if args.DDP_impl == 'local' and args.use_contiguous_buffers_in_ddp:
+    if args.DDP_impl == 'local' and args.use_contiguous_buffers_in_local_ddp:
         for partition in model:
             partition.zero_grad_buffer()
     optimizer.zero_grad()
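Because the contiguous buffer is reused rather than re-allocated, train_step has to clear it explicitly at the start of each step: the per-partition buffer sits outside the usual .grad fields, so optimizer.zero_grad() on its own would leave the previous iteration's values in the shared buffer to be accumulated into. A hedged sketch of that ordering, with placeholder objects standing in for the real model partitions and optimizer:

    # Hedged sketch of the zeroing order shown in the train_step hunk above;
    # `model`, `optimizer`, and `args` here are placeholders, not the real
    # Megatron objects.
    def zero_grads(model, optimizer, args):
        # Contiguous-buffer path: each local-DDP partition owns a grad buffer
        # that persists across iterations, so it must be zeroed by hand.
        if args.DDP_impl == 'local' and args.use_contiguous_buffers_in_local_ddp:
            for partition in model:
                partition.zero_grad_buffer()
        # The base optimizer still resets its own parameter gradients either way.
        optimizer.zero_grad()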