OpenDAS / Megatron-LM / Commits

Commit 862d70fc
authored Mar 10, 2022 by Lawrence McAfee

small fixes.

parent 08ee8ea2
Showing 3 changed files with 65 additions and 63 deletions (+65 -63):

megatron/model/distributed.py    +0  -8
megatron/optimizer/optimizer.py  +63 -53
megatron/training.py             +2  -2
megatron/model/distributed.py

@@ -126,19 +126,11 @@ class DistributedDataParallel(DistributedDataParallelBase):
 ...
         # the case we use continuous buffers.
         # ===================================
         self._grad_buffers = None
-        # >>>
-        # from collections import defaultdict
-        # self._grad_buffer_param_offsets = None
         self._grad_buffer_param_index_map = None
-        # <<<
         if self.use_contiguous_buffers:
             self._grad_buffers = {}
-            # >>>
-            # self._grad_buffer_param_offsets = defaultdict(dict)
-            # self._grad_buffer_param_index_map = defaultdict(dict)
             self._grad_buffer_param_index_map = {}
             data_parallel_world_size = mpu.get_data_parallel_world_size()
-            # <<<

             # Simple function to define buffer type.
             def _get_buffer_type(param):
 ...
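For readers skimming the diff: `_grad_buffer_param_index_map` records where each parameter's gradient lives inside the contiguous grad buffer, and this hunk just deletes the commented-out `_grad_buffer_param_offsets` experiments around it. Below is a minimal sketch of that idea, assuming a hypothetical helper name; it is not the Megatron-LM implementation, only an illustration of packing parameters back-to-front into one flat buffer and remembering each one's (start, end) range.

import torch

def build_grad_buffer_index_map(params):
    # Pack parameters back-to-front into one flat buffer, mirroring how a
    # contiguous grad buffer can assign each param a (start, end) range.
    total = sum(p.nelement() for p in params)
    buffer = torch.zeros(total)
    index_map = {}
    offset = total
    for p in params:
        offset -= p.nelement()
        index_map[p] = (offset, offset + p.nelement())
    return buffer, index_map

# Each param's grad can then be a zero-copy view into the shared buffer:
params = [torch.randn(4, 4), torch.randn(8)]
buffer, index_map = build_grad_buffer_index_map(params)
start, end = index_map[params[0]]
main_grad = buffer[start:end].view_as(params[0])  # 16 elements -> 4x4 view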
megatron/optimizer/optimizer.py

@@ -34,7 +34,7 @@ from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32
 ...
 # >>>
 from lutil import pax, tp

-DEBUG_ITERATION = 2 # 10
+DEBUG_ITERATION = 1 # 10
 # <<<
 ...

@@ -239,6 +239,9 @@ class MegatronOptimizer(ABC):
 ...
         torch.distributed.all_reduce(grad, group=mpu.get_position_embedding_group())

     def allreduce_embedding_grads(self, args):
+        # >>>
+        # return # ** .. TEMPORARY .. **
+        # <<<
         self.allreduce_word_embedding_grads(args)
         self.allreduce_position_embedding_grads(args)
 ...
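The toggle added above temporarily short-circuits `allreduce_embedding_grads`, which keeps embedding weights that are replicated across pipeline stages in sync by summing their gradients. A minimal sketch of the primitive involved follows; the function name is illustrative, not Megatron's API, and it mirrors the `torch.distributed.all_reduce(grad, group=...)` call visible in the hunk.

import torch.distributed as dist

def allreduce_shared_grad(param, group=None):
    # Sum-reduce the gradient of a weight replicated on several ranks
    # (e.g. word embeddings shared by the first and last pipeline stage),
    # so every replica applies the same update.
    if param.grad is not None:
        dist.all_reduce(param.grad, op=dist.ReduceOp.SUM, group=group)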
@@ -330,58 +333,60 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
 ...
         return found_inf_flag

     # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    # @classmethod
-    # def debug_base(cls, ITERATION, key, value):
-    #     from megatron import get_args
-    #     args = get_args()
-    #     my_rank = torch.distributed.get_rank()
-    #     if ITERATION != DEBUG_ITERATION:
-    #         return
-    #     for r in range(torch.distributed.get_world_size()):
-    #         if my_rank == r:
-    #             print(" + br/%s; [r%d, i%d]; %s, %.12e" % ("fix " if args.use_distributed_optimizer else "main", my_rank, ITERATION, key, value))
-    #         torch.distributed.barrier()
-    #     torch.distributed.barrier()
-    #     # if my_rank == 0:
-    #     #     raise Exception("debug.")
-    #     # else:
-    #     #     exit(0)
-    #     exit(0)
-    # def debug_model(self, ITERATION, key, use_grad):
-    #     use_grad = bool(use_grad)
-    #     tensors = [
-    #         (p.main_grad.float() if use_grad else p.float())
-    #         for m in self.models for p in m.parameters()
-    #     ]
-    #     count = sum(t.nelement() for t in tensors)
-    #     return self.debug_base(
-    #         ITERATION,
-    #         "model/%s, %s [count %d]" % (
-    #             "grad" if use_grad else "param",
-    #             key,
-    #             count,
-    #         ),
-    #         # sum(torch.sum(torch.abs(t)) for t in tensors).item() / count,
-    #         sum(torch.sum(torch.abs(t)) for t in tensors),
-    #     )
-    # def debug_main(self, ITERATION, key, use_grad):
-    #     use_grad = bool(use_grad)
-    #     tensors = [
-    #         p.grad if use_grad else p
-    #         for g in self.optimizer.param_groups
-    #         for p in g["params"]
-    #     ]
-    #     tensors = [ t.float() for t in tensors ]
-    #     count = sum(t.nelement() for t in tensors)
-    #     return self.debug_base(
-    #         ITERATION,
-    #         "main/%s, %s [count %d]" % (
-    #             "grad" if use_grad else "param",
-    #             key,
-    #             count,
-    #         ),
-    #         sum(torch.sum(torch.abs(t)) for t in tensors),
-    #     )
+    @classmethod
+    def debug_base(cls, ITERATION, key, value):
+        from megatron import get_args
+        args = get_args()
+        my_rank = torch.distributed.get_rank()
+        if ITERATION != DEBUG_ITERATION:
+            return
+        for r in range(torch.distributed.get_world_size()):
+            if my_rank == r:
+                # prefix = " + "
+                prefix = ""
+                print("%sbr/%s; [r%d, i%d]; %s, %.12e" % (prefix, "fix " if args.use_distributed_optimizer else "main", my_rank, ITERATION, key, value))
+            torch.distributed.barrier()
+        torch.distributed.barrier()
+        # if my_rank == 0:
+        #     raise Exception("debug.")
+        # else:
+        #     exit(0)
+        exit(0)
+    def debug_model(self, ITERATION, key, use_grad):
+        use_grad = bool(use_grad)
+        tensors = [
+            (p.main_grad.float() if use_grad else p.float())
+            for m in self.models for p in m.parameters()
+        ]
+        count = sum(t.nelement() for t in tensors)
+        return self.debug_base(
+            ITERATION,
+            "model/%s, %s [count %d]" % (
+                "grad" if use_grad else "param",
+                key,
+                count,
+            ),
+            # sum(torch.sum(torch.abs(t)) for t in tensors).item() / count,
+            sum(torch.sum(torch.abs(t)) for t in tensors),
+        )
+    def debug_main(self, ITERATION, key, use_grad):
+        use_grad = bool(use_grad)
+        tensors = [
+            p.grad if use_grad else p
+            for g in self.optimizer.param_groups
+            for p in g["params"]
+        ]
+        tensors = [ t.float() for t in tensors ]
+        count = sum(t.nelement() for t in tensors)
+        return self.debug_base(
+            ITERATION,
+            "main/%s, %s [count %d]" % (
+                "grad" if use_grad else "param",
+                key,
+                count,
+            ),
+            sum(torch.sum(torch.abs(t)) for t in tensors),
+        )
     # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

     @torch.no_grad()
 ...
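Two patterns in the block above are worth noting: output is gated to a single `DEBUG_ITERATION` so logs stay small, and prints are serialized by looping over ranks with a barrier so lines arrive in rank order instead of interleaving. The `sum(torch.sum(torch.abs(t)) for t in tensors)` value serves as a cheap fingerprint for comparing the `use_distributed_optimizer` branch ("fix") against the main path. Here is a standalone sketch of the print pattern, assuming `torch.distributed` is already initialized; the helper name is hypothetical.

import torch.distributed as dist

DEBUG_ITERATION = 1  # only dump state at this training iteration

def debug_print(iteration, message):
    # Every rank enters the same barrier sequence; rank r prints only on
    # its own turn, so output is ordered by rank instead of interleaved.
    if iteration != DEBUG_ITERATION:
        return
    my_rank = dist.get_rank()
    for r in range(dist.get_world_size()):
        if my_rank == r:
            print("[r%d, i%d] %s" % (my_rank, iteration, message))
        dist.barrier()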
@@ -433,6 +438,11 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
 ...
         self._copy_main_params_to_model_params(ITERATION)
         timers('optimizer-copy-main-to-model-params').stop()

+        # >>>
+        # self.debug_model(ITERATION, "after copy param.", 0)
+        # self.debug_main(ITERATION, "after copy param.", 0)
+        # <<<
+
         # Successful update.
         return True, grad_norm, num_zeros_in_grad
 ...
megatron/training.py

@@ -432,7 +432,7 @@ def train_step(forward_step_func, data_iterator,
 ...
     torch.cuda.empty_cache()

     # >>>
-    # optimizer.debug_model(ITERATION, "before reduce grads.", 0)
+    # optimizer.debug_model(ITERATION, "before reduce grads.", 1)
     # <<<

     # >>>
 ...

@@ -451,7 +451,7 @@ def train_step(forward_step_func, data_iterator,
 ...
     # <<<

     # >>>
-    # optimizer.debug_model(ITERATION, "after gather params.", 0)
+    optimizer.debug_model(ITERATION, "after gather params.", 0)
     # <<<

     # Update learning rate.
 ...
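For context, the third argument to `debug_model` is `use_grad`: 1 fingerprints the `.main_grad` buffers, 0 fingerprints the parameters themselves. The two hooks bracket the optimizer step, which is what lets a distributed-optimizer run be diffed against a baseline run at the same iteration. A sketch of the placement, not the actual train_step body:

def train_step_debug_hooks(optimizer, iteration):
    # Fingerprint gradients before they are reduced, and parameters after
    # the distributed optimizer has gathered them back to the model.
    optimizer.debug_model(iteration, "before reduce grads.", 1)
    # ... reduce grads, optimizer.step(), gather params ...
    optimizer.debug_model(iteration, "after gather params.", 0)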