OpenDAS / Megatron-LM

Commit 67e23459, authored Feb 28, 2022 by Lawrence McAfee
Parent: a58653dc

    more debugging; some cleanup
Showing 3 changed files with 154 additions and 90 deletions (+154, -90).
megatron/optimizer/__init__.py    +6   -1
megatron/optimizer/clip_grads.py  +54  -27
megatron/optimizer/optimizer.py   +94  -62
megatron/optimizer/__init__.py  (view file @ 67e23459)

@@ -154,7 +154,7 @@ def get_megatron_optimizer(model,
         opt_ty = Float16DistributedOptimizer \
             if args.use_distributed_optimizer \
             else Float16OptimizerWithFloat16Params
-        return opt_ty(optimizer,
+        opt = opt_ty(optimizer,
                       args.clip_grad,
                       args.log_num_zeros_in_grad,
                       params_have_main_grad,
@@ -162,6 +162,11 @@ def get_megatron_optimizer(model,
                       args.bf16,
                       grad_scaler,
                       model)
+        # >>>
+        # opt.debug_main_param_sum(0, "after init")
+        # opt.debug_main_grad_sum(0, "after init")
+        # <<<
+        return opt
     # <<<
 
     # FP32.
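Reading of the hunk above (not part of the diff): instead of returning the constructor call directly, the optimizer is first bound to a local `opt` so that debug probes such as `opt.debug_main_param_sum(...)` can be invoked (still commented out here) before the object is returned. A minimal sketch of that bind-probe-return pattern; `DebugSGD` and its methods are illustrative stand-ins, not Megatron-LM APIs:

import torch

class DebugSGD:
    """Tiny stand-in for the Megatron optimizer wrappers used above."""
    def __init__(self, params):
        self.optimizer = torch.optim.SGD(params, lr=0.1)

    def debug_main_param_sum(self, iteration, tag):
        total = sum(torch.abs(p).sum()
                    for g in self.optimizer.param_groups
                    for p in g["params"])
        print("[iter %d] %s: sum |param| = %.6e" % (iteration, tag, total.item()))

def get_optimizer(params):
    # Bind first, so probes can run against the constructed object ...
    opt = DebugSGD(params)
    opt.debug_main_param_sum(0, "after init")
    # ... then return it, mirroring the rewrite of `return opt_ty(...)` above.
    return opt

if __name__ == "__main__":
    get_optimizer([torch.nn.Parameter(torch.ones(4))])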
megatron/optimizer/clip_grads.py  (view file @ 67e23459)

@@ -28,9 +28,10 @@ from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate
 # >>>
 from lutil import pax, tp
+DEBUG_ITERATION = 1
 # <<<
 
-def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
+def clip_grad_norm_fp32(parameters, max_norm, norm_type=2, ITERATION=None):
     """Clips gradient norm of an iterable of parameters whose gradients
     are in fp32.
@@ -49,6 +50,10 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
         Total norm of the parameters (viewed as a single vector).
     """
 
+    # >>>
+    raise Exception("currently debugging ... don't call me.")
+    # <<<
+
     if isinstance(parameters, torch.Tensor):
         parameters = [parameters]
@@ -80,13 +85,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
     # <<<
 
     # >>>
-    # pax(0, {
-    #     "[LOC]" : "[** BEFORE CALC NORM **]",
-    #     "max_norm" : max_norm,
-    #     "parameters" : parameters,
-    #     "grads" : grads,
-    #     "grads_for_norm" : grads_for_norm,
-    # })
+    # if ITERATION == DEBUG_ITERATION:
+    #     pax(0, {
+    #         "[LOC]" : "[** BEFORE CALC NORM **]",
+    #         "[ITERATION]" : ITERATION,
+    #         "max_norm" : max_norm,
+    #         "parameters" : parameters,
+    #         # "grads" : grads,
+    #         "grads_for_norm" : grads_for_norm,
+    #     })
     # <<<
 
     # Norm parameters.
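The hunks above thread a new `ITERATION=None` keyword into `clip_grad_norm_fp32` purely so the (currently commented-out) `pax(...)` dumps can be gated on the module-level `DEBUG_ITERATION`. A small sketch of that iteration-gated debugging pattern; the `dump` helper and `fake_clip_step` are illustrative, not code from the diff:

DEBUG_ITERATION = 1  # mirrors the constant added at the top of clip_grads.py

def dump(iteration, **values):
    # Stand-in for the pax(...) calls: silent except at the chosen iteration.
    if iteration != DEBUG_ITERATION:
        return
    for name, value in values.items():
        print("[iter %s] %s = %r" % (iteration, name, value))

def fake_clip_step(grads, ITERATION=None):
    total_norm = sum(abs(g) for g in grads)
    dump(ITERATION, total_norm=total_norm, max_norm=1.0)
    return total_norm

if __name__ == "__main__":
    for it in range(3):          # only it == 1 prints anything
        fake_clip_step([0.5, -1.5, 2.0], ITERATION=it)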
@@ -133,34 +140,42 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
                 total_norm += grad_norm ** norm_type
 
     # >>>
-    # pax(0, {
-    #     "[LOC]" : "[** CALC NORM **]",
-    #     "max_norm" : max_norm,
-    #     "norm_type" : norm_type,
-    #     "grad_norm" : tp(grad_norm),
-    #     "total_norm" : tp(total_norm),
-    # })
+    # if ITERATION == DEBUG_ITERATION:
+    #     pax(0, {
+    #         "[LOC]" : "[** CALC NORM **]",
+    #         "[ITERATION]" : ITERATION,
+    #         "max_norm" : max_norm,
+    #         "norm_type" : norm_type,
+    #         "grad_norm" : tp(grad_norm),
+    #         "total_norm" : tp(total_norm),
+    #     })
     # <<<
 
     # Sum across all model-parallel GPUs.
     # >>>
-    # torch.distributed.all_reduce(total_norm,
-    #                              op=torch.distributed.ReduceOp.SUM,
-    #                              group=mpu.get_model_parallel_group())
+    from megatron import get_args
+    args = get_args()
+    if not args.use_distributed_optimizer:
+        torch.distributed.all_reduce(total_norm,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=mpu.get_model_parallel_group())
     # +++
-    torch.distributed.all_reduce(total_norm,
-                                 op=torch.distributed.ReduceOp.SUM)
+    else:
+        torch.distributed.all_reduce(total_norm,
+                                     op=torch.distributed.ReduceOp.SUM)
     # <<<
     total_norm = total_norm.item() ** (1.0 / norm_type)
 
     # >>>
-    # pax(1, {
-    #     "[LOC]" : "[** CALC NORM **]",
-    #     "max_norm" : max_norm,
-    #     "norm_type" : norm_type,
-    #     "grad_norm" : tp(grad_norm),
-    #     "total_norm" : tp(total_norm),
-    # })
+    # if ITERATION == DEBUG_ITERATION:
+    #     pax(0, {
+    #         "[LOC]" : "[** AFTER REDUCE. **]",
+    #         "[ITERATION]" : ITERATION,
+    #         "max_norm" : max_norm,
+    #         "norm_type" : norm_type,
+    #         "grad_norm" : grad_norm.item(),
+    #         "total_norm" : total_norm,
+    #     })
     # <<<
 
     # Scale.
@@ -172,6 +187,18 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
                              [grads, grads],
                              clip_coeff)
 
+    # >>>
+    # # from pygit2 import Repository
+    # if ITERATION == DEBUG_ITERATION:
+    #     pax(1, {
+    #         "[LOC]" : "[** CLIP / FINAL **]",
+    #         "[ITERATION]" : ITERATION,
+    #         "grads" : grads,
+    #         "clip_coeff" : tp(clip_coeff),
+    #         # "repo" : Repository('.').head.shorthand,
+    #     })
+    # <<<
+
     return total_norm
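The one functional change in clip_grads.py is the reduction of `total_norm`: when `use_distributed_optimizer` is set, the sum is taken over the default (whole-world) process group instead of only the model-parallel group, presumably because the distributed optimizer shards gradients so each rank holds a distinct contribution. A hedged sketch of that branching reduction; the flag and group arguments are assumptions, and the single-process gloo setup exists only to make the example runnable:

import torch
import torch.distributed as dist

def reduce_grad_norm(total_norm, use_distributed_optimizer,
                     model_parallel_group=None, norm_type=2):
    """Sum per-rank norm contributions, then take the norm_type-th root."""
    if not use_distributed_optimizer:
        # Replicated grads: summing over the model-parallel group is enough.
        dist.all_reduce(total_norm, op=dist.ReduceOp.SUM,
                        group=model_parallel_group)
    else:
        # Sharded grads: every rank's partial norm must be summed (world group).
        dist.all_reduce(total_norm, op=dist.ReduceOp.SUM)
    return total_norm.item() ** (1.0 / norm_type)

if __name__ == "__main__":
    dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29500",
                            rank=0, world_size=1)
    print(reduce_grad_norm(torch.tensor(9.0), use_distributed_optimizer=True))  # 3.0
    dist.destroy_process_group()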
megatron/optimizer/optimizer.py  (view file @ 67e23459)

@@ -32,7 +32,7 @@ from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32
 # >>>
 from lutil import pax, tp
-DEBUG_ITERATION = 1 # 10
+DEBUG_ITERATION = 0 # 10
 # <<<
@@ -98,16 +98,12 @@ class MegatronOptimizer(ABC):
         return params
 
 
-    def clip_grad_norm(self, clip_grad):
-        params = self.get_parameters()
+    def clip_grad_norm(self, clip_grad, ITERATION):
         # >>>
-        # pax(0, {
-        #     "clip_grad" : clip_grad,
-        #     # "params": [ (p.tensor_model_parallel, tp(p)) for p in params ],
-        #     "grads" : [ p.grad for p in params ],
-        # })
+        return
         # <<<
-        return clip_grad_norm_fp32(params, clip_grad)
+        params = self.get_parameters()
+        return clip_grad_norm_fp32(params, clip_grad, ITERATION = ITERATION)
 
 
     def count_zeros(self):
@@ -267,6 +263,73 @@ class BaseFloat16Optimizer(MegatronOptimizer):
         return found_inf_flag
 
+    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+    @classmethod
+    def debug_general(cls, ITERATION, key, value):
+
+        from megatron import get_args
+        args = get_args()
+
+        my_rank = torch.distributed.get_rank()
+
+        if ITERATION != DEBUG_ITERATION:
+            return
+
+        for r in range(torch.distributed.get_world_size()):
+            if my_rank == r:
+                print(" + %4s; [r%d]; %s, %.12e." % (
+                    "fix" if args.use_distributed_optimizer else "main",
+                    my_rank,
+                    key,
+                    value))
+            torch.distributed.barrier()
+        torch.distributed.barrier()
+
+        # if my_rank == 0:
+        #     raise Exception("debug.")
+        # else:
+        #     exit(0)
+        exit(0)
+
+    def _debug_main(self, ITERATION, key0, key1, f, ff):
+        count = sum(p.nelement()
+                    for g in self.optimizer.param_groups
+                    for p in g["params"])
+        return self.debug_general(
+            ITERATION,
+            "main/%s, %s [count %d]" % (key1, key0, count),
+            sum(ff(f(p))
+                for g in self.optimizer.param_groups
+                for p in g["params"]).item() / count,
+        )
+    # def debug_main_param_mean(self, ITERATION, key):
+    #     return self._debug_main(
+    #         ITERATION,
+    #         key,
+    #         "param mean",
+    #         lambda p : p,
+    #         torch.mean,
+    #     )
+    def debug_main_param_sum(self, ITERATION, key):
+        return self._debug_main(
+            ITERATION,
+            key,
+            "param sum",
+            # lambda p : p,
+            lambda p : torch.abs(p),
+            torch.sum,
+        )
+    # def debug_main_grad_mean(self, ITERATION, key):
+    #     return self._debug_main(
+    #         ITERATION,
+    #         key,
+    #         "grad mean",
+    #         lambda p : p.grad,
+    #         torch.mean,
+    #     )
+    def debug_main_grad_sum(self, ITERATION, key):
+        return self._debug_main(
+            ITERATION,
+            key,
+            "grad sum",
+            # lambda p : p.grad,
+            lambda p : torch.abs(p.grad),
+            torch.sum,
+        )
+    # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
     @torch.no_grad()
     def step(self, ITERATION):
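The new `debug_general` classmethod above prints one summary line per rank, in rank order, by looping over the world size and placing a barrier after each candidate printer; that keeps multi-rank output from interleaving. A standalone sketch of the same ordering trick (the filename in the comment is illustrative; launch with torchrun to see the ordering across ranks):

import torch.distributed as dist

def print_in_rank_order(message):
    """Each rank prints in turn; the barrier inside the loop enforces the order."""
    my_rank = dist.get_rank()
    for r in range(dist.get_world_size()):
        if my_rank == r:
            print("[rank %d] %s" % (my_rank, message))
        dist.barrier()  # nobody moves on to rank r+1 until rank r has printed
    dist.barrier()

if __name__ == "__main__":
    # e.g. torchrun --nproc_per_node=4 rank_ordered_print.py
    dist.init_process_group("gloo")
    print_in_rank_order("grad sum = 1.234567890123e+00")
    dist.destroy_process_group()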
@@ -279,18 +342,10 @@ class BaseFloat16Optimizer(MegatronOptimizer):
         timers('optimizer-copy-to-main-grad').stop()
 
         # >>>
-        # pax(0, {
-        #     "[LOC]" : "[** BEFORE UNSCALE **]",
-        #     "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
-        #     "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
-        # })
+        # self.debug_main_param_sum(ITERATION)
+        # self.debug_main_grad_sum(ITERATION)
         # <<<
-        # pax(0, {
-        #     "params" : self.get_parameters(), # self.main_param_shards,
-        #     "grads" : [ p.grad for p in self.get_parameters() ], # self.main_param_shards ],
-        # })
 
         # Do unscale, check for inf, and update grad scaler only for
         # the case that grad scaler is provided.
         if self.grad_scaler:
@@ -313,56 +368,33 @@ class BaseFloat16Optimizer(MegatronOptimizer):
             })
             return False, None, None
 
-        # >>>
-        # pax(0, {
-        #     "[LOC]" : "[** BEFORE CLIP **]",
-        #     "clip_grad" : self.clip_grad,
-        #     # "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
-        #     "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
-        # })
-        # <<<
-
         # Clip the main gradients.
         timers('optimizer-clip-main-grad').start()
         grad_norm = None
         if self.clip_grad > 0.0:
-            grad_norm = self.clip_grad_norm(self.clip_grad)
+            grad_norm = self.clip_grad_norm(self.clip_grad, ITERATION)
         timers('optimizer-clip-main-grad').stop()
 
-        # >>>
-        pax(1, {
-            "[LOC]" : "[** BEFORE NONZERO **]",
-            # "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
-            "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
-        })
-        # <<<
-
         # count the zeros in the grads
         num_zeros_in_grad = self.count_zeros() if \
                             self.log_num_zeros_in_grad else None
 
-        # >>>
-        pax(0, {
-            # "main params" : self.get_main_params(),
-            # "main grads" : self.get_main_grads(),
-            **{"param_groups / %d" % i : g for i, g in enumerate(self.optimizer.param_groups)},
-            "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
-        })
-        # <<<
-
         # Step the optimizer.
         self.optimizer.step()
 
+        # >>>
+        # self.debug_main_param_sum(ITERATION, "after step.")
+        self.debug_main_grad_sum(ITERATION, "after step.")
+        # <<<
+
        # Update params from main params.
         timers('optimizer-copy-main-to-model-params').start()
         self._copy_main_params_to_model_params(ITERATION)
         timers('optimizer-copy-main-to-model-params').stop()
 
         # >>>
-        # pax(1, {
-        #     "ITERATION" : ITERATION,
-        #     "model_params" : [ p for m in self.models for p in m.parameters() ],
-        # })
+        self.debug_main_param_sum(ITERATION, "after copy param.")
+        self.debug_main_grad_sum(ITERATION, "after copy param.")
        # <<<
 
         # Successful update.
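The `_debug_main` helper behind these `debug_main_param_sum` / `debug_main_grad_sum` calls collapses all main params (or grads) into one scalar: despite the "sum" in the name, it prints the sum of absolute values divided by the total element count, i.e. a mean |value| over every tensor in `self.optimizer.param_groups`, which is cheap to print per rank and easy to compare between the two optimizer branches. A hedged re-implementation of that reduction as a free function:

import torch

def mean_abs_over_param_groups(param_groups, get_tensor=lambda p: p):
    """Mean of |tensor| over every param in every group, as in _debug_main."""
    count = sum(p.nelement() for g in param_groups for p in g["params"])
    total = sum(torch.abs(get_tensor(p)).sum()
                for g in param_groups for p in g["params"])
    return total.item() / count

if __name__ == "__main__":
    params = [torch.nn.Parameter(torch.full((3,), 2.0)),
              torch.nn.Parameter(torch.full((2,), -4.0))]
    opt = torch.optim.SGD(params, lr=0.1)
    print(mean_abs_over_param_groups(opt.param_groups))  # (3*2 + 2*4) / 5 = 2.8
    # For gradients, pass get_tensor=lambda p: p.grad after a backward pass.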
@@ -674,12 +706,12 @@ class Float16OptimizerWithFloat16Params(BaseFloat16Optimizer):
         _multi_tensor_copy_this_to_that(this=main_data, that=model_data,
                                         overflow_buf=self._dummy_overflow_buf)
 
         # >>>
-        if ITERATION == DEBUG_ITERATION:
-            pax(0, {
-                "** branch **" : "** main. **",
-                "ITERATION" : ITERATION,
-                "model params" : [p for m in self.models for p in m.parameters()],
-            })
+        # if ITERATION == DEBUG_ITERATION:
+        #     pax(0, {
+        #         "** branch **" : "** main. **",
+        #         "ITERATION" : ITERATION,
+        #         "model params" : [p for m in self.models for p in m.parameters()],
+        #     })
         # <<<
@@ -1370,12 +1402,12 @@ class Float16DistributedOptimizer(BaseFloat16Optimizer):
         # })
 
         # >>>
-        if ITERATION == DEBUG_ITERATION:
-            pax(0, {
-                "** branch **" : "** fix. **",
-                "ITERATION" : ITERATION,
-                "model params" : self.get_world_model_params(),
-            })
+        # if ITERATION == DEBUG_ITERATION:
+        #     pax(0, {
+        #         "** branch **" : "** fix. **",
+        #         "ITERATION" : ITERATION,
+        #         "model params" : self.get_world_model_params(),
+        #     })
         # <<<
         # <<<