Commit 67e23459 authored by Lawrence McAfee

more debugging; some cleanup

parent a58653dc
@@ -154,7 +154,7 @@ def get_megatron_optimizer(model,
         opt_ty = Float16DistributedOptimizer \
             if args.use_distributed_optimizer \
             else Float16OptimizerWithFloat16Params
-        return opt_ty(optimizer,
+        opt = opt_ty(optimizer,
                       args.clip_grad,
                       args.log_num_zeros_in_grad,
                       params_have_main_grad,
@@ -162,6 +162,11 @@ def get_megatron_optimizer(model,
                       args.bf16,
                       grad_scaler,
                       model)
+        # >>>
+        # opt.debug_main_param_sum(0, "after init")
+        # opt.debug_main_grad_sum(0, "after init")
+        # <<<
+        return opt
     # <<<

     # FP32.
...
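Aside (not part of the commit): the hunk above only rebinds the constructed optimizer to a local `opt` so the commented-out debug hooks could run on it before it is returned. A minimal, self-contained sketch of that pattern, using placeholder class names rather than Megatron's real `Float16DistributedOptimizer` / `Float16OptimizerWithFloat16Params`:

```python
# Placeholder wrapper classes; only the construction pattern is the point here.
class _MainFp16Optimizer:
    def __init__(self, inner):
        self.inner = inner

    def debug_main_param_sum(self, iteration, tag):
        print("main: iter=%d, %s" % (iteration, tag))


class _DistributedFp16Optimizer(_MainFp16Optimizer):
    def debug_main_param_sum(self, iteration, tag):
        print("fix: iter=%d, %s" % (iteration, tag))


def build_fp16_optimizer(inner_optimizer, use_distributed_optimizer, debug=False):
    # Pick the wrapper class from the flag, mirroring the hunk above.
    opt_cls = (_DistributedFp16Optimizer if use_distributed_optimizer
               else _MainFp16Optimizer)
    opt = opt_cls(inner_optimizer)
    if debug:
        # With the wrapper bound to a local, debug hooks can inspect it
        # right after construction, before the caller ever sees it.
        opt.debug_main_param_sum(0, "after init")
    return opt
```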
@@ -28,9 +28,10 @@ from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate

 # >>>
 from lutil import pax, tp
+DEBUG_ITERATION = 1
 # <<<


-def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
+def clip_grad_norm_fp32(parameters, max_norm, norm_type=2, ITERATION=None):
     """Clips gradient norm of an iterable of parameters whose gradients
     are in fp32.
@@ -49,6 +50,10 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
         Total norm of the parameters (viewed as a single vector).
     """

+    # >>>
+    raise Exception("currently debugging ... don't call me.")
+    # <<<
+
     if isinstance(parameters, torch.Tensor):
         parameters = [parameters]
@@ -80,13 +85,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
     # <<<

     # >>>
-    # pax(0, {
-    #     "[LOC]" : "[** BEFORE CALC NORM **]",
-    #     "max_norm" : max_norm,
-    #     "parameters" : parameters,
-    #     "grads" : grads,
-    #     "grads_for_norm" : grads_for_norm,
-    # })
+    # if ITERATION == DEBUG_ITERATION:
+    #     pax(0, {
+    #         "[LOC]" : "[** BEFORE CALC NORM **]",
+    #         "[ITERATION]" : ITERATION,
+    #         "max_norm" : max_norm,
+    #         "parameters" : parameters,
+    #         # "grads" : grads,
+    #         "grads_for_norm" : grads_for_norm,
+    #     })
     # <<<

     # Norm parameters.
@@ -133,34 +140,42 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
                 total_norm += grad_norm ** norm_type

     # >>>
-    # pax(0, {
-    #     "[LOC]" : "[** CALC NORM **]",
-    #     "max_norm" : max_norm,
-    #     "norm_type" : norm_type,
-    #     "grad_norm" : tp(grad_norm),
-    #     "total_norm" : tp(total_norm),
-    # })
+    # if ITERATION == DEBUG_ITERATION:
+    #     pax(0, {
+    #         "[LOC]" : "[** CALC NORM **]",
+    #         "[ITERATION]" : ITERATION,
+    #         "max_norm" : max_norm,
+    #         "norm_type" : norm_type,
+    #         "grad_norm" : tp(grad_norm),
+    #         "total_norm" : tp(total_norm),
+    #     })
     # <<<

     # Sum across all model-parallel GPUs.
     # >>>
-    # torch.distributed.all_reduce(total_norm,
-    #                              op=torch.distributed.ReduceOp.SUM,
-    #                              group=mpu.get_model_parallel_group())
+    from megatron import get_args
+    args = get_args()
+    if not args.use_distributed_optimizer:
+        torch.distributed.all_reduce(total_norm,
+                                     op=torch.distributed.ReduceOp.SUM,
+                                     group=mpu.get_model_parallel_group())
     # +++
-    torch.distributed.all_reduce(total_norm,
-                                 op=torch.distributed.ReduceOp.SUM)
+    else:
+        torch.distributed.all_reduce(total_norm,
+                                     op=torch.distributed.ReduceOp.SUM)
     # <<<
     total_norm = total_norm.item() ** (1.0 / norm_type)

     # >>>
-    # pax(1, {
-    #     "[LOC]" : "[** CALC NORM **]",
-    #     "max_norm" : max_norm,
-    #     "norm_type" : norm_type,
-    #     "grad_norm" : tp(grad_norm),
-    #     "total_norm" : tp(total_norm),
-    # })
+    # if ITERATION == DEBUG_ITERATION:
+    #     pax(0, {
+    #         "[LOC]" : "[** AFTER REDUCE. **]",
+    #         "[ITERATION]" : ITERATION,
+    #         "max_norm" : max_norm,
+    #         "norm_type" : norm_type,
+    #         "grad_norm" : grad_norm.item(),
+    #         "total_norm" : total_norm,
+    #     })
     # <<<

     # Scale.
@@ -172,6 +187,18 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
             [grads, grads],
             clip_coeff)

+    # >>>
+    # # from pygit2 import Repository
+    # if ITERATION == DEBUG_ITERATION:
+    #     pax(1, {
+    #         "[LOC]" : "[** CLIP / FINAL **]",
+    #         "[ITERATION]" : ITERATION,
+    #         "grads" : grads,
+    #         "clip_coeff" : tp(clip_coeff),
+    #         # "repo" : Repository('.').head.shorthand,
+    #     })
+    # <<<
+
     return total_norm
...
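Aside (not part of the commit): the key functional change in `clip_grad_norm_fp32` above is the choice of process group for the norm reduction. Without the distributed optimizer, each rank holds full fp32 main grads for its model-parallel shard, so the per-rank partial norms are summed over the model-parallel group only; with the distributed optimizer, the main grads are apparently sharded across data-parallel ranks as well, so the partial sums must be reduced over all ranks (each grad element is then owned by exactly one rank). A minimal sketch of that reduction, where `global_grad_norm` is a hypothetical helper and only `mpu.get_model_parallel_group()` comes from the source:

```python
import torch
import torch.distributed as dist

def global_grad_norm(local_grads, norm_type=2.0, group=None):
    """Global grad norm when each rank holds only part of the grads.

    Assumes torch.distributed has already been initialized.
    """
    device = local_grads[0].device if local_grads else torch.device("cpu")
    total = torch.zeros([], dtype=torch.float32, device=device)
    for g in local_grads:
        # This rank's contribution: ||g||_p ** p.
        total += torch.norm(g.float(), norm_type) ** norm_type
    # Sum the partial contributions across ranks, then take the p-th root.
    dist.all_reduce(total, op=dist.ReduceOp.SUM, group=group)
    return total.item() ** (1.0 / norm_type)

# Non-distributed optimizer: each rank has full grads for its model-parallel
# shard, so reduce over the model-parallel group only, e.g.
#   global_grad_norm(grads, group=mpu.get_model_parallel_group())
# Distributed optimizer: main grads are sharded across data-parallel ranks too,
# so reduce over the default (world) group:
#   global_grad_norm(grads)
```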
@@ -32,7 +32,7 @@ from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32

 # >>>
 from lutil import pax, tp
-DEBUG_ITERATION = 1 # 10
+DEBUG_ITERATION = 0 # 10
 # <<<
@@ -98,16 +98,12 @@ class MegatronOptimizer(ABC):
         return params

-    def clip_grad_norm(self, clip_grad):
-        params = self.get_parameters()
+    def clip_grad_norm(self, clip_grad, ITERATION):
         # >>>
-        # pax(0, {
-        #     "clip_grad" : clip_grad,
-        #     # "params": [ (p.tensor_model_parallel, tp(p)) for p in params ],
-        #     "grads" : [ p.grad for p in params ],
-        # })
+        return
         # <<<
-        return clip_grad_norm_fp32(params, clip_grad)
+        params = self.get_parameters()
+        return clip_grad_norm_fp32(params, clip_grad, ITERATION = ITERATION)

     def count_zeros(self):
@@ -267,6 +263,73 @@ class BaseFloat16Optimizer(MegatronOptimizer):

         return found_inf_flag

+    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+    @classmethod
+    def debug_general(cls, ITERATION, key, value):
+
+        from megatron import get_args
+        args = get_args()
+
+        my_rank = torch.distributed.get_rank()
+
+        if ITERATION != DEBUG_ITERATION:
+            return
+
+        for r in range(torch.distributed.get_world_size()):
+            if my_rank == r:
+                print(" + %4s; [r%d]; %s, %.12e." % ("fix" if args.use_distributed_optimizer else "main", my_rank, key, value))
+            torch.distributed.barrier()
+        torch.distributed.barrier()
+
+        # if my_rank == 0:
+        #     raise Exception("debug.")
+        # else:
+        #     exit(0)
+        exit(0)
+
+    def _debug_main(self, ITERATION, key0, key1, f, ff):
+        count = sum(
+            p.nelement()
+            for g in self.optimizer.param_groups
+            for p in g["params"]
+        )
+        return self.debug_general(
+            ITERATION,
+            "main/%s, %s [count %d]" % (key1, key0, count),
+            sum(ff(f(p))
+                for g in self.optimizer.param_groups
+                for p in g["params"]).item() / count,
+        )
+    # def debug_main_param_mean(self, ITERATION, key):
+    #     return self._debug_main(
+    #         ITERATION,
+    #         key,
+    #         "param mean",
+    #         lambda p : p,
+    #         torch.mean,
+    #     )
+    def debug_main_param_sum(self, ITERATION, key):
+        return self._debug_main(
+            ITERATION,
+            key,
+            "param sum",
+            # lambda p : p,
+            lambda p : torch.abs(p),
+            torch.sum,
+        )
+    # def debug_main_grad_mean(self, ITERATION, key):
+    #     return self._debug_main(
+    #         ITERATION,
+    #         key,
+    #         "grad mean",
+    #         lambda p : p.grad,
+    #         torch.mean,
+    #     )
+    def debug_main_grad_sum(self, ITERATION, key):
+        return self._debug_main(
+            ITERATION,
+            key,
+            "grad sum",
+            # lambda p : p.grad,
+            lambda p : torch.abs(p.grad),
+            torch.sum,
+        )
+    # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
     @torch.no_grad()
     def step(self, ITERATION):
@@ -279,18 +342,10 @@ class BaseFloat16Optimizer(MegatronOptimizer):
         timers('optimizer-copy-to-main-grad').stop()

         # >>>
-        # pax(0, {
-        #     "[LOC]" : "[** BEFORE UNSCALE **]",
-        #     "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
-        #     "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
-        # })
+        # self.debug_main_param_sum(ITERATION)
+        # self.debug_main_grad_sum(ITERATION)
         # <<<

-        # pax(0, {
-        #     "params" : self.get_parameters(), # self.main_param_shards,
-        #     "grads" : [ p.grad for p in self.get_parameters() ], # self.main_param_shards ],
-        # })
-
         # Do unscale, check for inf, and update grad scaler only for
         # the case that grad scaler is provided.
         if self.grad_scaler:
@@ -313,56 +368,33 @@ class BaseFloat16Optimizer(MegatronOptimizer):
                 })
                 return False, None, None

-        # >>>
-        # pax(0, {
-        #     "[LOC]" : "[** BEFORE CLIP **]",
-        #     "clip_grad" : self.clip_grad,
-        #     # "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
-        #     "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
-        # })
-        # <<<
-
         # Clip the main gradients.
         timers('optimizer-clip-main-grad').start()
         grad_norm = None
         if self.clip_grad > 0.0:
-            grad_norm = self.clip_grad_norm(self.clip_grad)
+            grad_norm = self.clip_grad_norm(self.clip_grad, ITERATION)
         timers('optimizer-clip-main-grad').stop()

-        # >>>
-        pax(1, {
-            "[LOC]" : "[** BEFORE NONZERO **]",
-            # "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
-            "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
-        })
-        # <<<
-
         # count the zeros in the grads
         num_zeros_in_grad = self.count_zeros() if \
             self.log_num_zeros_in_grad else None

-        # >>>
-        pax(0, {
-            # "main params" : self.get_main_params(),
-            # "main grads" : self.get_main_grads(),
-            **{"param_groups / %d" % i : g for i, g in enumerate(self.optimizer.param_groups)},
-            "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
-        })
-        # <<<
-
         # Step the optimizer.
         self.optimizer.step()

+        # >>>
+        # self.debug_main_param_sum(ITERATION, "after step.")
+        self.debug_main_grad_sum(ITERATION, "after step.")
+        # <<<
+
         # Update params from main params.
         timers('optimizer-copy-main-to-model-params').start()
         self._copy_main_params_to_model_params(ITERATION)
         timers('optimizer-copy-main-to-model-params').stop()

         # >>>
-        # pax(1, {
-        #     "ITERATION" : ITERATION,
-        #     "model_params" : [ p for m in self.models for p in m.parameters() ],
-        # })
+        self.debug_main_param_sum(ITERATION, "after copy param.")
+        self.debug_main_grad_sum(ITERATION, "after copy param.")
         # <<<

         # Successful update.
@@ -674,12 +706,12 @@ class Float16OptimizerWithFloat16Params(BaseFloat16Optimizer):
         _multi_tensor_copy_this_to_that(this=main_data, that=model_data,
                                         overflow_buf=self._dummy_overflow_buf)
         # >>>
-        if ITERATION == DEBUG_ITERATION:
-            pax(0, {
-                "** branch **" : "** main. **",
-                "ITERATION" : ITERATION,
-                "model params" : [p for m in self.models for p in m.parameters()],
-            })
+        # if ITERATION == DEBUG_ITERATION:
+        #     pax(0, {
+        #         "** branch **" : "** main. **",
+        #         "ITERATION" : ITERATION,
+        #         "model params" : [p for m in self.models for p in m.parameters()],
+        #     })
         # <<<
@@ -1370,12 +1402,12 @@ class Float16DistributedOptimizer(BaseFloat16Optimizer):
         # })
         # >>>
-        if ITERATION == DEBUG_ITERATION:
-            pax(0, {
-                "** branch **" : "** fix. **",
-                "ITERATION" : ITERATION,
-                "model params" : self.get_world_model_params(),
-            })
+        # if ITERATION == DEBUG_ITERATION:
+        #     pax(0, {
+        #         "** branch **" : "** fix. **",
+        #         "ITERATION" : ITERATION,
+        #         "model params" : self.get_world_model_params(),
+        #     })
         # <<<
         # <<<
...
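Aside (not part of the commit): the new `debug_general` / `_debug_main` helpers above reduce each rank's main params or grads to a single scalar (sum of absolute values divided by the element count) and print it in rank order, so runs of the baseline ("main") and distributed-optimizer ("fix") branches can be compared line by line. A self-contained sketch of that pattern with hypothetical helper names:

```python
import torch
import torch.distributed as dist

def print_in_rank_order(tag, value):
    """Print one scalar from every rank, one rank at a time.

    Assumes torch.distributed has already been initialized.
    """
    my_rank = dist.get_rank()
    for r in range(dist.get_world_size()):
        if my_rank == r:
            print(" + [r%d]; %s, %.12e." % (my_rank, tag, value))
        # Barrier keeps the output ordered: rank r prints before rank r+1 starts.
        dist.barrier()
    dist.barrier()

def abs_sum_per_element(tensors):
    """Scalar summary comparable across runs: sum(|x|) / total element count."""
    count = sum(t.nelement() for t in tensors)
    total = sum(torch.abs(t.detach()).sum() for t in tensors)
    return (total / count).item()

# Example use inside an optimizer step (names are placeholders):
#   main_params = [p for g in optimizer.param_groups for p in g["params"]]
#   print_in_rank_order("param sum, after step", abs_sum_per_element(main_params))
```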