Commit 67e23459 authored by Lawrence McAfee

more debugging; some cleanup

parent a58653dc
@@ -154,7 +154,7 @@ def get_megatron_optimizer(model,
opt_ty = Float16DistributedOptimizer \
if args.use_distributed_optimizer \
else Float16OptimizerWithFloat16Params
return opt_ty(optimizer,
opt = opt_ty(optimizer,
args.clip_grad,
args.log_num_zeros_in_grad,
params_have_main_grad,
@@ -162,6 +162,11 @@ def get_megatron_optimizer(model,
args.bf16,
grad_scaler,
model)
# >>>
# opt.debug_main_param_sum(0, "after init")
# opt.debug_main_grad_sum(0, "after init")
# <<<
return opt
# <<<
# FP32.
......
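The hunk above changes only the tail of the FP16 branch of get_megatron_optimizer: the constructed wrapper is now bound to a local name, opt, so that debug checks (currently commented out) can run before the wrapper is returned. A minimal, self-contained sketch of that shape, with placeholder classes standing in for Float16DistributedOptimizer and Float16OptimizerWithFloat16Params:

class PlainWrapper:
    def __init__(self, inner):
        self.inner = inner

class DistributedWrapper(PlainWrapper):
    pass

def build_wrapper(inner_optimizer, use_distributed_optimizer):
    # Pick the wrapper class based on the distributed-optimizer flag.
    opt_ty = DistributedWrapper if use_distributed_optimizer else PlainWrapper
    opt = opt_ty(inner_optimizer)
    # Debug hooks (e.g. the commented-out debug_main_param_sum calls above)
    # would run here, before the wrapper is returned.
    return opt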
@@ -28,9 +28,10 @@ from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate
# >>>
from lutil import pax, tp
DEBUG_ITERATION = 1
# <<<
def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
def clip_grad_norm_fp32(parameters, max_norm, norm_type=2, ITERATION=None):
"""Clips gradient norm of an iterable of parameters whose gradients
are in fp32.
@@ -49,6 +50,10 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
Total norm of the parameters (viewed as a single vector).
"""
# >>>
raise Exception("currently debugging ... don't call me.")
# <<<
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
@@ -80,13 +85,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
# <<<
# >>>
# pax(0, {
# "[LOC]" : "[** BEFORE CALC NORM **]",
# "max_norm" : max_norm,
# "parameters" : parameters,
# "grads" : grads,
# "grads_for_norm" : grads_for_norm,
# })
# if ITERATION == DEBUG_ITERATION:
# pax(0, {
# "[LOC]" : "[** BEFORE CALC NORM **]",
# "[ITERATION]" : ITERATION,
# "max_norm" : max_norm,
# "parameters" : parameters,
# # "grads" : grads,
# "grads_for_norm" : grads_for_norm,
# })
# <<<
# Norm parameters.
@@ -133,34 +140,42 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
total_norm += grad_norm ** norm_type
# >>>
# pax(0, {
# "[LOC]" : "[** CALC NORM **]",
# "max_norm" : max_norm,
# "norm_type" : norm_type,
# "grad_norm" : tp(grad_norm),
# "total_norm" : tp(total_norm),
# })
# if ITERATION == DEBUG_ITERATION:
# pax(0, {
# "[LOC]" : "[** CALC NORM **]",
# "[ITERATION]" : ITERATION,
# "max_norm" : max_norm,
# "norm_type" : norm_type,
# "grad_norm" : tp(grad_norm),
# "total_norm" : tp(total_norm),
# })
# <<<
# Sum across all model-parallel GPUs.
# >>>
# torch.distributed.all_reduce(total_norm,
# op=torch.distributed.ReduceOp.SUM,
# group=mpu.get_model_parallel_group())
from megatron import get_args
args = get_args()
if not args.use_distributed_optimizer:
torch.distributed.all_reduce(total_norm,
op=torch.distributed.ReduceOp.SUM,
group=mpu.get_model_parallel_group())
# +++
torch.distributed.all_reduce(total_norm,
op=torch.distributed.ReduceOp.SUM)
else:
torch.distributed.all_reduce(total_norm,
op=torch.distributed.ReduceOp.SUM)
# <<<
total_norm = total_norm.item() ** (1.0 / norm_type)
# >>>
# pax(1, {
# "[LOC]" : "[** CALC NORM **]",
# "max_norm" : max_norm,
# "norm_type" : norm_type,
# "grad_norm" : tp(grad_norm),
# "total_norm" : tp(total_norm),
# })
# if ITERATION == DEBUG_ITERATION:
# pax(0, {
# "[LOC]" : "[** AFTER REDUCE. **]",
# "[ITERATION]" : ITERATION,
# "max_norm" : max_norm,
# "norm_type" : norm_type,
# "grad_norm" : grad_norm.item(),
# "total_norm" : total_norm,
# })
# <<<
# Scale.
@@ -172,6 +187,18 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
[grads, grads],
clip_coeff)
# >>>
# # from pygit2 import Repository
# if ITERATION == DEBUG_ITERATION:
# pax(1, {
# "[LOC]" : "[** CLIP / FINAL **]",
# "[ITERATION]" : ITERATION,
# "grads" : grads,
# "clip_coeff" : tp(clip_coeff),
# # "repo" : Repository('.').head.shorthand,
# })
# <<<
return total_norm
......
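The functional change in clip_grad_norm_fp32 above is the process group used to reduce the total norm: with the distributed optimizer the main gradients are sharded across data-parallel ranks, so the summed squared 2-norms are all-reduced over every rank (the default group) rather than only over the model-parallel group. A minimal, self-contained sketch of that reduction pattern, assuming torch.distributed is already initialized; the function name is illustrative:

import torch

def global_grad_norm(local_grads, group=None):
    # Sum the squared local 2-norms held by this rank.
    total_norm = torch.zeros(1, dtype=torch.float,
                             device=local_grads[0].device)
    for grad in local_grads:
        total_norm += torch.norm(grad, 2) ** 2
    # Reduce across the chosen group (group=None means all ranks, matching
    # the use_distributed_optimizer branch above), then take the root.
    torch.distributed.all_reduce(total_norm,
                                 op=torch.distributed.ReduceOp.SUM,
                                 group=group)
    return total_norm.item() ** 0.5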
@@ -32,7 +32,7 @@ from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32
# >>>
from lutil import pax, tp
DEBUG_ITERATION = 1 # 10
DEBUG_ITERATION = 0 # 10
# <<<
@@ -98,16 +98,12 @@ class MegatronOptimizer(ABC):
return params
def clip_grad_norm(self, clip_grad):
params = self.get_parameters()
def clip_grad_norm(self, clip_grad, ITERATION):
# >>>
# pax(0, {
# "clip_grad" : clip_grad,
# # "params": [ (p.tensor_model_parallel, tp(p)) for p in params ],
# "grads" : [ p.grad for p in params ],
# })
return
# <<<
return clip_grad_norm_fp32(params, clip_grad)
params = self.get_parameters()
return clip_grad_norm_fp32(params, clip_grad, ITERATION = ITERATION)
def count_zeros(self):
@@ -267,6 +263,73 @@ class BaseFloat16Optimizer(MegatronOptimizer):
return found_inf_flag
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
@classmethod
def debug_general(cls, ITERATION, key, value):
from megatron import get_args
args = get_args()
my_rank = torch.distributed.get_rank()
if ITERATION != DEBUG_ITERATION:
return
for r in range(torch.distributed.get_world_size()):
if my_rank == r:
print(" + %4s; [r%d]; %s, %.12e." % ("fix" if args.use_distributed_optimizer else "main", my_rank, key, value))
torch.distributed.barrier()
torch.distributed.barrier()
# if my_rank == 0:
# raise Exception("debug.")
# else:
# exit(0)
exit(0)
def _debug_main(self, ITERATION, key0, key1, f, ff):
count = sum(
p.nelement()
for g in self.optimizer.param_groups
for p in g["params"]
)
return self.debug_general(
ITERATION,
"main/%s, %s [count %d]" % (key1, key0, count),
sum(ff(f(p))
for g in self.optimizer.param_groups
for p in g["params"]).item() / count,
)
# def debug_main_param_mean(self, ITERATION, key):
# return self._debug_main(
# ITERATION,
# key,
# "param mean",
# lambda p : p,
# torch.mean,
# )
def debug_main_param_sum(self, ITERATION, key):
return self._debug_main(
ITERATION,
key,
"param sum",
# lambda p : p,
lambda p : torch.abs(p),
torch.sum,
)
# def debug_main_grad_mean(self, ITERATION, key):
# return self._debug_main(
# ITERATION,
# key,
# "grad mean",
# lambda p : p.grad,
# torch.mean,
# )
def debug_main_grad_sum(self, ITERATION, key):
return self._debug_main(
ITERATION,
key,
"grad sum",
# lambda p : p.grad,
lambda p : torch.abs(p.grad),
torch.sum,
)
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
@torch.no_grad()
def step(self, ITERATION):
@@ -279,18 +342,10 @@ class BaseFloat16Optimizer(MegatronOptimizer):
timers('optimizer-copy-to-main-grad').stop()
# >>>
# pax(0, {
# "[LOC]" : "[** BEFORE UNSCALE **]",
# "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
# "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
# })
# self.debug_main_param_sum(ITERATION)
# self.debug_main_grad_sum(ITERATION)
# <<<
# pax(0, {
# "params" : self.get_parameters(), # self.main_param_shards,
# "grads" : [ p.grad for p in self.get_parameters() ], # self.main_param_shards ],
# })
# Do unscale, check for inf, and update grad scaler only for
# the case that grad scaler is provided.
if self.grad_scaler:
@@ -313,56 +368,33 @@ class BaseFloat16Optimizer(MegatronOptimizer):
})
return False, None, None
# >>>
# pax(0, {
# "[LOC]" : "[** BEFORE CLIP **]",
# "clip_grad" : self.clip_grad,
# # "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
# "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
# })
# <<<
# Clip the main gradients.
timers('optimizer-clip-main-grad').start()
grad_norm = None
if self.clip_grad > 0.0:
grad_norm = self.clip_grad_norm(self.clip_grad)
grad_norm = self.clip_grad_norm(self.clip_grad, ITERATION)
timers('optimizer-clip-main-grad').stop()
# >>>
pax(1, {
"[LOC]" : "[** BEFORE NONZERO **]",
# "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
"param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
})
# <<<
# count the zeros in the grads
num_zeros_in_grad = self.count_zeros() if \
self.log_num_zeros_in_grad else None
# >>>
pax(0, {
# "main params" : self.get_main_params(),
# "main grads" : self.get_main_grads(),
**{"param_groups / %d" % i : g for i, g in enumerate(self.optimizer.param_groups)},
"param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
})
# <<<
# Step the optimizer.
self.optimizer.step()
# >>>
# self.debug_main_param_sum(ITERATION, "after step.")
self.debug_main_grad_sum(ITERATION, "after step.")
# <<<
# Update params from main params.
timers('optimizer-copy-main-to-model-params').start()
self._copy_main_params_to_model_params(ITERATION)
timers('optimizer-copy-main-to-model-params').stop()
# >>>
# pax(1, {
# "ITERATION" : ITERATION,
# "model_params" : [ p for m in self.models for p in m.parameters() ],
# })
self.debug_main_param_sum(ITERATION, "after copy param.")
self.debug_main_grad_sum(ITERATION, "after copy param.")
# <<<
# Successful update.
@@ -674,12 +706,12 @@ class Float16OptimizerWithFloat16Params(BaseFloat16Optimizer):
_multi_tensor_copy_this_to_that(this=main_data, that=model_data,
overflow_buf=self._dummy_overflow_buf)
# >>>
if ITERATION == DEBUG_ITERATION:
pax(0, {
"** branch **" : "** main. **",
"ITERATION" : ITERATION,
"model params" : [p for m in self.models for p in m.parameters()],
})
# if ITERATION == DEBUG_ITERATION:
# pax(0, {
# "** branch **" : "** main. **",
# "ITERATION" : ITERATION,
# "model params" : [p for m in self.models for p in m.parameters()],
# })
# <<<
@@ -1370,12 +1402,12 @@ class Float16DistributedOptimizer(BaseFloat16Optimizer):
# })
# >>>
if ITERATION == DEBUG_ITERATION:
pax(0, {
"** branch **" : "** fix. **",
"ITERATION" : ITERATION,
"model params" : self.get_world_model_params(),
})
# if ITERATION == DEBUG_ITERATION:
# pax(0, {
# "** branch **" : "** fix. **",
# "ITERATION" : ITERATION,
# "model params" : self.get_world_model_params(),
# })
# <<<
# <<<
......
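The debug helpers added to BaseFloat16Optimizer follow two simple patterns: debug_general serializes its output by rank, with each rank printing behind a barrier and every rank exiting once DEBUG_ITERATION is reached, and _debug_main reports a per-element mean of |main param| or |main grad| over all param groups. A standalone sketch of both patterns, assuming torch.distributed is already initialized and using illustrative names:

import torch

def print_in_rank_order(message):
    # One line per rank; the barrier inside the loop serializes the output
    # so it appears in rank order.
    my_rank = torch.distributed.get_rank()
    for rank in range(torch.distributed.get_world_size()):
        if my_rank == rank:
            print("[r%d] %s" % (my_rank, message))
        torch.distributed.barrier()

def mean_abs(tensors):
    # Per-element mean of absolute values over a list of tensors; this is
    # the quantity _debug_main reports for the main params / grads.
    count = sum(t.nelement() for t in tensors)
    return sum(torch.sum(torch.abs(t)) for t in tensors).item() / count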