Commit 087cbff1 authored by Lawrence McAfee

removed debug_base/main/model methods.

parent be8de1b3
@@ -99,11 +99,6 @@ def clip_grad_norm_fp32(parameters, grads_for_norm,
                                      group=model_parallel_group)
         total_norm = total_norm.item() ** (1.0 / norm_type)
-    # >>>
-    # from lutil import pax, tp, print_seq
-    # print_seq("norm : grad %s, total %s." % (grad_norm.item(), total_norm))
-    # <<<
     # Scale.
     clip_coeff = max_norm / (total_norm + 1.0e-6)
     if clip_coeff < 1.0:
...
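For context on the code this hunk touches, here is a minimal standalone sketch of the clipping step that the removed debug print sat next to, in plain PyTorch. The name clip_grads_by_total_norm and the bare grads list are illustrative only; Megatron's clip_grad_norm_fp32 additionally all-reduces the accumulated norm across the model-parallel group before applying the same coefficient on every rank.

import torch

def clip_grads_by_total_norm(grads, max_norm, norm_type=2.0, eps=1.0e-6):
    # Combined norm over all gradients: the norm of the per-tensor norms.
    total_norm = torch.norm(
        torch.stack([torch.norm(g.detach(), norm_type) for g in grads]),
        norm_type,
    ).item()
    # Scale in place only when the combined norm exceeds max_norm.
    clip_coeff = max_norm / (total_norm + eps)
    if clip_coeff < 1.0:
        for g in grads:
            g.detach().mul_(clip_coeff)
    return total_norm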
@@ -27,10 +27,6 @@ from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate
 from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper
-# >>>
-from lutil import pax, tp, print_seq
-# <<<
 class Range:
@@ -363,14 +359,6 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
         fp32_from_float16_groups as a memory optimization to reduce
         fragmentation; in the case of set_to_none==True, the space
         used by this field can be safely deallocated at this point."""
-        # >>>
-        # params = [ p for g in self.shard_fp32_groups for p in g ]
-        # pax(0, {
-        #     "shard_fp32_groups" : self.shard_fp32_groups,
-        #     "params" : params,
-        #     "grads" : [ p.grad for p in params ],
-        # })
-        # <<<
         for groups in (
                 self.full_float16_groups,
                 self.full_fp32_groups,
...
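The docstring above refers to the set_to_none memory optimization used when zeroing gradients. As a rough sketch of what a helper like _zero_grad_group_helper does (an assumption about its shape, following PyTorch's own zero_grad logic, not the exact Megatron implementation):

def zero_grad_group(group, set_to_none):
    # Zero (or drop) the .grad of every parameter in a group. With
    # set_to_none=True the gradient tensors are released entirely, letting
    # the allocator reclaim the memory and reducing fragmentation; with
    # set_to_none=False the existing tensors are kept and zeroed in place.
    for param in group:
        if param.grad is None:
            continue
        if set_to_none:
            param.grad = None
        else:
            if param.grad.grad_fn is not None:
                param.grad.detach_()
            else:
                param.grad.requires_grad_(False)
            param.grad.zero_()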
@@ -33,10 +33,6 @@ from megatron.utils import unwrap_model
 from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32
-# >>>
-from lutil import pax, tp, print_seq
-# <<<
 def _zero_grad_group_helper(group, set_to_none):
     """Zero out the gradient for a group of parameters.
@@ -349,63 +345,6 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
         return found_inf_flag
-    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    # @classmethod
-    # def debug_base(cls, ITERATION, key, value):
-    #     from megatron import get_args
-    #     args = get_args()
-    #     my_rank = torch.distributed.get_rank()
-    #     DEBUG_ITERATION = ITERATION
-    #     if ITERATION != DEBUG_ITERATION:
-    #         return
-    #     for r in range(torch.distributed.get_world_size()):
-    #         if my_rank == r:
-    #             # prefix = " + "
-    #             prefix = ""
-    #             print("%sbr/%s; [r%d, i%d]; %s, %.12e" % (prefix, "fix " if args.use_distributed_optimizer else "main", my_rank, ITERATION, key, value))
-    #         torch.distributed.barrier()
-    #     torch.distributed.barrier()
-    #     # if my_rank == 0:
-    #     #     raise Exception("debug.")
-    #     # else:
-    #     #     exit(0)
-    #     exit(0)
-    # def debug_model(self, ITERATION, key, use_grad):
-    #     use_grad = bool(use_grad)
-    #     tensors = [
-    #         (p.main_grad.float() if use_grad else p.float())
-    #         for m in self.models for p in m.parameters()
-    #     ]
-    #     count = sum(t.nelement() for t in tensors)
-    #     return self.debug_base(
-    #         ITERATION,
-    #         "model/%s, %s [count %d]" % (
-    #             "grad" if use_grad else "param",
-    #             key,
-    #             count,
-    #         ),
-    #         # sum(torch.sum(torch.abs(t)) for t in tensors).item() / count,
-    #         sum(torch.sum(torch.abs(t)) for t in tensors),
-    #     )
-    # def debug_main(self, ITERATION, key, use_grad):
-    #     use_grad = bool(use_grad)
-    #     tensors = [
-    #         p.grad if use_grad else p
-    #         for g in self.optimizer.param_groups
-    #         for p in g["params"]
-    #     ]
-    #     tensors = [ t.float() for t in tensors ]
-    #     count = sum(t.nelement() for t in tensors)
-    #     return self.debug_base(
-    #         ITERATION,
-    #         "main/%s, %s [count %d]" % (
-    #             "grad" if use_grad else "param",
-    #             key,
-    #             count,
-    #         ),
-    #         sum(torch.sum(torch.abs(t)) for t in tensors),
-    #     )
-    # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
     @torch.no_grad()
     def step(self, args, timers):
...
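The removed debug_base/debug_model/debug_main helpers reduced each rank's parameters or gradients to a single scalar (sum of absolute values) and printed it in rank order, so runs with and without the distributed optimizer could be diffed line by line. A trimmed, renamed sketch of that cross-rank checksum technique, assuming an initialized torch.distributed process group (the function name print_tensor_checksum is illustrative, not part of Megatron):

import torch

def print_tensor_checksum(tag, tensors, group=None):
    # One scalar summary per rank, printed in rank order with barriers so
    # output from different ranks does not interleave.
    total = sum(t.detach().float().abs().sum() for t in tensors)
    count = sum(t.nelement() for t in tensors)
    rank = torch.distributed.get_rank(group)
    for r in range(torch.distributed.get_world_size(group)):
        if rank == r:
            print("[r%d] %s: count %d, sum(|x|) %.12e"
                  % (rank, tag, count, total.item()))
        torch.distributed.barrier(group)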