Commit a58653dc authored by Lawrence McAfee

clip grad: reduce norm across world [ not just the model-parallel group ]

parent 371a8828
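
The substantive change in this commit is in the clip_grad_norm_fp32 hunk below: the explicit group argument is dropped from the gradient-norm all_reduce, so the sum runs over the default process group (all ranks) instead of only the model-parallel group. A minimal before/after sketch, assuming torch.distributed is initialized and mpu is Megatron's parallel-state module:

    # Before: norm contributions are summed only within the model-parallel group.
    torch.distributed.all_reduce(total_norm,
                                 op=torch.distributed.ReduceOp.SUM,
                                 group=mpu.get_model_parallel_group())

    # After: no group argument, so the default (world) process group is used and
    # every rank contributes to, and receives, the same total.
    torch.distributed.all_reduce(total_norm,
                                 op=torch.distributed.ReduceOp.SUM)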
@@ -79,10 +79,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
# })
# <<<
# pax(1, {
# >>>
# pax(0, {
# "[LOC]" : "[** BEFORE CALC NORM **]",
# "max_norm" : max_norm,
# "parameters" : parameters,
# "grads" : grads,
# "grads_for_norm" : grads_for_norm,
# })
# <<<
# Norm parameters.
max_norm = float(max_norm)
@@ -127,12 +132,37 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
grad_norm = torch.norm(grad, norm_type)
total_norm += grad_norm ** norm_type
# >>>
# pax(0, {
# "[LOC]" : "[** CALC NORM **]",
# "max_norm" : max_norm,
# "norm_type" : norm_type,
# "grad_norm" : tp(grad_norm),
# "total_norm" : tp(total_norm),
# })
# <<<
# Sum across all model-parallel GPUs.
# >>>
# torch.distributed.all_reduce(total_norm,
# op=torch.distributed.ReduceOp.SUM,
# group=mpu.get_model_parallel_group())
# +++
torch.distributed.all_reduce(total_norm,
op=torch.distributed.ReduceOp.SUM,
group=mpu.get_model_parallel_group())
op=torch.distributed.ReduceOp.SUM)
# <<<
total_norm = total_norm.item() ** (1.0 / norm_type)
# >>>
# pax(1, {
# "[LOC]" : "[** CALC NORM **]",
# "max_norm" : max_norm,
# "norm_type" : norm_type,
# "grad_norm" : tp(grad_norm),
# "total_norm" : tp(total_norm),
# })
# <<<
# Scale.
clip_coeff = max_norm / (total_norm + 1.0e-6)
if clip_coeff < 1.0:
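
Putting the hunk above together: the p-th powers of the per-tensor norms are accumulated locally, summed across all ranks, and the p-th root gives the global norm; gradients are only scaled when that norm exceeds max_norm. A condensed, illustrative sketch of just this math (the full function also handles other norm types and, in Megatron, typically uses fused multi-tensor kernels for the scaling; those details are assumptions, not shown in this hunk):

    # Accumulate |g|_p ** p over the local gradients.
    total_norm = torch.zeros(1, dtype=torch.float, device='cuda')
    for grad in grads_for_norm:
        grad_norm = torch.norm(grad, norm_type)   # local p-norm of one grad
        total_norm += grad_norm ** norm_type

    # Sum the p-th powers across ALL ranks (default/world group), then take
    # the p-th root to get the global norm.
    torch.distributed.all_reduce(total_norm,
                                 op=torch.distributed.ReduceOp.SUM)
    total_norm = total_norm.item() ** (1.0 / norm_type)

    # Scale gradients in place only if the global norm exceeds max_norm.
    clip_coeff = max_norm / (total_norm + 1.0e-6)
    if clip_coeff < 1.0:
        for grad in grads:
            grad.detach().mul_(clip_coeff)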
@@ -103,7 +103,7 @@ class MegatronOptimizer(ABC):
# >>>
# pax(0, {
# "clip_grad" : clip_grad,
# "params": [ (p.tensor_model_parallel, tp(p)) for p in params ],
# # "params": [ (p.tensor_model_parallel, tp(p)) for p in params ],
# "grads" : [ p.grad for p in params ],
# })
# <<<
@@ -314,11 +314,12 @@ class BaseFloat16Optimizer(MegatronOptimizer):
return False, None, None
# >>>
pax(0, {
"[LOC]" : "[** BEFORE CLIP **]",
"param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
"param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
})
# pax(0, {
# "[LOC]" : "[** BEFORE CLIP **]",
# "clip_grad" : self.clip_grad,
# # "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
# "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
# })
# <<<
# Clip the main gradients.
@@ -328,6 +329,14 @@ class BaseFloat16Optimizer(MegatronOptimizer):
grad_norm = self.clip_grad_norm(self.clip_grad)
timers('optimizer-clip-main-grad').stop()
# >>>
pax(1, {
"[LOC]" : "[** BEFORE NONZERO **]",
# "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
"param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
})
# <<<
# count the zeros in the grads
num_zeros_in_grad = self.count_zeros() if \
self.log_num_zeros_in_grad else None
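
The optimizer hunks above funnel into the same norm computation: step() calls self.clip_grad_norm(self.clip_grad), and the returned pre-clip global norm (grad_norm) plus the optional zero-count are what the caller gets back; the uncommented pax(1, ...) call appears to be a debug probe left active by this commit. A minimal sketch of the assumed base-class delegation (the body of clip_grad_norm is not shown in this diff, so the method below is an assumption, not the actual implementation):

    # Assumed delegation: MegatronOptimizer.clip_grad_norm gathers the main
    # parameters and hands them to clip_grad_norm_fp32, whose all_reduce now
    # spans the world group.
    def clip_grad_norm(self, clip_grad):
        params = self.get_parameters()
        return clip_grad_norm_fp32(params, clip_grad)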