Commit 0aff3629 authored by Rewon Child

Update argument names and fix merge error

parent 41a64613
@@ -308,7 +308,7 @@ def _add_logging_args(parser):
     group.add_argument('--log-params-norm', action='store_true',
                        help='If set, calculate and log parameters norm.')
-    group.add_argument('--log-zeros', action='store_true',
+    group.add_argument('--log-num-zeros-in-grad', action='store_true',
                        help='If set, calculate and log the number of zeros in gradient.')
     group.add_argument('--tensorboard-log-interval', type=int, default=1,
                        help='Report to tensorboard interval.')
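For reference, argparse converts the dashes in the renamed flag into underscores, which is why the optimizer code below reads `args.log_num_zeros_in_grad`. A standalone sketch that mirrors the flag definitions above (not the full Megatron-LM argument parser):

```python
import argparse

# Standalone sketch mirroring the renamed logging flags above; the real
# definitions live in Megatron-LM's _add_logging_args().
parser = argparse.ArgumentParser()
parser.add_argument('--log-params-norm', action='store_true',
                    help='If set, calculate and log parameters norm.')
parser.add_argument('--log-num-zeros-in-grad', action='store_true',
                    help='If set, calculate and log the number of zeros in gradient.')

args = parser.parse_args(['--log-num-zeros-in-grad'])
assert args.log_num_zeros_in_grad  # dashes in the flag become underscores
```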
@@ -84,7 +84,7 @@ def get_megatron_optimizer(model):
                 hysteresis=args.hysteresis)
         # Megatron optimizer.
         return FP16OptimizerWithFP16Params(optimizer, grad_scaler,
-                                           args.clip_grad, args.log_zeros)
+                                           args.clip_grad, args.log_num_zeros_in_grad)
     # FP32.
-    return FP32Optimizer(optimizer, args.clip_grad, args.log_zeros)
+    return FP32Optimizer(optimizer, args.clip_grad, args.log_num_zeros_in_grad)
@@ -139,12 +139,12 @@ class MegatronOptimizer(ABC):
 class FP16OptimizerWithFP16Params(MegatronOptimizer):
-    def __init__(self, optimizer, grad_scaler, clip_grad, log_zeros):
+    def __init__(self, optimizer, grad_scaler, clip_grad, log_num_zeros_in_grad):
         super(FP16OptimizerWithFP16Params, self).__init__(optimizer)
         self.grad_scaler = grad_scaler
         self.clip_grad = clip_grad
-        self.log_zeros = log_zeros
+        self.log_num_zeros_in_grad = log_num_zeros_in_grad
         # Tensor used to determine if a nan/if has happend.
         # Any non-zero value indicates inf/nan.
@@ -329,7 +329,7 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
             timers('optimizer-clip-main-grad').stop()
         # count the zeros in the grads
-        num_zeros = self.count_zeros() if self.log_zeros else None
+        num_zeros_in_grad = self.count_zeros() if self.log_num_zeros_in_grad else None
         # Step the optimizer.
         self.optimizer.step()
@@ -340,7 +340,7 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
         timers('optimizer-copy-main-to-model-params').stop()
         # Successful update.
-        return True, grad_norm, num_zeros
+        return True, grad_norm, num_zeros_in_grad
     def state_dict(self):
@@ -381,11 +381,11 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
 class FP32Optimizer(MegatronOptimizer):
-    def __init__(self, optimizer, clip_grad, log_zeros):
+    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad):
         super(FP32Optimizer, self).__init__(optimizer)
         self.clip_grad = clip_grad
-        self.log_zeros = log_zeros
+        self.log_num_zeros_in_grad = log_num_zeros_in_grad
         self._scale = torch.cuda.FloatTensor([1.0])
@@ -411,13 +411,13 @@ class FP32Optimizer(MegatronOptimizer):
             grad_norm = self.clip_grad_norm(self.clip_grad)
         # count the zeros in the grads
-        num_zeros = self.count_zeros() if self.log_zeros else None
+        num_zeros_in_grad = self.count_zeros() if self.log_num_zeros_in_grad else None
         # Update parameters.
         self.optimizer.step()
         # No overflow for FP32 optimizer.
-        return True, grad_norm, num_zeros
+        return True, grad_norm, num_zeros_in_grad
     def reload_model_params(self):
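`count_zeros()` itself is not part of this diff. Below is a minimal sketch of what such a helper typically computes, assuming it simply tallies zero-valued gradient entries; the name `count_zeros_in_grads` and its signature are illustrative, and the actual Megatron-LM helper additionally reduces the count across model-parallel ranks:

```python
import torch


def count_zeros_in_grads(params):
    """Count zero-valued gradient entries across `params`.

    Illustrative sketch only; it does not perform the cross-rank
    reduction that a model-parallel setup would need.
    """
    num_zeros = 0
    for param in params:
        if param.grad is not None:
            grad = param.grad.detach()
            num_zeros += grad.numel() - int(torch.count_nonzero(grad))
    return num_zeros
```

With `--log-num-zeros-in-grad` set, both optimizer classes return this count as the third element of the tuple from `step()`; otherwise that slot is `None`.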
@@ -378,11 +378,7 @@ def train_step(forward_step_func, data_iterator,
     # Update parameters.
     timers('optimizer').start()
-<<<<<<< HEAD
-    update_successfull, grad_norm, num_zeros = optimizer.step()
-=======
-    update_successful, grad_norm = optimizer.step()
->>>>>>> main
+    update_successful, grad_norm, num_zeros_in_grad = optimizer.step()
     timers('optimizer').stop()
     # Update learning rate.
@@ -401,13 +397,13 @@ def train_step(forward_step_func, data_iterator,
         for key in losses_reduced[0]:
             losses_reduced_for_key = [x[key] for x in losses_reduced]
             loss_reduced[key] = sum(losses_reduced_for_key) / len(losses_reduced_for_key)
-        return loss_reduced, skipped_iter, grad_norm, num_zeros
-    return {}, skipped_iter, grad_norm, num_zeros
+        return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad
+    return {}, skipped_iter, grad_norm, num_zeros_in_grad
 def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                  loss_scale, report_memory_flag, skipped_iter,
-                 grad_norm, params_norm, num_zeros):
+                 grad_norm, params_norm, num_zeros_in_grad):
     """Log training information such as losses, timing, ...."""
     args = get_args()
     timers = get_timers()
@@ -496,9 +492,9 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
             writer.add_scalar('grad-norm', grad_norm, iteration)
             writer.add_scalar('grad-norm vs samples', grad_norm,
                               args.consumed_train_samples)
-        if num_zeros is not None:
-            writer.add_scalar('num-zeros', num_zeros, iteration)
-            writer.add_scalar('num-zeros vs samples', num_zeros,
+        if num_zeros_in_grad is not None:
+            writer.add_scalar('num-zeros', num_zeros_in_grad, iteration)
+            writer.add_scalar('num-zeros vs samples', num_zeros_in_grad,
                               args.consumed_train_samples)
         if params_norm is not None:
             writer.add_scalar('params-norm', params_norm, iteration)
@@ -534,8 +530,8 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         log_string += ' loss scale: {:.1f} |'.format(loss_scale)
         if grad_norm is not None:
             log_string += ' grad norm: {:.3f} |'.format(grad_norm)
-        if num_zeros is not None:
-            log_string += ' num zeros: {:.1f} |'.format(num_zeros)
+        if num_zeros_in_grad is not None:
+            log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad)
         if params_norm is not None:
             log_string += ' params norm: {:.3f} |'.format(params_norm)
         log_string += ' number of skipped iterations: {:3d} |'.format(
@@ -591,11 +587,12 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
     report_memory_flag = True
     while iteration < args.train_iters:
         update_num_microbatches(args.consumed_train_samples)
-        loss_dict, skipped_iter, grad_norm, num_zeros = train_step(forward_step_func,
-                                                                   train_data_iterator,
-                                                                   model,
-                                                                   optimizer,
-                                                                   lr_scheduler)
+        loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
+            train_step(forward_step_func,
+                       train_data_iterator,
+                       model,
+                       optimizer,
+                       lr_scheduler)
         iteration += 1
         args.consumed_train_samples += mpu.get_data_parallel_world_size() * \
                                        args.micro_batch_size * \
@@ -610,7 +607,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
                                           optimizer.param_groups[0]['lr'],
                                           iteration, loss_scale,
                                           report_memory_flag, skipped_iter,
-                                          grad_norm, params_norm, num_zeros)
+                                          grad_norm, params_norm, num_zeros_in_grad)
         # Autoresume
         if args.adlr_autoresume and \
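Taken together, the rename threads a single value from the CLI flag through `optimizer.step()` into TensorBoard and the console log line. A condensed, hypothetical sketch of that flow under the new name (the function `step_and_report` and the `writer` argument are glue code for illustration, not the actual Megatron-LM training loop):

```python
def step_and_report(optimizer, writer, iteration):
    # step() returns (update_successful, grad_norm, num_zeros_in_grad);
    # num_zeros_in_grad is None unless --log-num-zeros-in-grad was passed.
    update_successful, grad_norm, num_zeros_in_grad = optimizer.step()

    log_string = 'iteration {:d} |'.format(iteration)
    if grad_norm is not None:
        log_string += ' grad norm: {:.3f} |'.format(grad_norm)
    if num_zeros_in_grad is not None:
        # Mirrors the TensorBoard and stdout reporting in training_log() above.
        writer.add_scalar('num-zeros', num_zeros_in_grad, iteration)
        log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad)
    print(log_string)
    return update_successful
```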