Unverified Commit 25e651a2 authored by zhoutang776, committed by GitHub

Update run_translation_no_trainer.py (#18637)

* Update run_translation_no_trainer.py

Fixes an error in how the `no_decay` parameters are selected, and makes some small modifications for when the user continues training from a provided checkpoint.
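For context: the script's `no_decay` check is a plain substring match over `model.named_parameters()`, so `"LayerNorm.weight"` never matches models whose layer norms use snake_case parameter names (for example the `*_layer_norm.weight` parameters of mBART/M2M100-style translation models). A minimal sketch of the mismatch, using illustrative parameter names that are not taken from this diff:

```python
# Hypothetical parameter names in the style of an mBART/M2M100 encoder layer;
# they are illustrative only, not copied from any particular checkpoint.
named_parameters = [
    "model.encoder.layers.0.self_attn.q_proj.weight",
    "model.encoder.layers.0.self_attn.q_proj.bias",
    "model.encoder.layers.0.self_attn_layer_norm.weight",
    "model.encoder.layers.0.self_attn_layer_norm.bias",
]

def no_weight_decay(names, no_decay):
    # same substring test the script uses to build the optimizer parameter groups
    return [n for n in names if any(nd in n for nd in no_decay)]

old = ["bias", "LayerNorm.weight"]
new = ["bias", "LayerNorm.weight", "layer_norm.weight"]

print(no_weight_decay(named_parameters, old))
# only the two *.bias entries match -> the layer-norm weight still receives weight decay
print(no_weight_decay(named_parameters, new))
# the layer-norm weight is now excluded from weight decay as intended
```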

* fix `no_decay` and `resume_step` issues

1. change the `no_decay` list
2. if users continue training their model from a provided checkpoint, `resume_step` is not initialized properly when `args.gradient_accumulation_steps != 1`; see the sketch below
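A `step_{i}` checkpoint records the number of optimizer updates (`completed_steps`), while the skip loop iterates over dataloader batches; with gradient accumulation one update covers several batches, hence the multiplication. A rough sketch of the arithmetic with made-up numbers (not taken from the PR), assuming `len(train_dataloader)` is divisible by `args.gradient_accumulation_steps`:

```python
# Hypothetical numbers to illustrate the resume arithmetic; the variable names
# mirror the script, but the values are invented.
gradient_accumulation_steps = 4
batches_per_epoch = 1000                                               # len(train_dataloader)
updates_per_epoch = batches_per_epoch // gradient_accumulation_steps  # num_update_steps_per_epoch

checkpoint = "step_300"                                                # saved after 300 optimizer updates

# convert optimizer updates back into dataloader batches to know how far to skip
resume_step = int(checkpoint.replace("step_", "")) * gradient_accumulation_steps  # 1200 batches
starting_epoch = resume_step // batches_per_epoch                                 # 1 full epoch already done
resume_step -= starting_epoch * batches_per_epoch                                 # 200 batches into epoch 1

# progress bar / update counter are fast-forwarded past the completed epochs
completed_steps = starting_epoch * updates_per_epoch                              # 250 updates

print(starting_epoch, resume_step, completed_steps)                               # 1 200 250
```

While skipping the first 200 batches of epoch 1, the loop then advances `progress_bar` and `completed_steps` once every 4 batches (50 more updates), so `completed_steps` reaches 300 again, matching the checkpoint name. Without the multiplication, only 300 batches (75 updates) would be skipped.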
parent a27195b1
@@ -464,7 +464,7 @@ def main():
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -558,10 +558,15 @@ def main():
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -570,6 +575,8 @@ def main():
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
@@ -602,10 +602,15 @@ def main():
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -614,6 +619,8 @@ def main():
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
@@ -510,7 +510,7 @@ def main():
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -607,10 +607,15 @@ def main():
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -619,6 +624,8 @@ def main():
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
             outputs = model(**batch)