Unverified Commit 25e651a2 authored by zhoutang776, committed by GitHub

Update run_translation_no_trainer.py (#18637)

* Update run_translation_no_trainer.py

Fixes an error in how the `no_decay` parameters are selected, and makes some small modifications for when the user continues training from a provided checkpoint.
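For context: the script's `no_decay` check is a plain substring match over `model.named_parameters()`, so `"LayerNorm.weight"` never matches models whose layer norms use snake_case parameter names (for example the `*_layer_norm.weight` parameters of mBART/M2M100-style translation models). A minimal sketch of the mismatch, using illustrative parameter names that are not taken from this diff:

```python
# Hypothetical parameter names in the style of an mBART/M2M100 encoder layer;
# they are illustrative only, not copied from any particular checkpoint.
named_parameters = [
    "model.encoder.layers.0.self_attn.q_proj.weight",
    "model.encoder.layers.0.self_attn.q_proj.bias",
    "model.encoder.layers.0.self_attn_layer_norm.weight",
    "model.encoder.layers.0.self_attn_layer_norm.bias",
]

def no_weight_decay(names, no_decay):
    # same substring test the script uses to build the optimizer parameter groups
    return [n for n in names if any(nd in n for nd in no_decay)]

old = ["bias", "LayerNorm.weight"]
new = ["bias", "LayerNorm.weight", "layer_norm.weight"]

print(no_weight_decay(named_parameters, old))
# only the two *.bias entries match -> the layer-norm weight still receives weight decay
print(no_weight_decay(named_parameters, new))
# the layer-norm weight is now excluded from weight decay as intended
```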

* fix `no_decay` and `resume_step` issues

1. change the `no_decay` list
2. if users continue training their model from a provided checkpoint, `resume_step` is not initialized properly when `args.gradient_accumulation_steps != 1`; see the sketch below
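A `step_{i}` checkpoint records the number of optimizer updates (`completed_steps`), while the skip loop iterates over dataloader batches; with gradient accumulation one update covers several batches, hence the multiplication. A rough sketch of the arithmetic with made-up numbers (not taken from the PR), assuming `len(train_dataloader)` is divisible by `args.gradient_accumulation_steps`:

```python
# Hypothetical numbers to illustrate the resume arithmetic; the variable names
# mirror the script, but the values are invented.
gradient_accumulation_steps = 4
batches_per_epoch = 1000                                               # len(train_dataloader)
updates_per_epoch = batches_per_epoch // gradient_accumulation_steps  # num_update_steps_per_epoch

checkpoint = "step_300"                                                # saved after 300 optimizer updates

# convert optimizer updates back into dataloader batches to know how far to skip
resume_step = int(checkpoint.replace("step_", "")) * gradient_accumulation_steps  # 1200 batches
starting_epoch = resume_step // batches_per_epoch                                 # 1 full epoch already done
resume_step -= starting_epoch * batches_per_epoch                                 # 200 batches into epoch 1

# progress bar / update counter are fast-forwarded past the completed epochs
completed_steps = starting_epoch * updates_per_epoch                              # 250 updates

print(starting_epoch, resume_step, completed_steps)                               # 1 200 250
```

While skipping the first 200 batches of epoch 1, the loop then advances `progress_bar` and `completed_steps` once every 4 batches (50 more updates), so `completed_steps` reaches 300 again, matching the checkpoint name. Without the multiplication, only 300 batches (75 updates) would be skipped.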
parent a27195b1
@@ -464,7 +464,7 @@ def main():
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -558,10 +558,15 @@ def main():
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -570,6 +575,8 @@ def main():
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
@@ -602,10 +602,15 @@ def main():
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -614,6 +619,8 @@ def main():
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
@@ -510,7 +510,7 @@ def main():
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
@@ -607,10 +607,15 @@ def main():
             starting_epoch = int(training_difference.replace("epoch_", "")) + 1
             resume_step = None
         else:
-            resume_step = int(training_difference.replace("step_", ""))
+            # need to multiply `gradient_accumulation_steps` to reflect real steps
+            resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
             starting_epoch = resume_step // len(train_dataloader)
             resume_step -= starting_epoch * len(train_dataloader)
 
+    # update the progress_bar if load from checkpoint
+    progress_bar.update(starting_epoch * num_update_steps_per_epoch)
+    completed_steps = starting_epoch * num_update_steps_per_epoch
+
     for epoch in range(starting_epoch, args.num_train_epochs):
         model.train()
         if args.with_tracking:
@@ -619,6 +624,8 @@ def main():
             # We need to skip steps until we reach the resumed step
             if args.resume_from_checkpoint and epoch == starting_epoch:
                 if resume_step is not None and step < resume_step:
-                    completed_steps += 1
+                    if step % args.gradient_accumulation_steps == 0:
+                        progress_bar.update(1)
+                        completed_steps += 1
                     continue
             outputs = model(**batch)