Unverified Commit c61f116b authored by Sylvain Gugger, committed by GitHub

Tie weights after preparing the model in run_clm (#18855)

parent 1c381f36
@@ -477,10 +477,6 @@ def main():
     ]
     optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
 
-    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
-    if accelerator.distributed_type == DistributedType.TPU:
-        model.tie_weights()
-
     # Scheduler and math around the number of training steps.
     overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
@@ -500,6 +496,10 @@ def main():
         model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
     )
 
+    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
+    if accelerator.distributed_type == DistributedType.TPU:
+        model.tie_weights()
+
     # We need to recalculate our total training steps as the size of the training dataloader may have changed.
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
     if overrode_max_train_steps:
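
For context, a minimal sketch of the call order this commit establishes, as a hypothetical standalone script (the real run_clm_no_trainer.py builds the optimizer from grouped parameters and also prepares the dataloaders and scheduler):

import torch
from accelerate import Accelerator, DistributedType
from transformers import AutoModelForCausalLM

accelerator = Accelerator()
model = AutoModelForCausalLM.from_pretrained("gpt2")  # any model with tied input/output embeddings
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# On TPU, prepare() moves the model's parameters to the XLA device,
# which disconnects the tie between the input and output embeddings.
model, optimizer = accelerator.prepare(model, optimizer)

# So the ties must be restored *after* prepare(), not before it.
if accelerator.distributed_type == DistributedType.TPU:
    model.tie_weights()

This is why the old placement was ineffective: tying the weights before prepare() gets undone when the parameters are subsequently moved to the XLA device, so the restore has to happen after the model is prepared.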