"vscode:/vscode.git/clone" did not exist on "fa84540e98a6af309c3007f64def5011db775a70"
Unverified Commit c036c814 authored by Sourab Mangrulkar, committed by GitHub

fix the grad_acc issue at epoch boundaries (#24415)



* fix the grad_acc issue at epoch boundaries
Co-authored-by: Zach Mueller <7831895+muellerzr@users.noreply.github.com>

* add contributors.

Co-authored-by: sumpster

* address comments

---------
Co-authored-by: Zach Mueller <7831895+muellerzr@users.noreply.github.com>
parent 468aed39
@@ -196,7 +196,7 @@ if is_peft_available():
 if is_accelerate_available():
     from accelerate import Accelerator, skip_first_batches
     from accelerate import __version__ as accelerate_version
-    from accelerate.utils import DistributedDataParallelKwargs
+    from accelerate.utils import DistributedDataParallelKwargs, GradientAccumulationPlugin

     if version.parse(accelerate_version) > version.parse("0.20.3"):
         from accelerate.utils import (
@@ -1806,14 +1806,23 @@ class Trainer:
                 self.current_flos += float(self.floating_point_ops(inputs))

                 # should this be under the accumulate context manager?
                 # the `or` condition of `steps_in_epoch <= args.gradient_accumulation_steps` is not covered
                 # in accelerate
-                if total_batched_samples % args.gradient_accumulation_steps == 0 or (
+                is_last_step_and_steps_less_than_grad_acc = (
+                    steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch
+                )
+                if (
+                    total_batched_samples % args.gradient_accumulation_steps == 0
+                    or
                     # last step in epoch but step is always smaller than gradient_accumulation_steps
-                    steps_in_epoch <= args.gradient_accumulation_steps
-                    and (step + 1) == steps_in_epoch
+                    is_last_step_and_steps_less_than_grad_acc
                 ):
+                    # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered
+                    # in accelerate. So, explicitly enable sync gradients to True in that case.
+                    if is_last_step_and_steps_less_than_grad_acc or (
+                        version.parse(accelerate_version) <= version.parse("0.20.3")
+                    ):
+                        self.accelerator.gradient_state._set_sync_gradients(True)
                     # Gradient clipping
                     if args.max_grad_norm is not None and args.max_grad_norm > 0:
                         # deepspeed does its own clipping
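The inner `if` added above is the substantive part of this hunk: when an epoch contains fewer batches than `gradient_accumulation_steps`, the modulo check never fires inside that epoch, and (per the diff's own comment) this `or` branch is not covered by accelerate, so the Trainer now forces `sync_gradients` to `True` on the last step of such an epoch. The following is a standalone sketch of that boundary arithmetic with made-up numbers, not code from this diff:

# Illustrative only: shows the epoch-boundary case this commit targets.
gradient_accumulation_steps = 8
steps_in_epoch = 3  # a short epoch: fewer batches than the accumulation window

total_batched_samples = 0
for step in range(steps_in_epoch):
    total_batched_samples += 1
    hits_modulo_boundary = total_batched_samples % gradient_accumulation_steps == 0
    is_last_step_and_steps_less_than_grad_acc = (
        steps_in_epoch <= gradient_accumulation_steps and (step + 1) == steps_in_epoch
    )
    print(step, hits_modulo_boundary, is_last_step_and_steps_less_than_grad_acc)

# The modulo boundary is never reached within the epoch (3 % 8, ...), but the
# last step is flagged, which is where the patched Trainer calls
# self.accelerator.gradient_state._set_sync_gradients(True).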
@@ -3815,10 +3824,14 @@ class Trainer:
             self.repo.git_push()

     def create_accelerator_and_postprocess(self):
+        grad_acc_kwargs = {"num_steps": self.args.gradient_accumulation_steps}
+        if version.parse(accelerate_version) > version.parse("0.20.3"):
+            grad_acc_kwargs["sync_with_dataloader"] = False
+        gradient_accumulation_plugin = GradientAccumulationPlugin(**grad_acc_kwargs)
+
         # create accelerator object
         self.accelerator = Accelerator(
-            deepspeed_plugin=self.args.deepspeed_plugin,
-            gradient_accumulation_steps=self.args.gradient_accumulation_steps,
+            deepspeed_plugin=self.args.deepspeed_plugin, gradient_accumulation_plugin=gradient_accumulation_plugin
         )

         # deepspeed and accelerate flags covering both trainer args and accelerate launcher
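This last hunk complements the training-loop change: instead of passing `gradient_accumulation_steps` directly to `Accelerator`, the Trainer now builds a `GradientAccumulationPlugin` and, on accelerate > 0.20.3, sets `sync_with_dataloader=False`, so accelerate no longer triggers a gradient sync on its own at the end of the dataloader; the loop above handles that epoch-boundary sync explicitly. A minimal sketch of the same plugin setup outside of `Trainer`, assuming accelerate > 0.20.3 is installed and with an illustrative step count standing in for `self.args.gradient_accumulation_steps`:

from accelerate import Accelerator
from accelerate.utils import GradientAccumulationPlugin

# Mirror of create_accelerator_and_postprocess with a hard-coded step count.
grad_acc_kwargs = {"num_steps": 4, "sync_with_dataloader": False}
gradient_accumulation_plugin = GradientAccumulationPlugin(**grad_acc_kwargs)

# The plugin's num_steps drives accelerator.accumulate(); with
# sync_with_dataloader=False the caller decides when to sync at epoch ends.
accelerator = Accelerator(gradient_accumulation_plugin=gradient_accumulation_plugin)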