Unverified Commit 08de989a authored by Sylvain Gugger, committed by GitHub

Trainer with grad accum (#6930)

* Add warning for gradient accumulation

* Formatting
parent d4aa7284
@@ -60,6 +60,12 @@ class TrainingArguments:
            The batch size per GPU/TPU core/CPU for evaluation.
        gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1):
            Number of update steps to accumulate the gradients for, before performing a backward/update pass.
            .. warning::

                When using gradient accumulation, one step is counted as one step with a backward pass.
                Therefore, logging, evaluation, and saving will be conducted every
                ``gradient_accumulation_steps * xxx_step`` training examples.
        learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
            The initial learning rate for Adam.
        weight_decay (:obj:`float`, `optional`, defaults to 0):
......
@@ -42,6 +42,12 @@ class TFTrainingArguments(TrainingArguments):
            The batch size per GPU/TPU core/CPU for evaluation.
        gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1):
            Number of update steps to accumulate the gradients for, before performing a backward/update pass.
            .. warning::

                When using gradient accumulation, one step is counted as one step with a backward pass.
                Therefore, logging, evaluation, and saving will be conducted every
                ``gradient_accumulation_steps * xxx_step`` training examples.
        learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
            The initial learning rate for Adam.
        weight_decay (:obj:`float`, `optional`, defaults to 0):
......
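The warning added in both docstrings can be illustrated with a small sketch. This is not the Trainer's actual code; `batches_between_logs` is a hypothetical helper showing only the arithmetic: because the optimizer steps once every `gradient_accumulation_steps` batches, anything scheduled in optimizer steps (logging, evaluation, saving) fires once per `gradient_accumulation_steps * xxx_step` batches.

```python
def batches_between_logs(gradient_accumulation_steps: int, logging_steps: int) -> int:
    """Hypothetical helper: number of training batches consumed between two
    logging events when the Trainer counts steps in optimizer updates.

    Each optimizer update consumes gradient_accumulation_steps batches,
    so logging_steps updates consume the product of the two.
    """
    return gradient_accumulation_steps * logging_steps


# With gradient_accumulation_steps=4 and logging every 500 optimizer steps,
# a log line appears only once every 2000 batches.
print(batches_between_logs(4, 500))  # 2000
```

The same reasoning applies to `eval_steps` and `save_steps`, which is why the commit adds the identical warning to both `TrainingArguments` and `TFTrainingArguments`.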