Unverified Commit 7169d1ea authored by Sylvain Gugger, committed by GitHub

Store FLOS as floats to avoid overflow. (#10213)

parent df1b0fb5
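Why a float? A reading of the change (an assumption; the commit title only states the goal): Python's own integers never overflow, but as soon as the FLO count is packed into an int64 torch tensor (e.g. when logging or syncing across processes), anything past 2**63 - 1 breaks, while a float merely loses precision. A minimal sketch of that presumed failure mode:

```python
import torch

# Sketch of the presumed failure mode (assumption: the overflow happens when
# the running FLO count is packed into an int64 tensor, not in pure Python).
flo_count = 2**63  # plausible for a very long run of a very large model

# Python ints are arbitrary precision, so accumulation itself never overflows:
assert flo_count + flo_count == 2**64

# But an int64 tensor cannot represent it:
try:
    torch.tensor(flo_count, dtype=torch.int64)
except RuntimeError as err:
    print(err)  # overflow when converting the Python int to int64

# A float survives as an approximation, which is good enough for reporting:
print(torch.tensor(float(flo_count)))  # tensor(9.2234e+18)
```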
@@ -959,7 +959,7 @@ class Trainer:
                     tr_loss += self.training_step(model, inputs)
                 else:
                     tr_loss += self.training_step(model, inputs)
-                self._total_flos += self.floating_point_ops(inputs)
+                self._total_flos += float(self.floating_point_ops(inputs))
 
                 if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                     # last step in epoch but step is always smaller than gradient_accumulation_steps
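Note the cast is applied per step rather than once at the end: self._total_flos stays a Python float from the first update on, so any later consumer sees a float. A toy version of the accumulation pattern (names are illustrative, not Trainer internals):

```python
# Toy version of the accumulation above; step_flos stands in for
# self.floating_point_ops(inputs) and is purely illustrative.
total_flos = 0.0
step_flos = 3 * 10**18  # per-step FLO count for a hypothetical large model

for _ in range(10):
    total_flos += float(step_flos)

# The float total exceeds 2**63 - 1 but remains usable:
print(total_flos)                     # 3e+19
print(total_flos > 2**63 - 1)         # True
print(isinstance(total_flos, float))  # True
```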
@@ -52,8 +52,9 @@ class TrainerState:
             During training, represents the number of update steps completed.
         max_steps (:obj:`int`, `optional`, defaults to 0):
             The number of update steps to do during the current training.
-        total_flos (:obj:`int`, `optional`, defaults to 0):
-            The total number of floating operations done by the model since the beginning of training.
+        total_flos (:obj:`float`, `optional`, defaults to 0):
+            The total number of floating operations done by the model since the beginning of training (stored as floats
+            to avoid overflow).
         log_history (:obj:`List[Dict[str, float]]`, `optional`):
             The list of logs done since the beginning of training.
         best_metric (:obj:`float`, `optional`):
@@ -76,7 +77,7 @@ class TrainerState:
     global_step: int = 0
     max_steps: int = 0
     num_train_epochs: int = 0
-    total_flos: int = 0
+    total_flos: float = 0
     log_history: List[Dict[str, float]] = None
     best_metric: Optional[float] = None
     best_model_checkpoint: Optional[str] = None
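TrainerState is written out as JSON in checkpoints (via its save_to_json method), and a float field round-trips through json just as cleanly as an int did, so the type change costs nothing in the serialized format. A self-contained sketch with a stand-in dataclass (StateSketch is hypothetical, not the real TrainerState):

```python
import dataclasses
import json

# Stand-in for TrainerState (illustrative only): a float total_flos
# serializes through JSON just as cleanly as an int.
@dataclasses.dataclass
class StateSketch:
    total_flos: float = 0

state = StateSketch(total_flos=float(2**64))
blob = json.dumps(dataclasses.asdict(state), indent=2)
restored = StateSketch(**json.loads(blob))
print(restored.total_flos)  # 1.8446744073709552e+19
```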
@@ -881,6 +881,9 @@ class TrainerIntegrationTest(unittest.TestCase):
 
         # with enforced DataParallel
         assert_flos_extraction(trainer, torch.nn.DataParallel(trainer.model))
 
+        trainer.train()
+        self.assertTrue(isinstance(trainer.state.total_flos, float))
+
     @require_torch
     @require_optuna