Unverified Commit 3437d121 authored by Stas Bekman, committed by GitHub

[Trainer/Deepspeed] handle get_last_lr() before first step() (#10362)

* handle get_last_lr() before first step()

* abstract away the lr getting logic

* cleanup

* add test

* move to utils
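
For context, this is the guard pattern the change introduces, shown as a minimal standalone sketch rather than the actual Trainer code (`safe_last_lr` is a hypothetical name used only for illustration; the real helper is `get_learning_rate` in `trainer_pt_utils.py`, shown in the diff below):

```python
# Illustrative sketch only: under DeepSpeed fp16 with dynamic loss scaling, the
# optimizer/scheduler may skip the first steps while the loss scale is too large,
# so the scheduler can be queried for an lr before it has ever stepped.

def safe_last_lr(lr_scheduler):
    """Return the last learning rate, or 0 if the scheduler has not stepped yet."""
    try:
        return lr_scheduler.get_last_lr()[0]
    except AssertionError as e:
        # DeepSpeed schedulers raise an AssertionError mentioning "need to call step"
        # when queried before the first step; treat that as "no lr yet".
        if "need to call step" in str(e):
            return 0
        raise
```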
parent 4a1ab7cb
@@ -78,6 +78,31 @@ class TrainerIntegrationDeepSpeed(TestCasePlus):
                trainer.train()
        assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"

    def test_early_get_last_lr(self):
        # with deepspeed's fp16 and dynamic loss scale enabled, the optimizer/scheduler steps may
        # not run for the first few dozen steps while the loss scale is too large, and thus
        # `get_last_lr` will fail if called during that warm-up stage.
        #
        # setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()`, which calls
        # `self.lr_scheduler.get_last_lr()` and originally would fail on the very first step.
        with mockenv_context(**self.dist_env_1_gpu):
            a = b = 0.0
            trainer = get_regression_trainer(
                a=a,
                b=b,
                local_rank=0,
                train_len=8,
                deepspeed=self.ds_config_file,
                per_device_train_batch_size=8,
                logging_steps=1,
            )
            trainer.train()
            no_grad_accum_a = trainer.model.a.item()

            # it's enough that train didn't fail for this test, but we must check that the
            # optimizer/scheduler didn't run (since if it did, this test isn't testing the right thing)
            self.assertEqual(no_grad_accum_a, a)

    def test_gradient_accumulation(self):
        # this test measures that we get identical weights and similar loss with:
...
@@ -82,6 +82,7 @@ from .trainer_pt_utils import (
    SequentialDistributedSampler,
    distributed_broadcast_scalars,
    distributed_concat,
    get_learning_rate,
    nested_concat,
    nested_detach,
    nested_numpify,
@@ -1129,12 +1130,8 @@ class Trainer:
            tr_loss -= tr_loss

            logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4)
-            # backward compatibility for pytorch schedulers
-            logs["learning_rate"] = (
-                self.lr_scheduler.get_last_lr()[0]
-                if version.parse(torch.__version__) >= version.parse("1.4")
-                else self.lr_scheduler.get_lr()[0]
-            )
+            logs["learning_rate"] = get_learning_rate(self)
            self._total_loss_scalar += tr_loss_scalar
            self._globalstep_last_logged = self.state.global_step
......
@@ -24,6 +24,7 @@ from typing import Iterator, List, Optional, Union

import numpy as np
import torch
from packaging import version
from torch.utils.data.dataset import Dataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler, Sampler
@@ -262,6 +263,29 @@ def _get_first_shape(arrays):
    return arrays.shape

def get_learning_rate(trainer):
    if trainer.deepspeed:
        # with deepspeed's fp16 and dynamic loss scale enabled, the optimizer/scheduler steps may
        # not run for the first few dozen steps while the loss scale is too large, and thus
        # `get_last_lr` will fail if called during that warm-up stage, so work around it:
        try:
            last_lr = trainer.lr_scheduler.get_last_lr()[0]
        except AssertionError as e:
            if "need to call step" in str(e):
                logger.warn("tried to get lr value before scheduler/optimizer started stepping, returning lr=0")
                last_lr = 0
            else:
                raise
    else:
        last_lr = (
            # backward compatibility for pytorch schedulers
            trainer.lr_scheduler.get_last_lr()[0]
            if version.parse(torch.__version__) >= version.parse("1.4")
            else trainer.lr_scheduler.get_lr()[0]
        )
    return last_lr

class DistributedTensorGatherer:
    """
    A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU by chunks.
......