Commit 0750a757 authored by Michael Carilli

delay_unscale is never necessary and generally discouraged, but should still work for some cases

parent 3f87614f
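For reference, the plain pattern that makes delay_unscale unnecessary is ordinary gradient accumulation with the default delay_unscale=False: Amp unscales gradients on every scale_loss exit, and optimizer.step() is simply called every few iterations. The sketch below is illustrative only and not part of this commit; the model, data, loop sizes, and opt_level are invented, and it assumes apex is installed and a CUDA device is available.

import torch
from apex import amp

model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

accumulation_steps = 4
optimizer.zero_grad()
for step in range(16):
    inputs = torch.randn(8, 10, device="cuda")
    # Scale the loss down so the accumulated gradient matches a full batch.
    loss = model(inputs).sum() / accumulation_steps
    # Default delay_unscale=False: gradients are unscaled on exit and accumulate.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()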
@@ -261,6 +261,7 @@ def _process_optimizer(optimizer, properties):
     optimizer._amp_stash.lazy_init_called = False
     optimizer._amp_stash.already_patched = False
+    optimizer._amp_stash.params_have_scaled_gradients = False
 
     for name in ("_lazy_init_maybe_master_weights",
                  "_master_params_to_model_params",
...
@@ -57,8 +57,9 @@ def scale_loss(loss,
             will use the default global loss scaler for this backward pass.
         model(torch.nn.Module, optional, default=None): Currently unused, reserved to enable future
             optimizations.
-        delay_unscale(bool, optional, default=False): ``delay_unscale`` is a ninja option that only
-            serves as a minor performance optimization, so only use it if you know what you're doing.
+        delay_unscale(bool, optional, default=False): ``delay_unscale`` is never necessary.
+            It's a minor ninja performance optimization and can result in weird gotchas (especially
+            with multiple models/optimizers/losses), so only use it if you know what you're doing.
             If ``True``, Amp will not unscale the gradients or perform model->master
             gradient copies on context manager exit.
             "Gradient accumulation across iterations" under `Advanced Amp Usage`_
@@ -98,18 +99,24 @@ def scale_loss(loss,
             _amp_state.handle._clear_cache()
         return
 
-    if isinstance(optimizers, list):
-        for optimizer in optimizers:
-            optimizer._prepare_amp_backward()
+    if not delay_unscale:
+        if isinstance(optimizers, list):
+            for optimizer in optimizers:
+                if not optimizer._amp_stash.params_have_scaled_gradients:
+                    optimizer._prepare_amp_backward()
 
     yield (loss.float())*loss_scale
 
-    if not delay_unscale:
+    if delay_unscale:
+        for optimizer in optimizers:
+            optimizer._amp_stash.params_have_scaled_gradients = True
+    else:
         # FusedAdam and FusedSGD will take care of unscaling as part of their step() methods.
         if not isinstance(optimizers, FP16_Optimizer_for_fused):
             loss_scaler.clear_overflow_state()
             for optimizer in optimizers:
                 optimizer._post_amp_backward(loss_scaler)
+                optimizer._amp_stash.params_have_scaled_gradients = False
             # For future fused optimizers that enable sync-free dynamic loss scaling,
             # should_skip will always be False.
             should_skip = loss_scaler.update_scale()
...
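The new per-optimizer flag params_have_scaled_gradients records that a backward pass exited with delay_unscale=True and left scaled values in the gradients, so the next scale_loss entry skips _prepare_amp_backward; a normal (delay_unscale=False) exit then unscales via _post_amp_backward and clears the flag. The trace below is only illustrative: reading _amp_stash is an internal detail, and the model, data, and opt_level are invented.

import torch
from apex import amp

model = torch.nn.Linear(4, 4).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

loss = model(torch.randn(2, 4, device="cuda")).sum()
with amp.scale_loss(loss, optimizer, delay_unscale=True) as scaled_loss:
    scaled_loss.backward()
# Gradients were left scaled, so the stash flag should now be True.
print(optimizer._amp_stash.params_have_scaled_gradients)

loss = model(torch.randn(2, 4, device="cuda")).sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:  # default: unscale on exit
    scaled_loss.backward()
# A normal exit unscales the accumulated gradients and clears the flag.
print(optimizer._amp_stash.params_have_scaled_gradients)
optimizer.step()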