Unverified Commit 3f87614f authored by mcarilli, committed by GitHub

WIP: Handle arbitrary combinations of optimizers/models/losses (#232)

* Refactor to allow more flexible treatment of multiple optimizers/models/losses

* Adding _process_optimizers.py

* Created L0 tests (now passing).

* fix: minor print typo (#234)

* Make L1 results easier to read

* L0 multiple model/optimizer/loss test fleshed out

* Adding test that master params remain synced across distributed processes

* Docstring updates

* Docstring updates
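A minimal sketch of the usage pattern this change targets (model, optimizer, and loss names are illustrative, not part of the diff):

    # Two models, two optimizers, three losses; any combination is intended to work.
    [model0, model1], [optim0, optim1] = amp.initialize(
        [model0, model1], [optim0, optim1], opt_level="O1", num_losses=3)

    with amp.scale_loss(loss0, optim0, loss_id=0) as scaled_loss:
        scaled_loss.backward()

    with amp.scale_loss(loss1, optim1, loss_id=1) as scaled_loss:
        scaled_loss.backward()

    # A loss whose gradients touch params owned by both optimizers must receive both.
    with amp.scale_loss(loss2, [optim0, optim1], loss_id=2) as scaled_loss:
        scaled_loss.backward()

    optim0.step()
    optim1.step()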
parent 214fda42
......@@ -2,4 +2,4 @@ from .amp import init, half_function, float_function, promote_function,\
register_half_function, register_float_function, register_promote_function
from .handle import scale_loss, disable_casts
from .frontend import initialize
from ._amp_state import master_params
from ._amp_state import master_params, _amp_state
......@@ -17,6 +17,7 @@ else:
class AmpState(object):
def __init__(self):
self.hard_override=False
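# When True, _initialize skips the check that incoming model params are FP32
# (set by some tests that feed half models through O1).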
self.allow_incoming_model_not_fp32 = False
self.verbosity=1
......
......@@ -6,6 +6,7 @@ import warnings
from ._amp_state import _amp_state, warn_or_err, container_abcs
from .handle import disable_casts
from .scaler import LossScaler
from ._process_optimizer import _process_optimizer
from apex.fp16_utils import convert_network
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
......@@ -122,7 +123,7 @@ def wrap_fused_adam(optimizer, properties):
return FP16_Optimizer_for_fused(optimizer, static_loss_scale=properties.loss_scale)
def _initialize(models, optimizers, properties):
def _initialize(models, optimizers, properties, num_losses=1):
from apex.parallel import DistributedDataParallel as apex_DDP
from .amp import init as amp_init
......@@ -146,7 +147,8 @@ def _initialize(models, optimizers, properties):
check_models(models)
check_params_fp32(models)
if not _amp_state.allow_incoming_model_not_fp32:
check_params_fp32(models)
check_optimizers(optimizers)
......@@ -181,21 +183,16 @@ def _initialize(models, optimizers, properties):
for optimizer in optimizers:
optimizer.load_state_dict(optimizer.state_dict())
if properties.master_weights:
for i, optimizer in enumerate(optimizers):
if isinstance(optimizer, FusedAdam):
optimizers[i] = wrap_fused_adam(optimizer, properties)
if properties.loss_scale == "dynamic":
optimizers[i] = FP16_Optimizer_general(optimizer,
dynamic_loss_scale=True,
verbose=False)
else:
optimizers[i] = FP16_Optimizer_general(optimizer,
static_loss_scale=properties.loss_scale,
verbose=False)
else:
for optimizer in optimizers:
optimizer.loss_scaler = LossScaler(properties.loss_scale)
for i, optimizer in enumerate(optimizers):
# Still need to special case this for the first pass
if isinstance(optimizer, FusedAdam):
optimizers[i] = wrap_fused_adam(optimizer, properties)
else:
optimizers[i] = _process_optimizer(optimizer, properties)
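# One LossScaler per anticipated loss/backward pass; amp.scale_loss(..., loss_id=i)
# selects the i-th scaler.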
_amp_state.loss_scalers = []
for _ in range(num_losses):
_amp_state.loss_scalers.append(LossScaler(properties.loss_scale))
if properties.patch_torch_functions:
# handle is unused here. It's accessible later through a global value anyway.
......
import types
from ..fp16_utils import master_params_to_model_params
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import maybe_print
import torch
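# Simple namespace attached to each processed optimizer as optimizer._amp_stash.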
class AmpOptimizerState(object):
def __init__(self):
pass
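# Deferred initialization for the master-weights path: replaces each FP16 param in
# param_groups with an FP32 master copy, records the fp16/fp32 groupings on the stash,
# and reuses state_dict()/load_state_dict() to recast any preexisting per-param state.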
def lazy_init_with_master_weights(self):
stash = self._amp_stash
stash.fp16_groups = []
stash.fp32_from_fp16_groups = []
stash.fp32_from_fp32_groups = []
for i, param_group in enumerate(self.param_groups):
# maybe_print("FP16_Optimizer processing param group {}:".format(i))
fp16_params_this_group = []
fp32_params_this_group = []
fp32_from_fp16_params_this_group = []
for i, param in enumerate(param_group['params']):
if param.requires_grad:
if param.type() == 'torch.cuda.HalfTensor':
# maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
# .format(param.size()))
fp16_params_this_group.append(param)
master_param = param.detach().clone().float()
master_param.requires_grad = True
param_group['params'][i] = master_param
fp32_from_fp16_params_this_group.append(master_param)
# Reset existing state dict key to the new master param.
# We still need to recast per-param state tensors, if any, to FP32.
if param in self.state:
self.state[master_param] = self.state.pop(param)
elif param.type() == 'torch.cuda.FloatTensor':
# maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
# .format(param.size()))
fp32_params_this_group.append(param)
param_group['params'][i] = param
else:
raise TypeError("Optimizer's parameters must be either "
"torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
"Received {}".format(param.type()))
stash.fp16_groups.append(fp16_params_this_group)
stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
stash.fp32_from_fp32_groups.append(fp32_params_this_group)
stash.all_fp16_params = []
for group in stash.fp16_groups:
stash.all_fp16_params += group
stash.all_fp32_from_fp16_params = []
for group in stash.fp32_from_fp16_groups:
stash.all_fp32_from_fp16_params += group
stash.all_fp32_from_fp32_params = []
for group in stash.fp32_from_fp32_groups:
stash.all_fp32_from_fp32_params += group
# stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params]
for param in stash.all_fp32_from_fp16_params:
param.grad = None
for param in stash.all_fp32_from_fp32_params:
param.grad = None
# Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
self.load_state_dict(self.state_dict())
def prepare_backward_with_master_weights(self):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
for i, param in enumerate(stash.all_fp16_params):
# Set up to leverage grad copy elision:
param.grad = None
# for i, param in enumerate(stash.all_fp32_from_fp16_params):
# stash.all_fp32_from_fp16_grad_stash[i] = param.grad
for i, param in enumerate(stash.all_fp32_from_fp32_params):
stash.all_fp32_from_fp32_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
def post_backward_with_master_weights(self, scaler):
stash = self._amp_stash
# This is a lot of python overhead...
fp16_grads_needing_unscale = []
new_fp32_grads = []
fp16_grads_needing_unscale_with_stash = []
preexisting_fp32_grads = []
for fp16_param, fp32_param in zip(stash.all_fp16_params,
stash.all_fp32_from_fp16_params):
if fp16_param.grad is None and fp32_param.grad is not None:
continue
elif fp16_param.grad is not None and fp32_param.grad is None:
fp32_param.grad = torch.empty_like(fp32_param)
fp16_grads_needing_unscale.append(fp16_param.grad)
new_fp32_grads.append(fp32_param.grad)
elif fp16_param.grad is not None and fp32_param.grad is not None:
fp16_grads_needing_unscale_with_stash.append(fp16_param.grad)
preexisting_fp32_grads.append(fp32_param.grad)
else: # fp16_param.grad is None and fp32_param.grad is None:
continue
if len(fp16_grads_needing_unscale) > 0:
scaler.unscale(
fp16_grads_needing_unscale,
new_fp32_grads,
scaler.loss_scale(),
models_are_masters=False)
if len(fp16_grads_needing_unscale_with_stash) > 0:
scaler.unscale_with_stashed(
fp16_grads_needing_unscale_with_stash,
preexisting_fp32_grads,
preexisting_fp32_grads)
# fp32 params can be treated as they would be in the "no_master_weights" case.
grads_needing_unscale = []
grads_needing_unscale_with_stash = []
stashed = []
for param, stashed_grad in zip(stash.all_fp32_from_fp32_params,
stash.all_fp32_from_fp32_grad_stash):
if param.grad is None and stashed_grad is not None:
param.grad = stashed_grad
elif param.grad is not None and stashed_grad is None:
grads_needing_unscale.append(param.grad)
elif param.grad is not None and stashed_grad is not None:
grads_needing_unscale_with_stash.append(param.grad)
stashed.append(stashed_grad)
else: # param.grad is None and stashed_grad is None:
continue
if len(grads_needing_unscale) > 0:
scaler.unscale(
grads_needing_unscale,
grads_needing_unscale,
scaler.loss_scale(),
models_are_masters=True)
if len(grads_needing_unscale_with_stash) > 0:
scaler.unscale_with_stashed(
grads_needing_unscale_with_stash,
stashed,
grads_needing_unscale_with_stash)
# Clear the stash.
for i in range(len(stash.all_fp32_from_fp32_grad_stash)):
stash.all_fp32_from_fp32_grad_stash[i] = None
def lazy_init_no_master_weights(self):
stash = self._amp_stash
stash.all_fp16_params = []
stash.all_fp32_params = []
for i, param_group in enumerate(self.param_groups):
for i, param in enumerate(param_group['params']):
if param.type() == 'torch.cuda.HalfTensor':
stash.all_fp16_params.append(param)
elif param.type() == 'torch.cuda.FloatTensor':
stash.all_fp32_params.append(param)
else:
raise TypeError("Optimizer's parameters must be either "
"torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
"Received {}".format(param.type()))
stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
stash.all_fp32_grad_stash = [None for _ in stash.all_fp32_params]
def prepare_backward_no_master_weights(self):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
for i, param in enumerate(stash.all_fp16_params):
stash.all_fp16_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
for i, param in enumerate(stash.all_fp32_params):
stash.all_fp32_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
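# No-master-weights path: after backward, unscale newly produced grads in place; where a
# grad was stashed from an earlier backward pass in the same iteration, accumulate the
# new (unscaled) contribution into it via unscale_with_stashed.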
def post_backward_no_master_weights(self, scaler):
stash = self._amp_stash
split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
(stash.all_fp32_params, stash.all_fp32_grad_stash))
for params, stashed_grads in split_types:
# This is a lot of python overhead...
grads_needing_unscale = []
grads_needing_unscale_with_stash = []
stashed = []
for param, stashed_grad in zip(params, stashed_grads):
if param.grad is None and stashed_grad is not None:
param.grad = stashed_grad
elif param.grad is not None and stashed_grad is None:
grads_needing_unscale.append(param.grad)
elif param.grad is not None and stashed_grad is not None:
grads_needing_unscale_with_stash.append(param.grad)
stashed.append(stashed_grad)
else: # param.grad is None and stashed_grad is None
continue
if len(grads_needing_unscale) > 0:
scaler.unscale(
grads_needing_unscale,
grads_needing_unscale,
scaler.loss_scale(),
models_are_masters=True)
if len(grads_needing_unscale_with_stash) > 0:
scaler.unscale_with_stashed(
grads_needing_unscale_with_stash,
stashed,
grads_needing_unscale_with_stash)
# Clear the stash.
for i in range(len(stashed_grads)):
stashed_grads[i] = None
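# Copy FP32 master params back into the FP16 model params after step(), using the fused
# multi_tensor_scale kernel when available and a per-group Python fallback otherwise.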
def _master_params_to_model_params(self):
stash = self._amp_stash
if multi_tensor_applier.available:
if len(stash.all_fp16_params) > 0:
multi_tensor_applier(
stash.multi_tensor_scale,
stash.dummy_overflow_buf,
[stash.all_fp32_from_fp16_params, stash.all_fp16_params],
1.0)
else:
for fp16_group, fp32_from_fp16_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups):
master_params_to_model_params(fp16_group, fp32_from_fp16_group)
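# Patches an incoming optimizer in place: attaches an _amp_stash, grabs the fused kernels
# when available, and binds the lazy-init and _prepare/_post_amp_backward hooks (plus
# patched step/zero_grad on the master-weights path) appropriate to properties.master_weights.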
def _process_optimizer(optimizer, properties):
if hasattr(optimizer, "_amp_stash"):
raise RuntimeError("A given optimizer should only be passed through amp.initialize once.")
else:
optimizer._amp_stash = AmpOptimizerState()
optimizer._amp_stash.lazy_init_called = False
optimizer._amp_stash.already_patched = False
for name in ("_lazy_init_maybe_master_weights",
"_master_params_to_model_params",
"_prepare_amp_backward",
"_post_amp_backward"):
if hasattr(optimizer, name):
raise RuntimeError("Incoming optimizer already has {} defined.".format(name))
# TODO: Centralize exposure and import error checking for the C backend.
if multi_tensor_applier.available:
import amp_C
optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale
optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0])
if properties.master_weights:
optimizer._lazy_init_maybe_master_weights = types.MethodType(
lazy_init_with_master_weights, optimizer)
optimizer._master_params_to_model_params = types.MethodType(
_master_params_to_model_params, optimizer)
old_step = optimizer.step
def new_step(self):
retval = old_step()
self._master_params_to_model_params()
# Clear the master grads that wouldn't be zeroed by model.zero_grad()
for param in self._amp_stash.all_fp32_from_fp16_params:
param.grad = None
return retval
optimizer.step = types.MethodType(new_step, optimizer)
old_zero_grad = optimizer.zero_grad
def new_zero_grad(self):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
# Zero the model grads.
for param in stash.all_fp16_params:
if param.grad is not None:
param.grad.detach_()
param.grad.zero_()
for param in stash.all_fp32_from_fp32_params:
if param.grad is not None:
param.grad.detach_()
param.grad.zero_()
# Clear the master grads that are independent of model grads
for param in self._amp_stash.all_fp32_from_fp16_params:
param.grad = None
optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_with_master_weights, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_with_master_weights, optimizer)
else:
optimizer._lazy_init_maybe_master_weights = types.MethodType(
lazy_init_no_master_weights, optimizer)
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_no_master_weights, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_no_master_weights, optimizer)
return optimizer
......@@ -53,12 +53,13 @@ class Properties(object):
# print("setting {} {}".format(name, value))
if name == "cast_model_type":
if self.opt_level == "O1" and value is not None:
if value is not torch.float32:
warn_or_err("O1 inserts casts around Torch functions rather than "
"model weights, so with O1, the model weights themselves "
"should remain FP32. If you wish to cast the model to a "
"different type, use opt_level='O2' or 'O3'. " +
"cast_model_type was {}".format(value))
if value is not False:
if value is not torch.float32:
warn_or_err("O1 inserts casts around Torch functions rather than "
"model weights, so with O1, the model weights themselves "
"should remain FP32. If you wish to cast the model to a "
"different type, use opt_level='O2' or 'O3'. " +
"cast_model_type was {}".format(value))
self.options[name] = value
elif name == "patch_torch_functions":
if self.opt_level != "O1" and value:
......@@ -200,20 +201,28 @@ def initialize(
keep_batchnorm_fp32=None,
master_weights=None,
loss_scale=None,
num_losses=1,
verbosity=1,
):
"""
Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
chosen ``opt_level`` and overridden properties, if any.
``amp.initialize`` must be called **after** you have finished constructing your model(s) and
``amp.initialize`` should be called **after** you have finished
constructing your model(s) and
optimizer(s), but **before** you send your model through any DistributedDataParallel wrapper.
See `Distributed training`_ in the Imagenet example.
Currently, ``amp.initialize`` should only be called **once**,
although it can process an arbitrary number of
models and optimizers (see the corresponding `Advanced Amp Usage topic`_).
If you think your use case requires ``amp.initialize`` to be called more than once,
`let us know`_.
Any property keyword argument that is not ``None`` will be interpreted as a manual override.
To prevent having to rewrite anything else in your script, name the returned models/optimizers
to replace the passed models/optimizers, as in the Usage below.
to replace the passed models/optimizers, as in the code sample below.
Args:
models (torch.nn.Module or list of torch.nn.Modules): Models to modify/cast.
......@@ -229,8 +238,15 @@ def initialize(
keep_batchnorm_fp32 (bool or str, optional, default=None): Optional property override. If
passed as a string, must be the string "True" or "False".
master_weights (bool, optional, default=None): Optional property override.
loss_scale (float or str, default=None): Optional property override. If passed as a string,
loss_scale (float or str, optional, default=None): Optional property override. If passed as a string,
must be a string representing a number, e.g., "128.0", or the string "dynamic".
num_losses (int, optional, default=1): Option to tell Amp in advance how many losses/backward
passes you plan to use. When used in conjunction with the ``loss_id`` argument to
``amp.scale_loss``, enables Amp to use a different loss scale per loss/backward pass,
which can improve stability. See "Multiple models/optimizers/losses"
under `Advanced Amp Usage`_ for examples, and the sketch below. If ``num_losses`` is left at its
default of 1, Amp will still support multiple losses/backward passes, but will use a single
global loss scale for all of them.
verbosity (int, default=1): Set to 0 to suppress Amp-related output.
Returns:
......@@ -238,7 +254,7 @@ def initialize(
If either the ``models`` or ``optimizers`` args were lists, the corresponding return value will
also be a list.
Usage::
Permissible invocations::
model, optim = amp.initialize(model, optim,...)
model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
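A minimal sketch of per-loss loss scaling (optimizer and loss names are illustrative)::

    model, [optim0, optim1] = amp.initialize(model, [optim0, optim1], opt_level="O1", num_losses=2)

    with amp.scale_loss(loss0, optim0, loss_id=0) as scaled_loss:
        scaled_loss.backward()

    with amp.scale_loss(loss1, optim1, loss_id=1) as scaled_loss:
        scaled_loss.backward()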
......@@ -268,6 +284,15 @@ def initialize(
.. _`Imagenet example`:
https://github.com/NVIDIA/apex/tree/master/examples/imagenet
.. _`Advanced Amp Usage`:
https://nvidia.github.io/apex/advanced.html
.. _`Advanced Amp Usage topic`:
https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses
.. _`let us know`:
https://github.com/NVIDIA/apex/issues
"""
_amp_state.opt_properties = Properties()
_amp_state.verbosity = verbosity
......@@ -308,7 +333,7 @@ def initialize(
for k, v in _amp_state.opt_properties.options.items():
maybe_print("{:22} : {}".format(k, v), True)
return _initialize(models, optimizers, _amp_state.opt_properties)
return _initialize(models, optimizers, _amp_state.opt_properties, num_losses)
# TODO: is this necessary/useful?
......
......@@ -13,7 +13,8 @@ from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
# There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls.
@contextlib.contextmanager
def scale_loss(loss,
optimizer,
optimizers,
loss_id=0,
model=None,
delay_unscale=False):
"""
......@@ -44,18 +45,29 @@ def scale_loss(loss,
manager yields is simply ``loss.float()*loss_scale``, so in principle
``loss`` could have more than one element, as long as you call
``backward()`` on ``scaled_loss`` appropriately within the context manager body.
optimizer: Must be an optimizer returned from an earlier call to ``amp.initialize``.
optimizers: All optimizer(s) for which the current backward pass is creating gradients.
Must be an optimizer or list of optimizers returned from an earlier call
to ``amp.initialize``. For example usage with multiple optimizers, see
"Multiple models/optimizers/losses" under `Advanced Amp Usage`_ (a minimal sketch also
appears below).
loss_id(int, optional, default=0): When used in conjunction with the ``num_losses`` argument
to ``amp.initialize``, enables Amp to use a different loss scale per loss. ``loss_id``
must be an integer between 0 and ``num_losses - 1`` that tells Amp which loss is
being used for the current backward pass. See "Multiple models/optimizers/losses"
under `Advanced Amp Usage`_ for examples. If ``loss_id`` is left unspecified, Amp
will use the default global loss scaler for this backward pass.
model(torch.nn.Module, optional, default=None): Currently unused, reserved to enable future
optimizations.
delay_unscale(bool, default=False): Don't unscale the gradients or perform model->master
gradient copies on context manager exit. `Advanced Amp Usage`_ illustrates
situations where this is necessary.
delay_unscale(bool, optional, default=False): ``delay_unscale`` is a ninja option that only
serves as a minor performance optimization, so only use it if you know what you're doing.
If ``True``, Amp will not unscale the gradients or perform model->master
gradient copies on context manager exit.
"Gradient accumulation across iterations" under `Advanced Amp Usage`_
illustrates a situation where this CAN (but does not need to) be used.
.. warning::
If ``delay_unscale`` is ``True`` for a given backward pass, ``optimizer.step()`` cannot be
called after that context manager exits; it must wait until a later backward pass whose
context manager is entered with ``delay_unscale`` left at the default ``False``.
See `Advanced Amp Usage`_ for examples.
.. _`Advanced Amp Usage`:
https://nvidia.github.io/apex/advanced.html
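A minimal sketch of a backward pass whose gradients flow into params owned by two optimizers
(names are illustrative)::

    with amp.scale_loss(loss, [optim0, optim1]) as scaled_loss:
        scaled_loss.backward()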
......@@ -64,18 +76,19 @@ def scale_loss(loss,
yield loss
return
if optimizer.loss_scaler is None:
raise RuntimeError("optimizer passed to scale_loss does not have a loss_scaler.")
if isinstance(optimizers, torch.optim.Optimizer):
optimizers = [optimizers]
# This is what happens when I have to support tools from different sources under the same API...
# TODO: Rewrite FusedAdam to use multi-tensor apply and the same loss scaler.
if isinstance(optimizer, FP16_Optimizer_for_fused):
loss_scale = optimizer.cur_scale
if isinstance(optimizers, FP16_Optimizer_for_fused):
loss_scale = optimizers.cur_scale
else:
loss_scale = optimizer.loss_scaler.loss_scale()
loss_scaler = _amp_state.loss_scalers[loss_id]
loss_scale = loss_scaler.loss_scale()
if ((not _amp_state.opt_properties.master_weights)
and (not optimizer.loss_scaler.dynamic)
and (not loss_scaler.dynamic)
and loss_scale == 1.0):
yield loss.float()
# Needing to drop the cache here as well is an ugly gotcha.
......@@ -85,34 +98,42 @@ def scale_loss(loss,
_amp_state.handle._clear_cache()
return
if isinstance(optimizers, list):
for optimizer in optimizers:
optimizer._prepare_amp_backward()
yield (loss.float())*loss_scale
# This isn't pretty, but it unifies things. Once I deprecate the old API entirely,
# I will have freedom to clean this up. Maybe instead of wrapping optimizers,
# I can simply construct a set of attributes (e.g. master params) and assign them
# directly to optimizer instances.
if not delay_unscale:
# The FP16_Optimizer for FusedAdam will take care of unscaling as part of
# its step() method.
if not isinstance(optimizer, FP16_Optimizer_for_fused):
if isinstance(optimizer, FP16_Optimizer_general):
optimizer.update_master_grads()
else:
optimizer.loss_scaler.clear_overflow_state()
optimizer.loss_scaler.unscale(
master_params(optimizer),
master_params(optimizer),
loss_scale)
# For future fused optimizers that enable sync-free dynamic loss scaling,
# should_skip will always be False.
should_skip = optimizer.loss_scaler.update_scale()
if should_skip:
optimizer_step = optimizer.step
def skip_step():
maybe_print("Gradient overflow. Skipping step, reducing " +
"loss scale to {}".format(optimizer.loss_scaler.loss_scale()))
optimizer.step = optimizer_step
optimizer.step = skip_step
# FusedAdam and FusedSGD will take care of unscaling as part of their step() methods.
if not isinstance(optimizers, FP16_Optimizer_for_fused):
loss_scaler.clear_overflow_state()
for optimizer in optimizers:
optimizer._post_amp_backward(loss_scaler)
# For future fused optimizers that enable sync-free dynamic loss scaling,
# should_skip will always be False.
should_skip = loss_scaler.update_scale()
if should_skip:
for optimizer in optimizers:
if not optimizer._amp_stash.already_patched:
# Close on loss_scaler and loss_id as well, to be safe. Probably not
# necessary because amp.scale_loss is already creating a temporary scope.
def patch_step(opt, loss_scaler, loss_id):
opt_step = opt.step
def skip_step():
maybe_print(("Gradient overflow. Skipping step, loss scaler " +
"{} reducing loss scale to {}").format(loss_id,
loss_scaler.loss_scale()))
if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"):
# Clear the master grads that wouldn't be zeroed by model.zero_grad()
for param in opt._amp_stash.all_fp32_from_fp16_params:
param.grad = None
opt.step = opt_step
opt._amp_stash.already_patched = False
return skip_step
optimizer.step = patch_step(optimizer, loss_scaler, loss_id)
optimizer._amp_stash.already_patched = True
# Probably ok to skip this if not delay_unscale
if _amp_state.opt_properties.patch_torch_functions:
_amp_state.handle._clear_cache()
......@@ -151,6 +172,10 @@ class AmpHandle(object):
@contextlib.contextmanager
def scale_loss(self, loss, optimizer):
raise RuntimeError("The old Amp API is no longer supported. Please move to the new API, "
"documented here: https://nvidia.github.io/apex/amp.html. Transition guide: "
"https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users")
if not self.is_active():
yield loss
return
......
......@@ -3,7 +3,7 @@ from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import _amp_state, master_params, maybe_print
from itertools import product
def scale_check_overflow_python(model_grad, scale, master_grad, check_overflow=False):
def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
# Exception handling for 18.04 compatibility
if check_overflow:
cpu_sum = float(model_grad.float().sum())
......@@ -16,6 +16,21 @@ def scale_check_overflow_python(model_grad, scale, master_grad, check_overflow=F
master_grad.mul_(scale)
return False
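# Python fallback for the fused axpby kernel: checks the incoming (scaled) model grad for
# infs/NaNs, then forms master_grad = stashed_grad + model_grad*scale (callers pass
# scale = 1./loss_scale), accumulating the new contribution into the stashed gradient.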
def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, scale, check_overflow=False):
# Exception handling for 18.04 compatibility
if check_overflow:
cpu_sum = float(model_grad.float().sum())
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
return True
# if master_grad is not model_grad: # copy_ probably internally short-circuits this
# master_grad.copy_(model_grad)
assert stashed_grad.dtype == master_grad.dtype
converted_model_grad = model_grad.to(master_grad.dtype)
stashed_grad.add_(scale, converted_model_grad)
master_grad.data = stashed_grad.data
return False
class LossScaler(object):
warned_no_fused_kernel = False
warned_unscaling_non_fp32_grad = False
......@@ -41,6 +56,7 @@ class LossScaler(object):
import amp_C
LossScaler.has_fused_kernel = multi_tensor_applier.available
LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale
LossScaler.multi_tensor_axpby_cuda = amp_C.multi_tensor_axpby
else:
if not LossScaler.warned_no_fused_kernel:
maybe_print(
......@@ -55,108 +71,124 @@ class LossScaler(object):
def loss_scale(self):
return self._loss_scale
def unscale_grads_python(self, model_grads, master_grads, scale):
def unscale_python(self, model_grads, master_grads, scale):
for model, master in zip(model_grads, master_grads):
if model is not None:
if not LossScaler.warned_unscaling_non_fp32_grad:
if master.type() != "torch.cuda.FloatTensor":
if master.dtype != torch.float32:
maybe_print(
"Attempting to unscale a grad with type {} ".format(master.type()) +
"Unscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
LossScaler.warned_unscaling_non_fp32_grad = True
self._has_overflow = scale_check_overflow_python(
model,
1./scale,
master,
self.dynamic)
self._has_overflow = scale_check_overflow_python(model,
master,
1./scale,
self.dynamic)
if self._has_overflow and self.dynamic:
break
def clear_overflow_state(self):
self._has_overflow = False
if self.has_fused_kernel:
self._overflow_buf.zero_()
def unscale(self, model_params, master_params, scale):
# unused_scale keeps some of the old API alive for hopefully a short time.
def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False):
if self._has_overflow:
return
# Lots of defensive list processing going on here. Far less efficient than
# consuming the iterator directly. Need to examine Python overhead.
model_master_params = [(model, master) for model, master
in zip(model_params, master_params)] # some of these may be None
scale = self._loss_scale
if scale == 1.0 and models_are_masters and not self.dynamic:
return
if LossScaler.has_fused_kernel:
# TODO: Make these lists permanent attributes of self, so they don't need to be created
# or garbage collected. Profiler shows that garbage collection overhead may be
# substantial (200-300 usec).
# This may be tricky because right now the lists need to be packed densely.
# Maybe this could be handled within the multi_tensor_apply wrapper
# (allow some Tensors to be None using at::optional).
src_dst_pairs = {torch.float16 : {torch.float16 : [[],[]], torch.float32 : [[],[]]},
torch.float32 : {torch.float16 : [[],[]], torch.float32 : [[],[]]}}
for model, master in model_master_params:
# Sync the None-ness of model and master params
if model.grad is None and master.grad is not None:
master.grad = None
if model.grad is not None and master.grad is None:
master.grad = torch.empty_like(master)
if model.grad is not None:
if model.grad is master.grad and scale == 1.0 and not self.dynamic:
continue
else:
src_dst_pairs[model.dtype][master.dtype][0].append(model.grad.data)
src_dst_pairs[model.dtype][master.dtype][1].append(master.grad.data)
assert len(src_dst_pairs[torch.float32][torch.float16][0]) == 0, "The loss scaler is "\
"being asked to unscale FP32 model gradients into FP16 master gradients. This is "\
"almost certainly an error."
for src, dst in product((torch.float16, torch.float32),
(torch.float16, torch.float32)):
if len(src_dst_pairs[src][dst][0]) > 0:
if not LossScaler.warned_unscaling_non_fp32_grad and dst is torch.float16:
print("Warning: unscaling grads that are not FP32. "
"Unscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
# Setting this to True unconditionally allows the possibility of an escape
# if never-before-seen non-fp32 grads are created in some later iteration.
# if (not LossScaler.warned_unscaling_non_fp32_grad
# and master_grads[0].dtype == torch.float16):
# print("Warning: unscaling grads that are not FP32. "
# "Unscaling non-fp32 grads may indicate an error. "
# "When using Amp, you don't need to call .half() on your model.")
# # Setting this to True unconditionally allows the possibility of an escape
# # if never-before-seen non-fp32 grads are created in some later iteration.
# LossScaler.warned_unscaling_non_fp32_grad = True
multi_tensor_applier(LossScaler.multi_tensor_scale_cuda,
self._overflow_buf,
[model_grads, master_grads],
1./scale)
else:
self.unscale_python(model_grads, master_grads, scale)
# Defer to update_scale
# If the fused kernel is available, we only need one D2H memcopy and sync.
# if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
# self._has_overflow = self._overflow_buf.item()
def unscale_with_stashed_python(self,
model_grads,
stashed_master_grads,
master_grads,
scale):
for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads):
if model is None and stashed is None:
continue
else:
if not LossScaler.warned_unscaling_non_fp32_grad:
if master.dtype != torch.float32:
maybe_print(
"Attempting to unscale a grad with type {} ".format(master.type()) +
"Unscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
LossScaler.warned_unscaling_non_fp32_grad = True
multi_tensor_applier(
LossScaler.multi_tensor_scale_cuda,
self._overflow_buf,
src_dst_pairs[src][dst],
1./scale)
self._has_overflow = axpby_check_overflow_python(model,
stashed,
master,
1./scale,
self.dynamic)
if self._has_overflow and self.dynamic:
break
def unscale_with_stashed(self,
model_grads,
stashed_master_grads,
master_grads):
if self._has_overflow:
return
scale = self._loss_scale
if LossScaler.has_fused_kernel:
if (not LossScaler.warned_unscaling_non_fp32_grad
and master_grads[0].dtype == torch.float16):
print("Warning: unscaling grads that are not FP32. "
"Unscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
# Setting this to True unconditionally allows the possibility of an escape
# if never-before-seen non-fp32 grads are created in some later iteration.
LossScaler.warned_unscaling_non_fp32_grad = True
multi_tensor_applier(LossScaler.multi_tensor_axpby_cuda,
self._overflow_buf,
[model_grads, stashed_master_grads, master_grads],
1./scale,
1.0,
0) # check only arg 0, aka the incoming model grads, for infs
else:
# Sync the None-ness of model and master params.
all_same = True
for model, master in model_master_params:
if model.grad is None and master.grad is not None:
master.grad = None
if model.grad is not None and master.grad is None:
master.grad = torch.empty_like(master)
if model.grad is not master.grad:
all_same = False
if scale == 1.0 and all_same and not self.dynamic:
return
# TODO: Make these lists permanent attributes of self, so they don't need to be created
# or garbage collected?
model_grads = [mmp[0].grad.data for mmp in model_master_params if mmp[0].grad is not None]
master_grads = [mmp[1].grad.data for mmp in model_master_params if mmp[1].grad is not None]
self.unscale_grads_python(model_grads, master_grads, scale)
self.unscale_with_stashed_python(model_grads,
stashed_master_grads,
master_grads,
scale)
# Defer to update_scale
# If the fused kernel is available, we only need one D2H memcopy and sync.
if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
self._has_overflow = self._overflow_buf.item()
# if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
# self._has_overflow = self._overflow_buf.item()
def clear_overflow_state(self):
self._has_overflow = False
if self.has_fused_kernel:
self._overflow_buf.zero_()
# Separate so unscale() can be called more than once before updating.
def update_scale(self):
# If the fused kernel is available, we only need one D2H memcopy and sync.
if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
self._has_overflow = self._overflow_buf.item()
if self._has_overflow and self.dynamic:
should_skip = True
self._loss_scale /= 2.
......
......@@ -540,18 +540,37 @@ class FP16_Optimizer(object):
if len(self.all_fp16_params) > 0:
# print("Model grads before")
# print([param.grad.data for param in self.all_fp16_params])
# I'm ONLY writing this as an incremental way to make some tests pass until
# I can refactor the tests as well.
# FP16_Optimizer should not be used by anyone.
model_grads = []
master_grads = []
for model_param, master_param in zip(self.all_fp16_params,
self.all_fp32_from_fp16_params):
if model_param.grad is not None:
model_grads.append(model_param.grad)
if master_param.grad is None:
master_param.grad = torch.empty_like(master_param)
master_grads.append(master_param.grad)
self.loss_scaler.unscale(
self.all_fp16_params,
self.all_fp32_from_fp16_params,
model_grads,
master_grads,
self.loss_scaler.loss_scale())
# print("Master grads after")
# print([param.grad.data for param in self.all_fp32_from_fp16_params])
if len(self.all_fp32_from_fp32_params) > 0:
model_grads = []
master_grads = []
for model_param, master_param in zip(self.all_fp32_from_fp32_params,
self.all_fp32_from_fp32_params):
if model_param.grad is not None:
model_grads.append(model_param.grad)
master_grads.append(master_param.grad)
# print("Model grads before")
# print([param.grad.data for param in self.all_fp32_from_fp32_params])
self.loss_scaler.unscale(
self.all_fp32_from_fp32_params,
self.all_fp32_from_fp32_params,
model_grads,
master_grads,
self.loss_scaler.loss_scale())
# print("Master grads after")
# print([param.grad.data for param in self.all_fp32_from_fp32_params])
......
......@@ -11,7 +11,8 @@ void multi_tensor_axpby_cuda(
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
float a,
float b);
float b,
int arg_to_check);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("multi_tensor_scale", &multi_tensor_scale_cuda,
......
......@@ -21,7 +21,8 @@ struct AxpbyFunctor
volatile int* noop_gmem,
TensorListMetadata<3>& tl,
float a,
float b)
float b,
int arg_to_check)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
......@@ -68,13 +69,18 @@ struct AxpbyFunctor
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
if(isfinite(xs[ii]) && isfinite(ys[ii]))
out[i] = static_cast<out_t>(a*xs[ii] + b*ys[ii]);
else
{
out[i] = static_cast<out_t>(a*xs[ii] + b*ys[ii]);
{
out[i] = static_cast<out_t>(a*xs[ii] + b*ys[ii]);
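// arg_to_check selects which operand is screened for infs/nans:
// -1 checks both x and y, 0 checks only x, 1 checks only y.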
bool finite = true;
if(arg_to_check == -1)
finite = (isfinite(xs[ii]) && isfinite(ys[ii]));
if(arg_to_check == 0)
finite = isfinite(xs[ii]);
if(arg_to_check == 1)
finite = isfinite(ys[ii]);
if(!finite)
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
}
}
}
}
}
......@@ -85,7 +91,8 @@ void multi_tensor_axpby_cuda(
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
float a,
float b)
float b,
int arg_to_check)
{
using namespace at;
// The output (downscaled) type is always float.
......@@ -102,7 +109,8 @@ void multi_tensor_axpby_cuda(
tensor_lists,
AxpbyFunctor<scalar_t_0, scalar_t_1, scalar_t_2>(),
a,
b); )))
b,
arg_to_check); )))
AT_CUDA_CHECK(cudaGetLastError());
......
......@@ -65,13 +65,11 @@ struct ScaleFunctor
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
if(isfinite(incoming_vals[ii]))
out[i] = static_cast<out_t>(incoming_vals[ii]*scale);
else
{
out[i] = static_cast<out_t>(incoming_vals[ii]*scale);
{
out[i] = static_cast<out_t>(incoming_vals[ii]*scale);
if(!isfinite(incoming_vals[ii]))
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
}
}
}
}
}
......
......@@ -68,8 +68,11 @@ Forcing particular layers/functions to a desired type
I'm still working on a generalizable exposure for this that won't require user-side code divergence
across different ``opt-level``\ s.
Multiple models/optimizers
--------------------------
Multiple models/optimizers/losses
---------------------------------
Initialization with multiple models/optimizers
**********************************************
``amp.initialize``'s optimizer argument may be a single optimizer or a list of optimizers,
as long as the output you accept has the same type.
......@@ -77,35 +80,88 @@ Similarly, the ``model`` argument may be a single model or a list of models, as
output matches. The following calls are all legal::
model, optim = amp.initialize(model, optim,...)
model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
[model1, model2], optim = amp.initialize([model1, model2], optim,...)
[model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...)
model, [optim0, optim1] = amp.initialize(model, [optim0, optim1],...)
[model0, model1], optim = amp.initialize([model0, model1], optim,...)
[model0, model1], [optim0, optim1] = amp.initialize([model0, model1], [optim0, optim1],...)
Whenever you invoke a backward pass, the optimizer you should pass to ``amp.scale_loss`` is whatever
optimizer owns the parameters for which this particular backward pass is creating gradients.
Backward passes with multiple optimizers
****************************************
Multiple backward passes per iteration
--------------------------------------
Whenever you invoke a backward pass, the ``amp.scale_loss`` context manager must receive
**all the optimizers that own any params for which the current backward pass is creating gradients.**
This is true even if each optimizer owns only some, but not all, of the params that are about to
receive gradients.
If you want to accumulate gradients from multiple losses for the params owned by a given optimizer,
you must invoke ``with amp.scale_loss(..., delay_unscale=True)`` for all backward passes except
the last::
If, for a given backward pass, there's only one optimizer whose params are about to receive gradients,
you may pass that optimizer directly to ``amp.scale_loss``. Otherwise, you must pass the
list of optimizers whose params are about to receive gradients::
# delay_unscale=True for the first two losses
with amp.scale_loss(loss1, optimizer, delay_unscale=True) as scaled_loss:
# loss0 accumulates gradients only into params owned by optim0:
with amp.scale_loss(loss0, optim0) as scaled_loss:
scaled_loss.backward()
with amp.scale_loss(loss2, optimizer, delay_unscale=True) as scaled_loss:
# loss1 accumulates gradients only into params owned by optim1:
with amp.scale_loss(loss1, optim1) as scaled_loss:
scaled_loss.backward()
# Don't delay_unscale for the final loss
with amp.scale_loss(loss3, optimizer) as scaled_loss:
# loss2 accumulates gradients into some params owned by optim0
# and some params owned by optim1
with amp.scale_loss(loss2, [optim0, optim1]) as scaled_loss:
scaled_loss.backward()
optimizer.step()
Optionally have Amp use a different loss scaler per-loss
********************************************************
By default, Amp maintains a single global loss scaler that will be used for all backward passes
(all invocations of ``with amp.scale_loss(...)``). No additional arguments to ``amp.initialize``
or ``amp.scale_loss`` are required to use the global loss scaler. The code snippets above with
multiple optimizers/backward passes use the single global loss scaler under the hood,
and they should "just work."
However, you can optionally tell Amp to maintain a loss scaler per-loss, which gives Amp increased
numerical flexibility. This is accomplished by supplying the ``num_losses`` argument to
``amp.initialize`` (which tells Amp how many backward passes you plan to invoke, and therefore
how many loss scalers Amp should create), then supplying the ``loss_id`` argument to each of your
backward passes (which tells Amp the loss scaler to use for this particular backward pass)::
model, [optim0, optim1] = amp.initialize(model, [optim0, optim1], ..., num_losses=3)
with amp.scale_loss(loss0, optim0, loss_id=0) as scaled_loss:
scaled_loss.backward()
with amp.scale_loss(loss1, optim1, loss_id=1) as scaled_loss:
scaled_loss.backward()
with amp.scale_loss(loss2, [optim0, optim1], loss_id=2) as scaled_loss:
scaled_loss.backward()
``num_losses`` and ``loss_id``\ s should be specified purely based on the set of
losses/backward passes. The use of multiple optimizers, or association of single or
multiple optimizers with each backward pass, is unrelated.
Gradient accumulation across iterations
---------------------------------------
Pass ``delay_unscale=True`` to ``amp.scale_loss`` until you're ready to ``step()``::
The following should "just work," and properly accommodate multiple models/optimizers/losses, as well as
gradient clipping via the `instructions above`_::
if iter%iters_to_accumulate == 0:
# Every iters_to_accumulate iterations, unscale and step
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
# Gradient clipping if desired:
# torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm)
optimizer.step()
optimizer.zero_grad()
else:
# Otherwise, accumulate gradients, don't unscale or step.
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
As a minor performance optimization, you can pass ``delay_unscale=True``
to ``amp.scale_loss`` until you're ready to ``step()``. You should only attempt ``delay_unscale=True``
if you're sure you know what you're doing, because the interaction with gradient clipping and
multiple models/optimizers/losses can become tricky::
if iter%iters_to_accumulate == 0:
# Every iters_to_accumulate iterations, unscale and step
......@@ -114,10 +170,12 @@ Pass ``delay_unscale=True`` to ``amp.scale_loss`` until you're ready to ``step()
optimizer.step()
optimizer.zero_grad()
else:
# Otherwise, just accumulate gradients, don't unscale or step.
# Otherwise, accumulate gradients, don't unscale or step.
with amp.scale_loss(loss, optimizer, delay_unscale=True) as scaled_loss:
scaled_loss.backward()
.. _`instructions above`:
https://nvidia.github.io/apex/advanced.html#gradient-clipping
Custom data batch types
-----------------------
......
......@@ -15,7 +15,10 @@ is under construction.
If you already implemented Amp based on the instructions below, but it isn't behaving as expected,
please review `Advanced Amp Usage`_ to see if any topics match your use case. If that doesn't help,
file an issue.
`file an issue`_.
.. _`file an issue`:
https://github.com/NVIDIA/apex/issues
``opt_level``\ s and Properties
-------------------------------
......@@ -109,9 +112,8 @@ Your incoming model should be FP32 already, so this is likely a no-op.
|
|
``O1``: Conservative Mixed Precision
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``O1``: Mixed Precision (recommended for typical use)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Patch all Torch functions and Tensor methods to cast their inputs according to a whitelist-blacklist
model. Whitelist ops (for example, Tensor Core-friendly ops like GEMMs and convolutions) are performed
in FP16. Blacklist ops that benefit from FP32 precision (for example, softmax)
......@@ -126,8 +128,8 @@ are performed in FP32. ``O1`` also uses dynamic loss scaling, unless overridden
|
|
``O2``: Fast Mixed Precision
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``O2``: "Almost FP16" Mixed Precision
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``O2`` casts the model weights to FP16,
patches the model's ``forward`` method to cast input
data to FP16, keeps batchnorms in FP32, maintains FP32 master weights,
......
......@@ -4,6 +4,7 @@ import functools as ft
import itertools as it
from apex import amp
from apex.amp import _amp_state
import torch
from torch import nn
import torch.nn.functional as F
......@@ -60,24 +61,27 @@ class PromoteModule(torch.nn.Module):
class TestCache(unittest.TestCase):
def setUp(self):
self.handle = amp.init(enabled=True)
self.x = torch.ones((2, 8), device='cuda', dtype=torch.float32)
common_init(self)
def tearDown(self):
self.handle._deactivate()
pass
def train_eval_train_test(self, module, t):
model = module(t).cuda()
dummy_optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)
_amp_state.allow_incoming_model_not_fp32 = True
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
_amp_state.allow_incoming_model_not_fp32 = False
def training_step():
for param in model.parameters():
param.grad = None
loss = model(self.x).sum()
self.handle._default_scaler._loss_scale = 4.0
with self.handle.scale_loss(loss, dummy_optimizer) as scaled_loss:
_amp_state.loss_scalers[0]._loss_scale = 4.0
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
self.assertEqual(len([p.grad for p in model.parameters() if p.grad is not None]), 1)
......@@ -105,6 +109,8 @@ class TestCache(unittest.TestCase):
# Simulates resuming training after eval
training_step()
_amp_state.handle._deactivate()
# I could easily have these as a set of for loops in a single test,
# instead of going for granularity.
......
......@@ -54,7 +54,7 @@ class TestMultiTensorAxpby(unittest.TestCase):
else:
out_list = [out.clone().to(out_type)*3.0 for out in y_list]
applier(multi_tensor_axpby, self.overflow_buf, [x_list, y_list, out_list], self.a, self.b)
applier(multi_tensor_axpby, self.overflow_buf, [x_list, y_list, out_list], self.a, self.b, -1)
self.assertTrue(all([torch.allclose(out, self.ref.to(out_type)) for out in out_list]),
msg="{} {} {} {} {} {} {}".format(sizea, sizeb, repeat_tensors,
......
......@@ -6,6 +6,7 @@ parser.add_argument('--opt-level', type=str)
parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
parser.add_argument('--loss-scale', type=str, default=None)
parser.add_argument('--fused-adam', action='store_true')
parser.add_argument('--use_baseline', action='store_true')
args = parser.parse_args()
base_file = str(args.opt_level) + "_" +\
......@@ -15,24 +16,49 @@ base_file = str(args.opt_level) + "_" +\
file_e = "True_" + base_file
file_p = "False_" + base_file
if args.use_baseline:
file_b = "baselines/True_" + base_file
dict_e = torch.load(file_e)
dict_p = torch.load(file_p)
if args.use_baseline:
dict_b = torch.load(file_b)
torch.set_printoptions(precision=10)
print(file_e)
print(file_p)
if args.use_baseline:
print(file_b)
for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
loss_e = dict_e["Loss"][n]
loss_p = dict_p["Loss"][n]
assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
i_e,
loss_e,
loss_p,
dict_e["Speed"][n],
dict_p["Speed"][n]))
# Ugly duplication here...
if not args.use_baseline:
for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
loss_e = dict_e["Loss"][n]
loss_p = dict_p["Loss"][n]
assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
i_e,
loss_e,
loss_p,
dict_e["Speed"][n],
dict_p["Speed"][n]))
else:
for n, (i_e, i_p) in enumerate(zip(dict_e["Iteration"], dict_p["Iteration"])):
assert i_e == i_p, "i_e = {}, i_p = {}".format(i_e, i_p)
loss_e = dict_e["Loss"][n]
loss_p = dict_p["Loss"][n]
loss_b = dict_b["Loss"][n]
assert loss_e == loss_p, "Iteration {}, loss_e = {}, loss_p = {}".format(i_e, loss_e, loss_p)
assert loss_e == loss_b, "Iteration {}, loss_e = {}, loss_b = {}".format(i_e, loss_e, loss_b)
print("{:4} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f} {:15.10f}".format(
i_e,
loss_b,
loss_e,
loss_p,
dict_b["Speed"][n],
dict_e["Speed"][n],
dict_p["Speed"][n]))
......@@ -7,12 +7,13 @@ print_banner() {
print_banner "Distributed status: $1"
echo $2
if [ -n "$2" ]
DATADIR=$2
if [ -n "$3" ]
then
DATADIR="$2"
USE_BASELINE=""
else
# DATADIR="/home/mcarilli/Desktop/pt18data/apex/examples/imagenet/bare_metal_train_val/"
DATADIR="/opt/home/apex/examples/imagenet/"
USE_BASELINE="--use_baseline"
fi
if [ "$1" == "single_gpu" ]
......@@ -130,7 +131,7 @@ do
fi
echo "${BASE_CMD} --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} [--has-ext] $DATADIR"
set -x
python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm}
python compare.py --opt-level ${opt_level} ${loss_scale} ${keep_batchnorm} --use_baseline
set +x
done
done
......
#!/bin/bash
DATADIR="/home/mcarilli/Desktop/pt18data/apex_stale/examples/imagenet/bare_metal_train_val/"
# DATADIR="/opt/home/apex/examples/imagenet/"
cp ../common/* .
bash run_test.sh single_gpu $1
bash run_test.sh single_gpu $1 $DATADIR yes
import torch
import argparse
import os
from apex import amp
# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
from apex.parallel import DistributedDataParallel
parser = argparse.ArgumentParser()
# FOR DISTRIBUTED: Parse for the local_rank argument, which will be supplied
# automatically by torch.distributed.launch.
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()
# FOR DISTRIBUTED: If we are running under torch.distributed.launch,
# the 'WORLD_SIZE' environment variable will also be set automatically.
args.distributed = False
if 'WORLD_SIZE' in os.environ:
args.distributed = int(os.environ['WORLD_SIZE']) > 1
if args.distributed:
# FOR DISTRIBUTED: Set the device according to local_rank.
torch.cuda.set_device(args.local_rank)
# FOR DISTRIBUTED: Initialize the backend. torch.distributed.launch will provide
# environment variables, and requires that you use init_method=`env://`.
torch.distributed.init_process_group(backend='nccl',
init_method='env://')
torch.manual_seed(torch.distributed.get_rank())
torch.backends.cudnn.benchmark = True
N, D_in, D_out = 64, 1024, 16
# Each process receives its own batch of "fake input data" and "fake target data."
# The "training loop" in each process just uses this fake batch over and over.
# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more realistic
# example of distributed data sampling for both training and validation.
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')
model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
if args.distributed:
# FOR DISTRIBUTED: After amp.initialize, wrap the model with
# apex.parallel.DistributedDataParallel.
model = DistributedDataParallel(model)
# torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
# model = torch.nn.parallel.DistributedDataParallel(model,
# device_ids=[args.local_rank],
# output_device=args.local_rank)
loss_fn = torch.nn.MSELoss()
for t in range(500):
optimizer.zero_grad()
y_pred = model(x)
loss = loss_fn(y_pred, y)
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
optimizer.step()
if args.local_rank == 0:
print("final loss = ", loss)
torch.save(list(model.parameters()), "rank{}model.pth".format(torch.distributed.get_rank()))
torch.save(list(amp.master_params(optimizer)), "rank{}master.pth".format(torch.distributed.get_rank()))