Commit 843cdbe0 authored by Michael Carilli

Merging in master

parents 724672d7 28097c99
......@@ -10,9 +10,7 @@ users as quickly as possible.
# Contents
## 1. Mixed Precision
### Amp: Automatic Mixed Precision
## 1. Amp: Automatic Mixed Precision
`apex.amp` is a tool to enable mixed precision training by changing only 3 lines of your script.
Users can easily experiment with different pure and mixed precision training modes by supplying
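For orientation, the usual three-line pattern looks roughly like this (a sketch only: the model, optimizer, loss, and training loop are assumed to exist already, and `opt_level="O1"` is just one of the available modes):
```
# Let Amp manage casting and loss scaling (pick an opt_level such as "O1").
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
...
# Wrap the backward pass so the loss is scaled and gradients are unscaled afterwards.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
```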
......@@ -78,7 +76,7 @@ It's often convenient to use Apex in Docker containers. Compatible options incl
For performance and full functionality, we recommend installing Apex with
CUDA and C++ extensions via
```
$ git clone apex
$ git clone https://github.com/NVIDIA/apex
$ cd apex
$ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
```
......@@ -95,6 +93,5 @@ A Python-only build omits:
`DistributedDataParallel`, `amp`, and `SyncBatchNorm` will still be usable, but they may be slower.
### Windows support
Windows support is experimental, and Linux is recommended. `python setup.py install --cpp_ext --cuda_ext` may work if you were able to build Pytorch from source
on your system. `python setup.py install` (without CUDA/C++ extensions) is more likely to work. If you installed Pytorch in a Conda environment,
make sure to install Apex in that same environment.
Windows support is experimental, and Linux is recommended. `pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .` may work if you were able to build Pytorch from source
on your system. `pip install -v --no-cache-dir .` (without CUDA/C++ extensions) is more likely to work. If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment.
......@@ -2,4 +2,4 @@ from .amp import init, half_function, float_function, promote_function,\
register_half_function, register_float_function, register_promote_function
from .handle import scale_loss, disable_casts
from .frontend import initialize
from ._amp_state import master_params
from ._amp_state import master_params, _amp_state
......@@ -2,13 +2,29 @@
# I'm a C++ guy, not a python guy. I decided this approach because it seemed most C++-like.
# But apparently it's ok:
# http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
import os
import torch
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
if TORCH_MAJOR == 0:
import collections.abc as container_abcs
else:
from torch._six import container_abcs
class AmpState(object):
def __init__(self):
self.hard_override=False
self.allow_incoming_model_not_fp32 = False
self.verbosity=1
# Attribute stash. Could also just stash things as global module attributes.
_amp_state = AmpState()
def warn_or_err(msg):
if _amp_state.hard_override:
print("Warning: " + msg)
......@@ -18,11 +34,30 @@ def warn_or_err(msg):
# + " If you're sure you know what you're doing, supply " +
# "hard_override=True to amp.initialize.")
distributed = False
if 'WORLD_SIZE' in os.environ:
distributed = int(os.environ['WORLD_SIZE']) > 1
def maybe_print(msg, rank0=False):
if _amp_state.verbosity > 0:
if rank0:
if distributed:
if torch.distributed.get_rank() == 0:
print(msg)
else:
print(msg)
else:
print(msg)
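# Illustrative behavior (not part of the original source): maybe_print("Selected optimization level O1", rank0=True)
# prints once per job under torch.distributed (rank 0 only, when WORLD_SIZE > 1), while rank0=False
# prints on every rank; both are gated by _amp_state.verbosity > 0.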
# def iter_params(param_groups):
# for group in param_groups:
# for p in group['params']:
# yield p
def master_params(optimizer):
"""
Generator expression that iterates over the params owned by ``optimizer``.
......
import torch
from torch._six import container_abcs, string_classes
from torch._six import string_classes
import functools
from ._amp_state import _amp_state, warn_or_err
import numpy as np
import warnings
from ._amp_state import _amp_state, warn_or_err, container_abcs
from .handle import disable_casts
from .scaler import LossScaler
from ._process_optimizer import _process_optimizer
from apex.fp16_utils import convert_network
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
......@@ -15,11 +18,11 @@ def to_type(dtype, t):
if isinstance(t, torch.Tensor):
if not t.is_cuda:
# This should not be a hard error, since it may be legitimate.
print("Warning: An input tensor was not cuda. ")
if t.requires_grad:
# This should be a hard-ish error.
warn_or_err("input data requires grad. Since input data is not a model parameter,\n"
"its gradients will not be properly allreduced by DDP.")
warnings.warn("An input tensor was not cuda.")
# GANs require this.
# if t.requires_grad:
# warn_or_err("input data requires grad. Since input data is not a model parameter,\n"
# "its gradients will not be properly allreduced by DDP.")
if t.is_floating_point():
return t.to(dtype)
return t
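# Illustrative use (as done below in _initialize): functools.partial(to_type, torch.float16) builds a
# caster that converts floating-point CUDA tensors to FP16 and passes integer tensors through unchanged.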
......@@ -34,6 +37,8 @@ def applier(value, fn):
return fn(value)
elif isinstance(value, string_classes):
return value
elif isinstance(value, np.ndarray):
return value
elif isinstance(value, container_abcs.Mapping):
return {applier(k, fn) : applier(v, fn) for k, v in value.items()}
elif isinstance(value, container_abcs.Iterable):
......@@ -70,18 +75,32 @@ def check_models(models):
def check_params_fp32(models):
for model in models:
for name, param in model.named_parameters():
if param.is_floating_point() and param.type() != "torch.cuda.FloatTensor":
warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
"When using amp.initialize, you do not need to call .half() on your model\n"
"before passing it, no matter what optimization level you choose.".format(
name, param.type()))
if param.is_floating_point():
if 'Half' in param.type():
warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
"When using amp.initialize, you do not need to call .half() on your model\n"
"before passing it, no matter what optimization level you choose.".format(
name, param.type()))
elif not param.is_cuda:
warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
"When using amp.initialize, you need to provide a model with parameters\n"
"located on a CUDA device before passing it no matter what optimization level\n"
"you chose. Use model.to('cuda') to use the default device.".format(
name, param.type()))
for name, buf in model.named_buffers():
if buf.is_floating_point() and buf.type() != "torch.cuda.FloatTensor":
warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
"When using amp.initialize, you do not need to call .half() on your model\n"
"before passing it, no matter what optimization level you choose.".format(
name, buf.type()))
if buf.is_floating_point():
if 'Half' in buf.type():
warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
"When using amp.initialize, you do not need to call .half() on your model\n"
"before passing it, no matter what optimization level you choose.".format(
name, buf.type()))
elif not buf.is_cuda:
warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
"When using amp.initialize, you need to provide a model with buffers\n"
"located on a CUDA device before passing it no matter what optimization level\n"
"you chose. Use model.to('cuda') to use the default device.".format(
name, buf.type()))
def check_optimizers(optimizers):
......@@ -118,13 +137,15 @@ def wrap_fused_adam(optimizer, properties):
return FP16_Optimizer_for_fused(optimizer, static_loss_scale=properties.loss_scale)
def _initialize(models, optimizers, properties):
def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None):
from apex.parallel import DistributedDataParallel as apex_DDP
from .amp import init as amp_init
optimizers_was_list = False
if isinstance(optimizers, torch.optim.Optimizer):
optimizers_was_list = False
optimizers = [optimizers]
elif optimizers is None:
optimizers = []
elif isinstance(optimizers, list):
optimizers_was_list = True
else:
......@@ -140,8 +161,9 @@ def _initialize(models, optimizers, properties):
check_models(models)
check_params_fp32(models)
if not _amp_state.allow_incoming_model_not_fp32:
check_params_fp32(models)
check_optimizers(optimizers)
# In the future, when FP16_Optimizer can be deprecated and master weights can
......@@ -155,41 +177,54 @@ def _initialize(models, optimizers, properties):
for model in models:
model.to(properties.cast_model_type)
caster = functools.partial(to_type, properties.cast_model_type)
input_caster = functools.partial(to_type, properties.cast_model_type)
if cast_model_outputs is not None:
output_caster = functools.partial(to_type, cast_model_outputs)
else:
output_caster = functools.partial(to_type, torch.float32)
# Patch the forward method to cast incoming data to the correct type.
# I like writing things explicitly more than decorators.
def patch_forward(old_fwd):
def new_fwd(*args, **kwargs):
return old_fwd(*applier(args, caster),
**applier(kwargs, caster))
return new_fwd
for model in models:
# Patch the forward method to cast incoming data to the correct type, and
# outgoing data to float32, so "the user never needs to call .half()."
# I like writing things explicitly more than decorators.
def patch_forward(old_fwd):
def new_fwd(*args, **kwargs):
output = old_fwd(*applier(args, input_caster),
**applier(kwargs, input_caster))
return applier(output, output_caster)
return new_fwd
model.forward = patch_forward(model.forward)
model.forward = patch_forward(model.forward)
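# Net effect (for reference): with cast_model_type=torch.float16, a call like
#   output = model(fp32_input)
# runs the forward pass in FP16 while accepting FP32 inputs and returning FP32 outputs
# (or cast_model_outputs, if that was specified).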
# State dict trick to recast any preexisting per-param state tensors
for optimizer in optimizers:
optimizer.load_state_dict(optimizer.state_dict())
elif cast_model_outputs is not None:
output_caster = functools.partial(to_type, cast_model_outputs)
for model in models:
def patch_forward(old_fwd):
def new_fwd(*args, **kwargs):
output = old_fwd(*args, **kwargs)
return applier(output, output_caster)
return new_fwd
model.forward = patch_forward(model.forward)
for i, optimizer in enumerate(optimizers):
# Still need to special case this for the first pass
if isinstance(optimizer, FusedAdam):
optimizers[i] = wrap_fused_adam(optimizer, properties)
else:
optimizers[i] = _process_optimizer(optimizer, properties)
if properties.master_weights:
for i, optimizer in enumerate(optimizers):
if isinstance(optimizer, FusedAdam):
optimizers[i] = wrap_fused_adam(optimizer, properties)
if properties.loss_scale == "dynamic":
optimizers[i] = FP16_Optimizer_general(optimizer,
dynamic_loss_scale=True,
verbose=False)
else:
optimizers[i] = FP16_Optimizer_general(optimizer,
static_loss_scale=properties.loss_scale,
verbose=False)
else:
for optimizer in optimizers:
optimizer.loss_scaler = LossScaler(properties.loss_scale)
_amp_state.loss_scalers = []
for _ in range(num_losses):
_amp_state.loss_scalers.append(LossScaler(properties.loss_scale))
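# One scaler per anticipated loss: amp.scale_loss(loss, optimizer, loss_id=i) indexes into this list,
# so each loss/backward pass can maintain its own (possibly dynamic) loss scale.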
if properties.patch_torch_functions:
# handle is unused here. It's accessible later through a global value anyway.
handle = amp_init(loss_scale=properties.loss_scale)
handle = amp_init(loss_scale=properties.loss_scale, verbose=(_amp_state.verbosity == 2))
for optimizer in optimizers:
# Disable Amp casting for the optimizer step, because it should only be
# applied to FP32 master params anyway.
......@@ -209,6 +244,12 @@ def _initialize(models, optimizers, properties):
return models[0], optimizers
else:
if models_was_list:
return models, optimizers[0]
if len(optimizers) == 0:
return models
else:
return models, optimizers[0]
else:
return models[0], optimizers[0]
if len(optimizers) == 0:
return models[0]
else:
return models[0], optimizers[0]
import types
from ..fp16_utils import master_params_to_model_params
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import maybe_print
import torch
class AmpOptimizerState(object):
def __init__(self):
pass
def lazy_init_with_master_weights(self):
stash = self._amp_stash
stash.fp16_groups = []
stash.fp32_from_fp16_groups = []
stash.fp32_from_fp32_groups = []
for i, param_group in enumerate(self.param_groups):
# maybe_print("FP16_Optimizer processing param group {}:".format(i))
fp16_params_this_group = []
fp32_params_this_group = []
fp32_from_fp16_params_this_group = []
for i, param in enumerate(param_group['params']):
if param.requires_grad:
if param.type() == 'torch.cuda.HalfTensor':
# maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
# .format(param.size()))
fp16_params_this_group.append(param)
master_param = param.detach().clone().float()
master_param.requires_grad = True
param_group['params'][i] = master_param
fp32_from_fp16_params_this_group.append(master_param)
# Reset existing state dict key to the new master param.
# We still need to recast per-param state tensors, if any, to FP32.
if param in self.state:
self.state[master_param] = self.state.pop(param)
elif param.type() == 'torch.cuda.FloatTensor':
# maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
# .format(param.size()))
fp32_params_this_group.append(param)
param_group['params'][i] = param
else:
raise TypeError("Optimizer's parameters must be either "
"torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
"Received {}".format(param.type()))
stash.fp16_groups.append(fp16_params_this_group)
stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
stash.fp32_from_fp32_groups.append(fp32_params_this_group)
stash.all_fp16_params = []
for group in stash.fp16_groups:
stash.all_fp16_params += group
stash.all_fp32_from_fp16_params = []
for group in stash.fp32_from_fp16_groups:
stash.all_fp32_from_fp16_params += group
stash.all_fp32_from_fp32_params = []
for group in stash.fp32_from_fp32_groups:
stash.all_fp32_from_fp32_params += group
# stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params]
for param in stash.all_fp32_from_fp16_params:
param.grad = None
for param in stash.all_fp32_from_fp32_params:
param.grad = None
# Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
self.load_state_dict(self.state_dict())
def prepare_backward_with_master_weights(self):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
for i, param in enumerate(stash.all_fp16_params):
# Set up to leverage grad copy elision:
param.grad = None
# for i, param in enumerate(stash.all_fp32_from_fp16_params):
# stash.all_fp32_from_fp16_grad_stash[i] = param.grad
for i, param in enumerate(stash.all_fp32_from_fp32_params):
stash.all_fp32_from_fp32_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
def post_backward_with_master_weights(self, scaler):
stash = self._amp_stash
# This is a lot of python overhead...
fp16_grads_needing_unscale = []
new_fp32_grads = []
fp16_grads_needing_unscale_with_stash = []
preexisting_fp32_grads = []
for fp16_param, fp32_param in zip(stash.all_fp16_params,
stash.all_fp32_from_fp16_params):
if fp16_param.grad is None and fp32_param.grad is not None:
continue
elif fp16_param.grad is not None and fp32_param.grad is None:
fp32_param.grad = torch.empty_like(fp32_param)
fp16_grads_needing_unscale.append(fp16_param.grad)
new_fp32_grads.append(fp32_param.grad)
elif fp16_param.grad is not None and fp32_param.grad is not None:
fp16_grads_needing_unscale_with_stash.append(fp16_param.grad)
preexisting_fp32_grads.append(fp32_param.grad)
else: # fp16_param.grad is None and fp32_param.grad is None:
continue
if len(fp16_grads_needing_unscale) > 0:
scaler.unscale(
fp16_grads_needing_unscale,
new_fp32_grads,
scaler.loss_scale(),
models_are_masters=False)
if len(fp16_grads_needing_unscale_with_stash) > 0:
scaler.unscale_with_stashed(
fp16_grads_needing_unscale_with_stash,
preexisting_fp32_grads,
preexisting_fp32_grads)
# fp32 params can be treated as they would be in the "no_master_weights" case.
grads_needing_unscale = []
grads_needing_unscale_with_stash = []
stashed = []
for param, stashed_grad in zip(stash.all_fp32_from_fp32_params,
stash.all_fp32_from_fp32_grad_stash):
if param.grad is None and stashed_grad is not None:
param.grad = stashed_grad
elif param.grad is not None and stashed_grad is None:
grads_needing_unscale.append(param.grad)
elif param.grad is not None and stashed_grad is not None:
grads_needing_unscale_with_stash.append(param.grad)
stashed.append(stashed_grad)
else: # param.grad is None and stashed_grad is None:
continue
if len(grads_needing_unscale) > 0:
scaler.unscale(
grads_needing_unscale,
grads_needing_unscale,
scaler.loss_scale(),
models_are_masters=True)
if len(grads_needing_unscale_with_stash) > 0:
scaler.unscale_with_stashed(
grads_needing_unscale_with_stash,
stashed,
grads_needing_unscale_with_stash)
# Clear the stash.
for i in range(len(stash.all_fp32_from_fp32_grad_stash)):
stash.all_fp32_from_fp32_grad_stash[i] = None
def lazy_init_no_master_weights(self):
stash = self._amp_stash
stash.all_fp16_params = []
stash.all_fp32_params = []
for i, param_group in enumerate(self.param_groups):
for i, param in enumerate(param_group['params']):
if param.type() == 'torch.cuda.HalfTensor':
stash.all_fp16_params.append(param)
elif param.type() == 'torch.cuda.FloatTensor':
stash.all_fp32_params.append(param)
else:
raise TypeError("Optimizer's parameters must be either "
"torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
"Received {}".format(param.type()))
stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
stash.all_fp32_grad_stash = [None for _ in stash.all_fp32_params]
def prepare_backward_no_master_weights(self):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
for i, param in enumerate(stash.all_fp16_params):
stash.all_fp16_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
for i, param in enumerate(stash.all_fp32_params):
stash.all_fp32_grad_stash[i] = param.grad
# Set up to leverage grad copy elision:
param.grad = None
def post_backward_no_master_weights(self, scaler):
stash = self._amp_stash
split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
(stash.all_fp32_params, stash.all_fp32_grad_stash))
for params, stashed_grads in split_types:
# This is a lot of python overhead...
grads_needing_unscale = []
grads_needing_unscale_with_stash = []
stashed = []
for param, stashed_grad in zip(params, stashed_grads):
if param.grad is None and stashed_grad is not None:
param.grad = stashed_grad
elif param.grad is not None and stashed_grad is None:
grads_needing_unscale.append(param.grad)
elif param.grad is not None and stashed_grad is not None:
grads_needing_unscale_with_stash.append(param.grad)
stashed.append(stashed_grad)
else: # param.grad is None and stashed_grad is None
continue
if len(grads_needing_unscale) > 0:
scaler.unscale(
grads_needing_unscale,
grads_needing_unscale,
scaler.loss_scale(),
models_are_masters=True)
if len(grads_needing_unscale_with_stash) > 0:
scaler.unscale_with_stashed(
grads_needing_unscale_with_stash,
stashed,
grads_needing_unscale_with_stash)
# Clear the stash.
for i in range(len(stashed_grads)):
stashed_grads[i] = None
def _master_params_to_model_params(self):
stash = self._amp_stash
if multi_tensor_applier.available:
if len(stash.all_fp16_params) > 0:
multi_tensor_applier(
stash.multi_tensor_scale,
stash.dummy_overflow_buf,
[stash.all_fp32_from_fp16_params, stash.all_fp16_params],
1.0)
else:
for fp16_group, fp32_from_fp16_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups):
master_params_to_model_params(fp16_group, fp32_from_fp16_group)
def _process_optimizer(optimizer, properties):
if hasattr(optimizer, "_amp_stash"):
raise RuntimeError("A given optimizer should only be passed through amp.initialize once.")
else:
optimizer._amp_stash = AmpOptimizerState()
optimizer._amp_stash.lazy_init_called = False
optimizer._amp_stash.already_patched = False
optimizer._amp_stash.params_have_scaled_gradients = False
for name in ("_lazy_init_maybe_master_weights",
"_master_params_to_model_params",
"_prepare_amp_backward",
"_post_amp_backward"):
if hasattr(optimizer, name):
raise RuntimeError("Incoming optimizer already has {} defined.".format(name))
# TODO: Centralize exposure and import error checking for the C backend.
if multi_tensor_applier.available:
import amp_C
optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale
optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0]);
if properties.master_weights:
optimizer._lazy_init_maybe_master_weights = types.MethodType(
lazy_init_with_master_weights, optimizer)
optimizer._master_params_to_model_params = types.MethodType(
_master_params_to_model_params, optimizer)
old_step = optimizer.step
def new_step(self):
retval = old_step()
self._master_params_to_model_params()
# Clear the master grads that wouldn't be zeroed by model.zero_grad()
for param in self._amp_stash.all_fp32_from_fp16_params:
param.grad = None
return retval
optimizer.step = types.MethodType(new_step, optimizer)
old_zero_grad = optimizer.zero_grad
def new_zero_grad(self):
stash = self._amp_stash
if not stash.lazy_init_called:
self._lazy_init_maybe_master_weights()
stash.lazy_init_called = True
# Zero the model grads.
for param in stash.all_fp16_params:
if param.grad is not None:
param.grad.detach_()
param.grad.zero_()
for param in stash.all_fp32_from_fp32_params:
if param.grad is not None:
param.grad.detach_()
param.grad.zero_()
# Clear the master grads that are independent of model grads
for param in self._amp_stash.all_fp32_from_fp16_params:
param.grad = None
optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_with_master_weights, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_with_master_weights, optimizer)
else:
optimizer._lazy_init_maybe_master_weights = types.MethodType(
lazy_init_no_master_weights, optimizer)
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_no_master_weights, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_no_master_weights, optimizer)
return optimizer
import torch
from ._initialize import _initialize
from ._amp_state import _amp_state, warn_or_err
from ._amp_state import _amp_state, warn_or_err, maybe_print
class Properties(object):
......@@ -53,12 +53,13 @@ class Properties(object):
# print("setting {} {}".format(name, value))
if name == "cast_model_type":
if self.opt_level == "O1" and value is not None:
if value is not torch.float32:
warn_or_err("O1 inserts casts around Torch functions rather than "
"model weights, so with O1, the model weights themselves "
"should remain FP32. If you wish to cast the model to a "
"different type, use opt_level='O2' or 'O3'. " +
"cast_model_type was {}".format(value))
if value is not False:
if value is not torch.float32:
warn_or_err("O1 inserts casts around Torch functions rather than "
"model weights, so with O1, the model weights themselves "
"should remain FP32. If you wish to cast the model to a "
"different type, use opt_level='O2' or 'O3'. " +
"cast_model_type was {}".format(value))
self.options[name] = value
elif name == "patch_torch_functions":
if self.opt_level != "O1" and value:
......@@ -192,34 +193,45 @@ opt_levels = {"O3": O3(),
# allow user to directly pass Properties struct as well?
def initialize(
models,
optimizers,
optimizers=None,
enabled=True,
opt_level=None,
cast_model_type=None,
patch_torch_functions=None,
keep_batchnorm_fp32=None,
master_weights=None,
loss_scale=None
loss_scale=None,
cast_model_outputs=None,
num_losses=1,
verbosity=1,
):
"""
Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
chosen ``opt_level`` and overridden properties, if any.
``amp.initialize`` must be called **after** you have finished constructing your model(s) and
``amp.initialize`` should be called **after** you have finished
constructing your model(s) and
optimizer(s), but **before** you send your model through any DistributedDataParallel wrapper.
See `Distributed training`_ in the Imagenet example.
Currently, ``amp.initialize`` should only be called **once**,
although it can process an arbitrary number of
models and optimizers (see the corresponding `Advanced Amp Usage topic`_).
If you think your use case requires ``amp.initialize`` to be called more than once,
`let us know`_.
Any property keyword argument that is not ``None`` will be interpreted as a manual override.
To prevent having to rewrite anything else in your script, name the returned models/optimizers
to replace the passed models/optimizers, as in the Usage below.
to replace the passed models/optimizers, as in the code sample below.
Args:
models (torch.nn.Module or list of torch.nn.Modules): Models to modify/cast.
optimizers (torch.optim.Optimizer or list of torch.optim.Optimizers): Optimizers to modify/cast.
optimizers (optional, torch.optim.Optimizer or list of torch.optim.Optimizers): Optimizers to modify/cast.
REQUIRED for training, optional for inference.
enabled (bool, optional, default=True): If False, renders all Amp calls no-ops, so your script
should run as if Amp were not present.
opt_level(str, required): Pure or mixed precision optimization level. Accepted values are
opt_level (str, required): Pure or mixed precision optimization level. Accepted values are
"O0", "O1", "O2", and "O3", explained in detail above.
cast_model_type (``torch.dtype``, optional, default=None): Optional property override, see
above.
......@@ -227,15 +239,25 @@ def initialize(
keep_batchnorm_fp32 (bool or str, optional, default=None): Optional property override. If
passed as a string, must be the string "True" or "False".
master_weights (bool, optional, default=None): Optional property override.
loss_scale(float or str, default=None): Optional property override. If passed as a string,
loss_scale (float or str, optional, default=None): Optional property override. If passed as a string,
must be a string representing a number, e.g., "128.0", or the string "dynamic".
cast_model_outputs (torch.dtype, optional, default=None): Option to ensure that the outputs
of your model(s) are always cast to a particular type regardless of ``opt_level``.
num_losses (int, optional, default=1): Option to tell Amp in advance how many losses/backward
passes you plan to use. When used in conjunction with the ``loss_id`` argument to
``amp.scale_loss``, enables Amp to use a different loss scale per loss/backward pass,
which can improve stability. See "Multiple models/optimizers/losses"
under `Advanced Amp Usage`_ for examples. If ``num_losses`` is left to 1, Amp will still
support multiple losses/backward passes, but use a single global loss scale
for all of them.
verbosity (int, default=1): Set to 0 to suppress Amp-related output.
Returns:
Model(s) and optimizer(s) modified according to the ``opt_level``.
If either the ``models`` or ``optimizers`` args were lists, the corresponding return value will
also be a list.
Usage::
Permissible invocations::
model, optim = amp.initialize(model, optim,...)
model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
......@@ -265,9 +287,20 @@ def initialize(
.. _`Imagenet example`:
https://github.com/NVIDIA/apex/tree/master/examples/imagenet
.. _`Advanced Amp Usage`:
https://nvidia.github.io/apex/advanced.html
.. _`Advanced Amp Usage topic`:
https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses
.. _`let us know`:
https://github.com/NVIDIA/apex/issues
"""
_amp_state.opt_properties = Properties()
_amp_state.verbosity = verbosity
if not enabled:
_amp_state.opt_properties = Properties()
return models, optimizers
if opt_level not in opt_levels:
......@@ -275,16 +308,15 @@ def initialize(
"Unexpected optimization level {}. ".format(opt_level) +
"Options are 'O0', 'O1', 'O2', 'O3'.")
else:
_amp_state.opt_properties = opt_levels[opt_level](Properties())
print("Selected optimization level {}".format(opt_levels[opt_level].brief))
print("Defaults for this optimization level are:")
print(_amp_state.opt_properties.options)
_amp_state.opt_properties = opt_levels[opt_level](_amp_state.opt_properties)
maybe_print("Selected optimization level {}".format(opt_levels[opt_level].brief), True)
maybe_print("Defaults for this optimization level are:", True)
for k, v in _amp_state.opt_properties.options.items():
print("{:22} : {}".format(k, v))
maybe_print("{:22} : {}".format(k, v), True)
print("Processing user overrides (additional kwargs that are not None)...")
# I chose to have the keyword arguments listed directly in the argument list, so I
# can't use kwargs.items() here.
maybe_print("Processing user overrides (additional kwargs that are not None)...", True)
# I chose to have the keyword arguments listed directly in the argument list,
# instead of **kwargs, so I can't use kwargs.items() here.
if enabled is not None:
_amp_state.opt_properties.enabled = enabled
if opt_level is not None:
......@@ -300,11 +332,11 @@ def initialize(
if loss_scale is not None:
_amp_state.opt_properties.loss_scale = loss_scale
print("After processing overrides, optimization options are:")
maybe_print("After processing overrides, optimization options are:", True)
for k, v in _amp_state.opt_properties.options.items():
print("{:22} : {}".format(k, v))
maybe_print("{:22} : {}".format(k, v), True)
return _initialize(models, optimizers, _amp_state.opt_properties)
return _initialize(models, optimizers, _amp_state.opt_properties, num_losses, cast_model_outputs)
# TODO: is this necessary/useful?
......
import contextlib
import logging
import warnings
import torch
from . import utils
from .opt import OptimWrapper
from .scaler import LossScaler
from ._amp_state import _amp_state, master_params
from ._amp_state import _amp_state, master_params, maybe_print
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
......@@ -14,7 +13,8 @@ from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
# There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls.
@contextlib.contextmanager
def scale_loss(loss,
optimizer,
optimizers,
loss_id=0,
model=None,
delay_unscale=False):
"""
......@@ -38,41 +38,60 @@ def scale_loss(loss,
unscaled. The direct ``.grad`` attributes of any FP16
model params will remain scaled after context manager exit.
This subtlety affects gradient clipping. See "Gradient clipping" under
"Advanced use cases" for best practices.
`Advanced Amp Usage`_ for best practices.
Args:
loss(Tensor): Typically a scalar Tensor. The ``scaled_loss`` that the context
manager yields is simply ``loss.float()*loss_scale``, so in principle
``loss`` could have more than one element, as long as you call
``backward()`` on ``scaled_loss`` appropriately within the context manager body.
optimizer: Must be an optimizer returned from an earlier call to ``amp.initialize``.
optimizers: All optimizer(s) for which the current backward pass is creating gradients.
Must be an optimizer or list of optimizers returned from an earlier call
to ``amp.initialize``. For example use with multiple optimizers, see
"Multiple models/optimizers/losses" under `Advanced Amp Usage`_.
loss_id(int, optional, default=0): When used in conjunction with the ``num_losses`` argument
to ``amp.initialize``, enables Amp to use a different loss scale per loss. ``loss_id``
must be an integer between 0 and ``num_losses`` that tells Amp which loss is
being used for the current backward pass. See "Multiple models/optimizers/losses"
under `Advanced Amp Usage`_ for examples. If ``loss_id`` is left unspecified, Amp
will use the default global loss scaler for this backward pass.
model(torch.nn.Module, optional, default=None): Currently unused, reserved to enable future
optimizations.
delay_unscale(bool, default=False): Don't unscale the gradients or perform model->master
gradient copies on context manager exit. "Advanced use cases" illustrates
situations where this is necessary.
.. warning::If ``True``, ``optimizer.step()`` cannot be
called yet after context manager exit, and must wait for another, later backward context
manager invocation with ``delay_unscale`` left to False.
See "Advanced use cases" for examples.
delay_unscale(bool, optional, default=False): ``delay_unscale`` is never necessary, and
the default value of ``False`` is strongly recommended.
If ``True``, Amp will not unscale the gradients or perform model->master
gradient copies on context manager exit.
``delay_unscale=True`` is a minor ninja performance optimization and can result
in weird gotchas (especially with multiple models/optimizers/losses),
so only use it if you know what you're doing.
"Gradient accumulation across iterations" under `Advanced Amp Usage`_
illustrates a situation where this CAN (but does not need to) be used.
.. warning::
If ``delay_unscale`` is ``True`` for a given backward pass, ``optimizer.step()`` cannot be
called yet after context manager exit, and must wait for another, later backward context
manager invocation with ``delay_unscale`` left to False.
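For illustration, a two-loss script (with ``num_losses=2`` passed to ``amp.initialize``; the
loss and optimizer names below are placeholders) might look like::

    with amp.scale_loss(loss0, optimizer0, loss_id=0) as scaled_loss:
        scaled_loss.backward()
    with amp.scale_loss(loss1, optimizer1, loss_id=1) as scaled_loss:
        scaled_loss.backward()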
.. _`Advanced Amp Usage`:
https://nvidia.github.io/apex/advanced.html
"""
if not _amp_state.opt_properties.enabled:
yield loss
return
if optimizer.loss_scaler is None:
raise RuntimeError("optimizer passed to scale_loss does not have a loss_scaler.")
if isinstance(optimizers, torch.optim.Optimizer):
optimizers = [optimizers]
# This is what happens when I have to support tools from different sources under the same API...
# TODO: Rewrite FusedAdam to use multi-tensor apply and the same loss scaler.
if isinstance(optimizer, FP16_Optimizer_for_fused):
loss_scale = optimizer.cur_scale
if isinstance(optimizers, FP16_Optimizer_for_fused):
loss_scale = optimizers.cur_scale
else:
loss_scale = optimizer.loss_scaler.loss_scale()
loss_scaler = _amp_state.loss_scalers[loss_id]
loss_scale = loss_scaler.loss_scale()
if ((not _amp_state.opt_properties.master_weights)
and (not optimizer.loss_scaler.dynamic)
and (not loss_scaler.dynamic)
and loss_scale == 1.0):
yield loss.float()
# Needing to drop the cache here as well is an ugly gotcha.
......@@ -82,35 +101,48 @@ def scale_loss(loss,
_amp_state.handle._clear_cache()
return
if not delay_unscale:
if isinstance(optimizers, list):
for optimizer in optimizers:
if not optimizer._amp_stash.params_have_scaled_gradients:
optimizer._prepare_amp_backward()
yield (loss.float())*loss_scale
# this isn't pretty but it unifies things. Once I deprecate the old API entirely,
# I will have freedom to clean this up. Maybe instead of wrapping optimizers,
# I can simply construct a set of attributes (e.g. master params) and assign them
# directly to optimizer instances.
if not delay_unscale:
# The FP16_Optimizer for FusedAdam will take care of unscaling as part of
# its step() method.
if not isinstance(optimizer, FP16_Optimizer_for_fused):
if isinstance(optimizer, FP16_Optimizer_general):
optimizer.update_master_grads()
else:
optimizer.loss_scaler.clear_overflow_state()
optimizer.loss_scaler.unscale(
master_params(optimizer),
master_params(optimizer),
loss_scale)
# For future fused optimizers that enable sync-free dynamic loss scaling,
# should_skip will always be False.
should_skip = optimizer.loss_scaler.update_scale()
if should_skip:
optimizer_step = optimizer.step
def skip_step():
logger = logging.getLogger('apex.amp')
logger.warning("Gradient overflow. Skipping step, reducing " +
"loss scale to {}".format(optimizer.loss_scaler.loss_scale()))
optimizer.step = optimizer_step
optimizer.step = skip_step
if delay_unscale:
for optimizer in optimizers:
optimizer._amp_stash.params_have_scaled_gradients = True
else:
# FusedAdam and FusedSGD will take care of unscaling as part of their step() methods.
if not isinstance(optimizers, FP16_Optimizer_for_fused):
loss_scaler.clear_overflow_state()
for optimizer in optimizers:
optimizer._post_amp_backward(loss_scaler)
optimizer._amp_stash.params_have_scaled_gradients = False
# For future fused optimizers that enable sync-free dynamic loss scaling,
# should_skip will always be False.
should_skip = loss_scaler.update_scale()
if should_skip:
for optimizer in optimizers:
if not optimizer._amp_stash.already_patched:
# Close on loss_scaler and loss_id as well, to be safe. Probably not
# necessary because amp.scale_loss is already creating a temporary scope.
def patch_step(opt, loss_scaler, loss_id):
opt_step = opt.step
def skip_step():
maybe_print(("Gradient overflow. Skipping step, loss scaler " +
"{} reducing loss scale to {}").format(loss_id,
loss_scaler.loss_scale()))
if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"):
# Clear the master grads that wouldn't be zeroed by model.zero_grad()
for param in opt._amp_stash.all_fp32_from_fp16_params:
param.grad = None
opt.step = opt_step
opt._amp_stash.already_patched = False
return skip_step
optimizer.step = patch_step(optimizer, loss_scaler, loss_id)
optimizer._amp_stash.already_patched = True
# Probably ok to skip this if not delay_unscale
if _amp_state.opt_properties.patch_torch_functions:
_amp_state.handle._clear_cache()
......@@ -149,6 +181,10 @@ class AmpHandle(object):
@contextlib.contextmanager
def scale_loss(self, loss, optimizer):
raise RuntimeError("The old Amp API is no longer supported. Please move to the new API, "
"documented here: https://nvidia.github.io/apex/amp.html. Transition guide: "
"https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users")
if not self.is_active():
yield loss
return
......@@ -171,8 +207,7 @@ class AmpHandle(object):
if should_skip:
optimizer_step = optimizer.step
def skip_step():
logger = logging.getLogger('apex.amp')
logger.warning('Gradient overflow, skipping update')
maybe_print('Gradient overflow, skipping update')
optimizer.step = optimizer_step
optimizer.step = skip_step
......
......@@ -27,6 +27,10 @@ FP16_FUNCS = [
]
FP32_FUNCS = [
# Interpolation/Upsampling
'interpolate',
# Pointwise
'softplus',
'softmin',
......
import torch
from .. import utils
MODULE = torch
FP16_FUNCS = [
# Math
# TODO: why are these in top-level torch namespace?
# Low level functions wrapped by torch.nn layers.
# The wrapper layers contain the weights which are then passed in as a parameter
# to these functions.
'conv1d',
'conv2d',
'conv3d',
......@@ -12,6 +15,7 @@ FP16_FUNCS = [
'conv_transpose2d',
'conv_transpose3d',
'conv_tbc',
'prelu',
# BLAS
'addmm',
......@@ -20,10 +24,8 @@ FP16_FUNCS = [
'matmul',
'mm',
'mv',
]
# TODO: ban in-place versions of these in fp16
FP32_FUNCS = [
# Pointwise
'acos',
......@@ -54,15 +56,21 @@ FP32_FUNCS = [
'sum',
'var',
# Special reduction-like BLAS
'addbmm',
'baddbmm',
'bmm',
# Misc
'renorm'
]
# Before CUDA 9.1, batched matmul was missing fast FP16 kernels. We
# check the CUDA version -- if at least 9.1, then put the bmm
# functions on the fp16 list. Otherwise, put them on the fp32 list.
_bmms = ['addbmm',
'baddbmm',
'bmm']
if utils.get_cuda_version() >= (9, 1, 0):
FP16_FUNCS.extend(_bmms)
else:
FP32_FUNCS.extend(_bmms)
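# For example, torch.version.cuda == "10.0" parses to (10, 0), which compares >= (9, 1, 0),
# so on CUDA 10 the batched matmul functions stay on the FP16 list.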
# Multi-tensor fns that may need type promotion
CASTS = [
# Multi-tensor math
......@@ -86,8 +94,9 @@ CASTS = [
'ne'
]
# Will possibly need to promote *all* elements of `seq`
# Functions that take sequence arguments. We need to inspect the whole
# sequence and cast to the widest type.
SEQUENCE_CASTS = [
'cat', # torch.cat(seq, dim=0, out=None)
'stack' # torch.stack(seq, dim=0, out=None)
'cat',
'stack'
]
import contextlib
import logging
import warnings
from .scaler import LossScaler, master_params
from ._amp_state import maybe_print
import numpy as np
......@@ -71,8 +71,7 @@ class OptimWrapper(object):
'The `closure` argument is unsupported by the amp ' +
'optimizer wrapper.')
if any(self._skip_next):
logger = logging.getLogger('apex.amp')
logger.info('Gradient overflow, skipping update')
maybe_print('Gradient overflow, skipping update')
self._skip_next = [False] * self._num_loss
else:
return self._optimizer.step(closure=closure)
......
import torch
import logging
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import _amp_state, master_params
from ._amp_state import _amp_state, master_params, maybe_print
from itertools import product
# from apex_C import scale_check_overflow
def scale_check_overflow_python(model_grad, scale, master_grad, check_overflow=False):
def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
# Exception handling for 18.04 compatibility
if check_overflow:
cpu_sum = float(model_grad.float().sum())
......@@ -19,6 +16,21 @@ def scale_check_overflow_python(model_grad, scale, master_grad, check_overflow=F
master_grad.mul_(scale)
return False
def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, scale, check_overflow=False):
# Exception handling for 18.04 compatibility
if check_overflow:
cpu_sum = float(model_grad.float().sum())
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
return True
# if master_grad is not model_grad: # copy_ probably internally short-circuits this
# master_grad.copy_(model_grad)
assert stashed_grad.dtype == master_grad.dtype
converted_model_grad = model_grad.to(master_grad.dtype)
stashed_grad.add_(scale, converted_model_grad)
master_grad.data = stashed_grad.data
return False
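# Net effect (for reference): master_grad ends up holding stashed_grad + model_grad * scale,
# i.e. the previously stashed (already-unscaled) gradient plus the newly unscaled contribution,
# since the caller passes scale = 1./loss_scale.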
class LossScaler(object):
warned_no_fused_kernel = False
warned_unscaling_non_fp32_grad = False
......@@ -44,121 +56,139 @@ class LossScaler(object):
import amp_C
LossScaler.has_fused_kernel = multi_tensor_applier.available
LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale
LossScaler.multi_tensor_axpby_cuda = amp_C.multi_tensor_axpby
else:
if not LossScaler.warned_no_fused_kernel:
print("Warning: multi_tensor_applier fused unscale kernel is unavailable, "
"possibly because apex was installed without --cuda_ext --cpp_ext. "
"Using Python fallback. Original ImportError was: ",
multi_tensor_applier.import_err)
maybe_print(
"Warning: multi_tensor_applier fused unscale kernel is unavailable, "
"possibly because apex was installed without --cuda_ext --cpp_ext. "
"Using Python fallback. Original ImportError was: " +
repr(multi_tensor_applier.import_err),
True)
LossScaler.has_fused_kernel = False
LossScaler.warned_no_fused_kernel = True
def loss_scale(self):
return self._loss_scale
def unscale_grads_python(self, model_grads, master_grads, scale):
def unscale_python(self, model_grads, master_grads, scale):
for model, master in zip(model_grads, master_grads):
if model is not None:
if not LossScaler.warned_unscaling_non_fp32_grad:
if master.type() != "torch.cuda.FloatTensor":
logger = logging.getLogger("apex.amp")
logger.warning(
if master.dtype != torch.float32:
maybe_print(
"Attempting to unscale a grad with type {} ".format(master.type()) +
"Unscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
LossScaler.warned_unscaling_non_fp32_grad = True
self._has_overflow = scale_check_overflow_python(
model,
1./scale,
master,
self.dynamic)
self._has_overflow = scale_check_overflow_python(model,
master,
1./scale,
self.dynamic)
if self._has_overflow and self.dynamic:
break
def clear_overflow_state(self):
self._has_overflow = False
if self.has_fused_kernel:
self._overflow_buf.zero_()
def unscale(self, model_params, master_params, scale):
# unused_scale keeps some of the old API alive for hopefully a short time.
def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False):
if self._has_overflow:
return
# Lots of defensive list processing going on here. Way less efficient than
# consuming the iterator directly. Need to examine Python overhead.
model_master_params = [(model, master) for model, master
in zip(model_params, master_params)] # some of these may be None
scale = self._loss_scale
if scale == 1.0 and models_are_masters and not self.dynamic:
return
if LossScaler.has_fused_kernel:
# TODO: Make these lists permanent attributes of self, so they don't need to be created
# or garbage collected. Profiler shows that garbage collection overhead may be
# substantial (200-300 usec).
# This may be tricky because right now the lists need to be packed densely.
# Maybe this could be handled within the multi_tensor_apply wrapper
# (allow some Tensors to be None using at::optional).
src_dst_pairs = {torch.float16 : {torch.float16 : [[],[]], torch.float32 : [[],[]]},
torch.float32 : {torch.float16 : [[],[]], torch.float32 : [[],[]]}}
for model, master in model_master_params:
# Sync the None-ness of model and master params
if model.grad is None and master.grad is not None:
master.grad = None
if model.grad is not None and master.grad is None:
master.grad = torch.empty_like(master)
if model.grad is not None:
if model.grad is master.grad and scale == 1.0 and not self.dynamic:
continue
else:
src_dst_pairs[model.dtype][master.dtype][0].append(model.grad.data)
src_dst_pairs[model.dtype][master.dtype][1].append(master.grad.data)
assert len(src_dst_pairs[torch.float32][torch.float16][0]) == 0, "The loss scaler is "\
"being asked to unscale FP32 model gradients into FP16 master gradients. This is "\
"almost certainly an error."
for src, dst in product((torch.float16, torch.float32),
(torch.float16, torch.float32)):
if len(src_dst_pairs[src][dst][0]) > 0:
if not LossScaler.warned_unscaling_non_fp32_grad and dst is torch.float16:
print("Warning: unscaling grads that are not FP32. "
"Unscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
# Setting this to True unconditionally allows the possibility of an escape
# if never-before-seen non-fp32 grads are created in some later iteration.
# if (not LossScaler.warned_unscaling_non_fp32_grad
# and master_grads[0].dtype == torch.float16):
# print("Warning: unscaling grads that are not FP32. "
# "Unscaling non-fp32 grads may indicate an error. "
# "When using Amp, you don't need to call .half() on your model.")
# # Setting this to True unconditionally allows the possibility of an escape
# # if never-before-seen non-fp32 grads are created in some later iteration.
# LossScaler.warned_unscaling_non_fp32_grad = True
multi_tensor_applier(LossScaler.multi_tensor_scale_cuda,
self._overflow_buf,
[model_grads, master_grads],
1./scale)
else:
self.unscale_python(model_grads, master_grads, scale)
# Defer to update_scale
# If the fused kernel is available, we only need one D2H memcopy and sync.
# if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
# self._has_overflow = self._overflow_buf.item()
def unscale_with_stashed_python(self,
model_grads,
stashed_master_grads,
master_grads,
scale):
for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads):
if model is None and stashed is None:
continue
else:
if not LossScaler.warned_unscaling_non_fp32_grad:
if master.dtype != torch.float32:
maybe_print(
"Attempting to unscale a grad with type {} ".format(master.type()) +
"Unscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
LossScaler.warned_unscaling_non_fp32_grad = True
multi_tensor_applier(
LossScaler.multi_tensor_scale_cuda,
self._overflow_buf,
src_dst_pairs[src][dst],
1./scale)
self._has_overflow = axpby_check_overflow_python(model,
stashed,
master,
1./scale,
self.dynamic)
if self._has_overflow and self.dynamic:
break
def unscale_with_stashed(self,
model_grads,
stashed_master_grads,
master_grads):
if self._has_overflow:
return
scale = self._loss_scale
if LossScaler.has_fused_kernel:
if (not LossScaler.warned_unscaling_non_fp32_grad
and master_grads[0].dtype == torch.float16):
print("Warning: unscaling grads that are not FP32. "
"Unscaling non-fp32 grads may indicate an error. "
"When using Amp, you don't need to call .half() on your model.")
# Setting this to True unconditionally allows the possibility of an escape
# if never-before-seen non-fp32 grads are created in some later iteration.
LossScaler.warned_unscaling_non_fp32_grad = True
multi_tensor_applier(LossScaler.multi_tensor_axpby_cuda,
self._overflow_buf,
[model_grads, stashed_master_grads, master_grads],
1./scale,
1.0,
0) # check only arg 0, aka the incoming model grads, for infs
else:
# Sync the None-ness of model and master params.
all_same = True
for model, master in model_master_params:
if model.grad is None and master.grad is not None:
master.grad = None
if model.grad is not None and master.grad is None:
master.grad = torch.empty_like(master)
if model.grad is not master.grad:
all_same = False
if scale == 1.0 and all_same and not self.dynamic:
return
# TODO: Make these lists permanent attributes of self, so they don't need to be created
# or garbage collected?
model_grads = [mmp[0].grad.data for mmp in model_master_params if mmp[0].grad is not None]
master_grads = [mmp[1].grad.data for mmp in model_master_params if mmp[1].grad is not None]
self.unscale_grads_python(model_grads, master_grads, scale)
self.unscale_with_stashed_python(model_grads,
stashed_master_grads,
master_grads,
scale)
# Defer to update_scale
# If the fused kernel is available, we only need one D2H memcopy and sync.
if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
self._has_overflow = self._overflow_buf.item()
# if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
# self._has_overflow = self._overflow_buf.item()
def clear_overflow_state(self):
self._has_overflow = False
if self.has_fused_kernel:
self._overflow_buf.zero_()
# Separate so unscale() can be called more than once before updating.
def update_scale(self):
# If the fused kernel is available, we only need one D2H memcopy and sync.
if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
self._has_overflow = self._overflow_buf.item()
if self._has_overflow and self.dynamic:
should_skip = True
self._loss_scale /= 2.
......
......@@ -5,6 +5,9 @@ import itertools
import torch
def get_cuda_version():
return tuple(int(x) for x in torch.version.cuda.split('.'))
def is_fp_tensor(x):
if is_nested(x):
# Fast-fail version of all(is_fp_tensor)
......
......@@ -4,6 +4,7 @@ from torch.autograd import Variable
from torch.nn.parameter import Parameter
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from ..amp._amp_state import _amp_state, maybe_print
from ..amp.scaler import LossScaler
from ..multi_tensor_apply import multi_tensor_applier
from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm
......@@ -193,6 +194,8 @@ class FP16_Optimizer(object):
self.multi_tensor_scale = amp_C.multi_tensor_scale
self._dummy_overflow_buf = torch.cuda.IntTensor([0]);
# Having self.maybe_print distinct from _amp_state.maybe_print is another artifact
# of having to support FP16_Optimizer separately, for the time being.
def maybe_print(self, msg):
if self.verbose:
print(msg)
......@@ -401,8 +404,9 @@ class FP16_Optimizer(object):
# self._update_scale(self.overflow)
if self.overflow:
print("Gradient overflow. Skipping step, reducing " +
"loss scale to {}".format(self.loss_scaler.loss_scale()))
# Using _amp_state.maybe_print instead of self.print here is intentional.
maybe_print("Gradient overflow. Skipping step, reducing " +
"loss scale to {}".format(self.loss_scaler.loss_scale()))
return
if closure is not None:
......@@ -536,18 +540,37 @@ class FP16_Optimizer(object):
if len(self.all_fp16_params) > 0:
# print("Model grads before")
# print([param.grad.data for param in self.all_fp16_params])
# I'm ONLY writing this as an incremental way to make some tests pass until
# I can refactor the tests as well.
# FP16_Optimizer should not be used by anyone.
model_grads = []
master_grads = []
for model_param, master_param in zip(self.all_fp16_params,
self.all_fp32_from_fp16_params):
if model_param.grad is not None:
model_grads.append(model_param.grad)
if master_param.grad is None:
master_param.grad = torch.empty_like(master_param)
master_grads.append(master_param.grad)
self.loss_scaler.unscale(
self.all_fp16_params,
self.all_fp32_from_fp16_params,
model_grads,
master_grads,
self.loss_scaler.loss_scale())
# print("Master grads after")
# print([param.grad.data for param in self.all_fp32_from_fp16_params])
if len(self.all_fp32_from_fp32_params) > 0:
model_grads = []
master_grads = []
for model_param, master_param in zip(self.all_fp32_from_fp32_params,
self.all_fp32_from_fp32_params):
if model_param.grad is not None:
model_grads.append(model_param.grad)
master_grads.append(master_param.grad)
# print("Model grads before")
# print([param.grad.data for param in self.all_fp32_from_fp32_params])
self.loss_scaler.unscale(
self.all_fp32_from_fp32_params,
self.all_fp32_from_fp32_params,
model_grads,
master_grads,
self.loss_scaler.loss_scale())
# print("Master grads after")
# print([param.grad.data for param in self.all_fp32_from_fp32_params])
......
......@@ -3,6 +3,7 @@ import torch
import numbers
from torch.nn.parameter import Parameter
from torch.nn import init
from torch.nn import functional as F
import importlib
class FusedLayerNormAffineFunction(torch.autograd.Function):
......@@ -144,6 +145,9 @@ class FusedLayerNorm(torch.nn.Module):
init.zeros_(self.bias)
def forward(self, input):
if not input.is_cuda:
return F.layer_norm(
input, self.normalized_shape, self.weight, self.bias, self.eps)
if self.elementwise_affine:
return FusedLayerNormAffineFunction(self.normalized_shape,self.eps)(
input, self.weight, self.bias)
......
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
import ctypes
stashed_err = None
try:
lib = ctypes.cdll.LoadLibrary(None)
lib.THCudaHalfTensor_normall.argtypes=[ctypes.c_void_p, ctypes.c_void_p]
lib.THCudaHalfTensor_normall.restype = ctypes.c_float
def fused_norm(input):
if input.type() == 'torch.cuda.HalfTensor':
# 16384 is half 2 if you stare at it long enough
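# (16384 == 0x4000, the fp16 bit pattern for 2.0, i.e. the p=2 norm order for normall.)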
return lib.THCudaHalfTensor_normall(torch.cuda._state_cdata,
input._cdata, 16384)
else:
return input.norm()
except TypeError as err:
stashed_err = err
def fused_norm(input):
raise RuntimeError("Failed to create fused_norm. This may happen on Windows "
"because of lib = ctypes.cdll.LoadLibrary(None): you can't "
"LoadLibrary with None. Original exception message was ",
stashed_err)
class FP16_Optimizer(object):
"""
:class:`FP16_Optimizer` A cutdown version of apex.fp16_utils.FP16_Optimizer.
Designed only to wrap apex.optimizers.FusedAdam.
Refer to apex.fp16_utils documents for more information.
Example::
model = torch.nn.Linear(D_in, D_out).cuda().half()
optimizer = apex.optimizers.FusedAdam(model.parameters())
# Name the FP16_Optimizer instance to replace the existing optimizer
# (recommended but not required):
optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
...
# loss.backward() becomes:
optimizer.backward(loss)
...
Example with dynamic loss scaling::
...
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
# optional arg to control dynamic loss scaling behavior
# dynamic_loss_args={'scale_window' : 500})
# Usually, dynamic_loss_args is not necessary.
"""
def __init__(self,
init_optimizer,
static_loss_scale=1.0,
dynamic_loss_scale=False,
dynamic_loss_args=None,
verbose=True):
# The fused optimizer does all the work. We need this layer for two reasons:
# 1. maintain same user API from apex.fp16_utils
# 2. keep common stuff here in case we need to add new fused optimizer later
# differences from apex.fp16_utils:
# - assume all model params in fp16
# - assume all params requires grad
# - flat by groups, not keeping state. TODO: remove state explicitly?
# - master grad and unflat master weight never exist. TODO: a way to save out unflat master?
if not torch.cuda.is_available():
raise SystemError("Cannot use fp16 without CUDA.")
self.optimizer = init_optimizer
# param flattened by groups
self.fp16_groups = []
self.fp16_groups_flat = []
self.fp32_groups_flat = []
# loop to deal with groups
for i, param_group in enumerate(self.optimizer.param_groups):
# push this group to list before modify
self.fp16_groups.append(param_group['params'])
# init fp16 weight buffer, flattened
self.fp16_groups_flat.append(_flatten_dense_tensors([p.clone().detach() for p in self.fp16_groups[i]]))
# set model fp16 weight to slices of flattened buffer
updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i])
for p,q in zip(self.fp16_groups[i], updated_params):
p.data = q.data
# init master weight, flattened
self.fp32_groups_flat.append(self.fp16_groups_flat[i].clone().float().detach())
# modify optimizer to have flat master weight
self.fp32_groups_flat[i].requires_grad = True # keep this in case internal optimizer uses it
param_group['params'] = [self.fp32_groups_flat[i]]
# we may have a way of fusing dynamic scale. Not supported for now
if dynamic_loss_scale:
if dynamic_loss_args is not None:
raise SystemError("Do not support dynamic loss scale args for now.")
self.dynamic_loss_scale = True
self.cur_scale = 2**16
self.cur_iter = 0
self.last_overflow_iter = -1
self.scale_factor = 2
self.scale_window = 1000
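# Dynamic loss scaling policy (see _update_scale): halve the scale (never below 1) on
# overflow, and double it again after scale_window consecutive overflow-free iterations.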
else:
self.dynamic_loss_scale = False
self.cur_iter = 0
self.cur_scale = static_loss_scale
def zero_grad(self, set_grads_to_None=True):
"""
Zero FP16 parameter grads.
"""
# FP32 grad should never exist.
# For speed, set model fp16 grad to None by default
for group in self.fp16_groups:
for p in group:
if set_grads_to_None:
p.grad = None
else:
if p.grad is not None:
p.grad.detach_()
p.grad.zero_()
def _compute_grad_norm(self, fp16_grads_flat, norm_type=2):
"""
Compute the fp16 grad norm for later clipping (fused with the update).
Accumulation happens internally in fp32.
The NaN check is also fused in. Other reductions of the grads may be needed later.
Args:
fp16_grads_flat (tensor): fp16 grad flattened
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
infinity norm.
Returns:
Total norm of the current fp16 gradients (viewed as a single vector).
Returns -1 if the most recently computed fp16 gradients overflowed
"""
# TODO: currently uses the pre-1.0 API, and is not the most efficient approach because of the copy to CPU and sync
# only the 2-norm is supported for now
norm = float(fused_norm(fp16_grads_flat))
if norm == float('inf') or norm == -float('inf') or norm != norm:
return -1
else:
return norm
def step(self, closure=None):
"""
Closures are not supported.
"""
# First compute the norm for all groups so we know if there is an overflow
grads_groups_flat = []
norm_groups = []
skip = False
for i, group in enumerate(self.fp16_groups):
grads_groups_flat.append(_flatten_dense_tensors([p.grad for p in group]))
norm_groups.append(self._compute_grad_norm(grads_groups_flat[i]))
if norm_groups[i] == -1: #TODO: early break
skip = True
if skip:
self._update_scale(skip)
return
# norm is in fact norm*cur_scale
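# The fused step consumes the flattened fp16 grads directly: `scale` lets the kernel unscale
# them, `grad_norms` supports the fused clipping mentioned in _compute_grad_norm, and
# `output_params` receives updated fp16 copies of the flat master weights.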
self.optimizer.step(grads=[[g] for g in grads_groups_flat],
output_params=[[p] for p in self.fp16_groups_flat],
scale=self.cur_scale,
grad_norms=norm_groups)
# TODO: we probably don't need this? just to be safe
for i in range(len(norm_groups)):
updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i])
for p,q in zip(self.fp16_groups[i], updated_params):
p.data = q.data
self._update_scale(False)
return
def backward(self, loss):
"""
:attr:`backward` performs the following steps:
1. fp32_loss = loss.float()
2. scaled_loss = fp32_loss*loss_scale
3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves
"""
scaled_loss = (loss.float()) * self.cur_scale
scaled_loss.backward()
def _update_scale(self, skip):
if self.dynamic_loss_scale:
if skip:
print("\nGrad overflow on iteration", self.cur_iter)
print("Using dynamic loss scale of", self.cur_scale)
self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
self.last_overflow_iter = self.cur_iter
else:
if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
self.cur_scale *= self.scale_factor
else:
if skip:
print("\nGrad overflow on iteration", self.cur_iter)
print("Using static loss scale of", self.cur_scale)
self.cur_iter +=1
return
# Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
def _get_state(self):
return self.optimizer.state
def _set_state(self, value):
self.optimizer.state = value
state = property(_get_state, _set_state)
# Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
# (for example, to adjust the learning rate)
def _get_param_groups(self):
return self.optimizer.param_groups
def _set_param_groups(self, value):
self.optimizer.param_groups = value
param_groups = property(_get_param_groups, _set_param_groups)
def state_dict(self):
"""
Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
of the contained Pytorch optimizer.
Example::
checkpoint = {}
checkpoint['model'] = model.state_dict()
checkpoint['optimizer'] = optimizer.state_dict()
torch.save(checkpoint, "saved.pth")
"""
state_dict = {}
state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
state_dict['cur_scale'] = self.cur_scale
state_dict['cur_iter'] = self.cur_iter
if state_dict['dynamic_loss_scale']:
state_dict['last_overflow_iter'] = self.last_overflow_iter
state_dict['scale_factor'] = self.scale_factor
state_dict['scale_window'] = self.scale_window
state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
state_dict['fp32_groups_flat'] = self.fp32_groups_flat
return state_dict
def load_state_dict(self, state_dict):
"""
Loads a state_dict created by an earlier call to state_dict().
If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
whose parameters in turn came from ``model``, it is expected that the user
will call ``model.load_state_dict()`` before
``fp16_optimizer_instance.load_state_dict()`` is called.
Example::
model = torch.nn.Linear(D_in, D_out).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
...
checkpoint = torch.load("saved.pth")
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
"""
# I think it should actually be ok to reload the optimizer before the model.
self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
self.cur_scale = state_dict['cur_scale']
self.cur_iter = state_dict['cur_iter']
if state_dict['dynamic_loss_scale']:
self.last_overflow_iter = state_dict['last_overflow_iter']
self.scale_factor = state_dict['scale_factor']
self.scale_window = state_dict['scale_window']
self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
# At this point, the optimizer's references to the model's fp32 parameters are up to date.
# The optimizer's hyperparameters and internal buffers are also up to date.
# However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
# out of date. There are two options.
# 1: Refresh the master params from the model's fp16 params.
# This requires less storage but incurs precision loss.
# 2: Save and restore the fp32 master copies separately.
# We choose option 2.
#
# Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
# of their associated parameters, because it's possible those buffers might not exist yet in
# the current optimizer instance. In our case, as long as the current FP16_Optimizer has been
# constructed in the same way as the one whose state_dict we are loading, the same master params
# are guaranteed to exist, so we can just copy_() from the saved master params.
for current, saved in zip(self.fp32_groups_flat, state_dict['fp32_groups_flat']):
current.data.copy_(saved.data)
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
class FP16_Optimizer(object):
"""
:class:`FP16_Optimizer` is a cut-down version of apex.fp16_utils.FP16_Optimizer,
designed only to wrap apex.optimizers.FusedAdam.
Refer to the apex.fp16_utils documentation for more information.
Example::
model = torch.nn.Linear(D_in, D_out).cuda().half()
optimizer = apex.optimizers.FusedAdam(model.parameters())
# Name the FP16_Optimizer instance to replace the existing optimizer
# (recommended but not required):
optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
...
# loss.backward() becomes:
optimizer.backward(loss)
...
Example with dynamic loss scaling::
...
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
# optional arg to control dynamic loss scaling behavior
# dynamic_loss_args={'scale_window' : 500})
# Usually, dynamic_loss_args is not necessary.
"""
def __init__(self,
init_optimizer,
static_loss_scale=1.0,
dynamic_loss_scale=False,
dynamic_loss_args=None,
verbose=True):
# The fused optimizer does all the work. We need this layer for two reasons:
# 1. maintain the same user API as apex.fp16_utils
# 2. keep common logic here in case we need to add new fused optimizers later
# differences from apex.fp16_utils:
# - assume all model params are in fp16
# - assume all params require grad
# - flat by groups, not keeping state. TODO: remove state explicitly?
# - master grads and an unflat master weight never exist. TODO: a way to save out unflat master?
if not torch.cuda.is_available():
raise SystemError("Cannot use fp16 without CUDA.")
self.optimizer = init_optimizer
# param flattened by groups
self.fp16_groups = []
self.fp16_groups_flat = []
self.fp32_groups_flat = []
# loop to deal with groups
for i, param_group in enumerate(self.optimizer.param_groups):
# push this group to the list before modifying it
self.fp16_groups.append(param_group['params'])
# init fp16 weight buffer, flattened
self.fp16_groups_flat.append(_flatten_dense_tensors([p.clone().detach() for p in self.fp16_groups[i]]))
# set model fp16 weight to slices of flattened buffer
updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i])
for p,q in zip(self.fp16_groups[i], updated_params):
p.data = q.data
# init master weight, flattened
self.fp32_groups_flat.append(self.fp16_groups_flat[i].clone().float().detach())
# modify the optimizer to have the flat master weight
self.fp32_groups_flat[i].requires_grad = True # keep this in case internal optimizer uses it
param_group['params'] = [self.fp32_groups_flat[i]]
# we may eventually fuse dynamic loss scaling into the kernel; it is not supported for now
if dynamic_loss_scale:
if dynamic_loss_args is not None:
raise SystemError("Do not support dynamic loss scale args for now.")
self.dynamic_loss_scale = True
self.cur_scale = 2**16
self.cur_iter = 0
self.last_overflow_iter = -1
self.scale_factor = 2
self.scale_window = 1000
else:
self.dynamic_loss_scale = False
self.cur_iter = 0
self.cur_scale = static_loss_scale
def zero_grad(self, set_grads_to_None=True):
"""
Zero FP16 parameter grads.
"""
# FP32 grad should never exist.
# For speed, set model fp16 grad to None by default
for group in self.fp16_groups:
for p in group:
if set_grads_to_None:
p.grad = None
else:
if p.grad is not None:
p.grad.detach_()
p.grad.zero_()
def _compute_grad_norm(self, fp16_grads_flat, norm_type=2):
"""
Compute the fp16 grad norm for later clipping (fused with the update).
Accumulation happens internally in fp32.
The NaN check is also fused in. Other reductions of the grads may be needed later.
Args:
fp16_grads_flat (tensor): fp16 grad flattened
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
infinity norm.
Returns:
Total norm of the current fp16 gradients (viewed as a single vector).
Returns -1 if the most recently computed fp16 gradients overflowed
"""
# TODO: not the most efficient approach because of the copy to CPU and sync
# only the 2-norm is supported for now
# for torch versions <= 1.0.1, torch.norm with a dtype argument fails, so fall back to casting
try:
norm = float(torch.norm(fp16_grads_flat, 2.0, dtype=torch.float32))
except TypeError as err:
norm = float(torch.norm(fp16_grads_flat.float(), 2.0))
if norm == float('inf') or norm == -float('inf') or norm != norm:
return -1
else:
return norm
def step(self, closure=None):
"""
Closures are not supported.
"""
# First compute the norm for all groups so we know if there is an overflow
grads_groups_flat = []
norm_groups = []
skip = False
for i, group in enumerate(self.fp16_groups):
grads_groups_flat.append(_flatten_dense_tensors([p.grad for p in group]))
norm_groups.append(self._compute_grad_norm(grads_groups_flat[i]))
if norm_groups[i] == -1: #TODO: early break
skip = True
if skip:
self._update_scale(skip)
return
# norm is in fact norm*cur_scale
self.optimizer.step(grads=[[g] for g in grads_groups_flat],
output_params=[[p] for p in self.fp16_groups_flat],
scale=self.cur_scale,
grad_norms=norm_groups)
# TODO: we probably don't need this? just to be safe
for i in range(len(norm_groups)):
updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i])
for p,q in zip(self.fp16_groups[i], updated_params):
p.data = q.data
self._update_scale(False)
return
def backward(self, loss):
"""
:attr:`backward` performs the following steps:
1. fp32_loss = loss.float()
2. scaled_loss = fp32_loss*loss_scale
3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves
"""
scaled_loss = (loss.float()) * self.cur_scale
scaled_loss.backward()
def _update_scale(self, skip):
if self.dynamic_loss_scale:
if skip:
print("\nGrad overflow on iteration", self.cur_iter)
print("Using dynamic loss scale of", self.cur_scale)
self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
self.last_overflow_iter = self.cur_iter
else:
if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
self.cur_scale *= self.scale_factor
else:
if skip:
print("\nGrad overflow on iteration", self.cur_iter)
print("Using static loss scale of", self.cur_scale)
self.cur_iter +=1
return
# Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
def _get_state(self):
return self.optimizer.state
def _set_state(self, value):
self.optimizer.state = value
state = property(_get_state, _set_state)
# Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
# (for example, to adjust the learning rate)
def _get_param_groups(self):
return self.optimizer.param_groups
def _set_param_groups(self, value):
self.optimizer.param_groups = value
param_groups = property(_get_param_groups, _set_param_groups)
def state_dict(self):
"""
Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
of the contained Pytorch optimizer.
Example::
checkpoint = {}
checkpoint['model'] = model.state_dict()
checkpoint['optimizer'] = optimizer.state_dict()
torch.save(checkpoint, "saved.pth")
"""
state_dict = {}
state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
state_dict['cur_scale'] = self.cur_scale
state_dict['cur_iter'] = self.cur_iter
if state_dict['dynamic_loss_scale']:
state_dict['last_overflow_iter'] = self.last_overflow_iter
state_dict['scale_factor'] = self.scale_factor
state_dict['scale_window'] = self.scale_window
state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
state_dict['fp32_groups_flat'] = self.fp32_groups_flat
return state_dict
def load_state_dict(self, state_dict):
"""
Loads a state_dict created by an earlier call to state_dict().
If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
whose parameters in turn came from ``model``, it is expected that the user
will call ``model.load_state_dict()`` before
``fp16_optimizer_instance.load_state_dict()`` is called.
Example::
model = torch.nn.Linear(D_in, D_out).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
...
checkpoint = torch.load("saved.pth")
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
"""
# I think it should actually be ok to reload the optimizer before the model.
self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
self.cur_scale = state_dict['cur_scale']
self.cur_iter = state_dict['cur_iter']
if state_dict['dynamic_loss_scale']:
self.last_overflow_iter = state_dict['last_overflow_iter']
self.scale_factor = state_dict['scale_factor']
self.scale_window = state_dict['scale_window']
self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
# At this point, the optimizer's references to the model's fp32 parameters are up to date.
# The optimizer's hyperparameters and internal buffers are also up to date.
# However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
# out of date. There are two options.
# 1: Refresh the master params from the model's fp16 params.
# This requires less storage but incurs precision loss.
# 2: Save and restore the fp32 master copies separately.
# We choose option 2.
#
# Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
# of their associated parameters, because it's possible those buffers might not exist yet in
# the current optimizer instance. In our case, as long as the current FP16_Optimizer has been
# constructed in the same way as the one whose state_dict we are loading, the same master params
# are guaranteed to exist, so we can just copy_() from the saved master params.
for current, saved in zip(self.fp32_groups_flat, state_dict['fp32_groups_flat']):
current.data.copy_(saved.data)
......@@ -6,6 +6,7 @@ from collections import OrderedDict
from itertools import chain
import copy
import importlib
from ..multi_tensor_apply import multi_tensor_applier
imported_flatten_impl = False
......@@ -226,7 +227,13 @@ class DistributedDataParallel(Module):
self.param_type_to_tmp_i = {"torch.cuda.HalfTensor" : 0,
"torch.cuda.FloatTensor" : 1,
"torch.cuda.DoubleTensor" : 2}
if multi_tensor_applier.available:
# TODO: I really need to centralize the C++-backed imports
import amp_C
self.multi_tensor_scale = amp_C.multi_tensor_scale
self._overflow_buf = torch.cuda.IntTensor([0])
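# multi_tensor_scale with a scale of 1.0 is used below as a fused copy across a list of
# tensors; _overflow_buf is the kernel's noop/overflow flag.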
self.create_hooks()
flat_dist_call([param.data for param in self.module.parameters()], dist.broadcast, (0,) )
......@@ -396,8 +403,15 @@ class DistributedDataParallel(Module):
"allreduce buffer. This is almost certainly an error.")
self.allreduce_buffers[bucket_idx] = allreduced
else:
for buf, synced in zip(bucket, unflatten(allreduced, bucket)):
buf.copy_(synced)
if multi_tensor_applier.available:
multi_tensor_applier(
self.multi_tensor_scale,
self._overflow_buf,
[unflatten(allreduced, bucket), bucket],
1.0)
else:
for buf, synced in zip(bucket, unflatten(allreduced, bucket)):
buf.copy_(synced)
def allreduce_fallback(self):
......
......@@ -67,7 +67,10 @@ class SyncBatchNorm(_BatchNorm):
self.channel_last = channel_last
def forward(self, input):
if not self.training and self.track_running_stats and not self.channel_last:
# if input.dim() == 2, we switch to channel_last for efficient memory accessing
channel_last = self.channel_last if input.dim() != 2 else True
if not self.training and self.track_running_stats and not channel_last:
# fall back to pytorch implementation for inference
return F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps)
else:
......@@ -78,4 +81,4 @@ class SyncBatchNorm(_BatchNorm):
exponential_average_factor = 1.0 / float(self.num_batches_tracked)
else:
exponential_average_factor = self.momentum
return SyncBatchnormFunction.apply(input, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, self.channel_last)
return SyncBatchnormFunction.apply(input, self.weight, self.bias, self.running_mean, self.running_var, self.eps, self.training or not self.track_running_stats, exponential_average_factor, self.process_group, channel_last)
......@@ -22,10 +22,13 @@ class SyncBatchnormFunction(Function):
if channel_last:
count = int(input.numel()/input.size(-1))
mean, var_biased = syncbn.welford_mean_var_c_last(input)
else :
else:
count = int(input.numel()/input.size(1))
mean, var_biased = syncbn.welford_mean_var(input)
if count == 1:
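# with a single value per channel the batch variance is zero and training statistics are
# meaningless; this mirrors the corresponding check in upstream PyTorch batch norm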
raise ValueError('Expected more than 1 value per channel when training, got input size {}'.format(input.size()))
if torch.distributed.is_initialized():
if not process_group:
process_group = torch.distributed.group.WORLD
......@@ -48,7 +51,7 @@ class SyncBatchnormFunction(Function):
running_variance.data = running_variance.data * (1-momentum) + momentum*r_v_inc
else:
mean = running_mean.data
inv_std = 1.0 / torch.sqrt(running_var.data + eps)
inv_std = 1.0 / torch.sqrt(running_variance.data + eps)
ctx.save_for_backward(input, weight, mean, inv_std)
ctx.process_group = process_group
......
......@@ -72,10 +72,9 @@ class SyncBatchNorm(_BatchNorm):
return F.batch_norm(input, self.running_mean, self.running_var, self.weight, self.bias, False, 0.0, self.eps)
else:
process_group = self.process_group
world_size = 0
world_size = 1
if not self.process_group:
process_group = torch.distributed.group.WORLD
world_size = torch.distributed.get_world_size(process_group)
self.num_batches_tracked += 1
with torch.no_grad():
channel_first_input = input.transpose(0, 1).contiguous()
......@@ -88,6 +87,7 @@ class SyncBatchNorm(_BatchNorm):
local_sqr_mean = torch.pow(
squashed_input_tensor_view, 2).mean(1)
if torch.distributed.is_initialized():
world_size = torch.distributed.get_world_size(process_group)
torch.distributed.all_reduce(
local_mean, ReduceOp.SUM, process_group)
mean = local_mean / world_size
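# world_size stays 1 when torch.distributed is not initialized, so this division is a no-op
# in the single-process case.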
......
......@@ -18,37 +18,26 @@ void multi_tensor_sgd_cuda(
bool first_run,
bool wd_after_momentum);
void scale_check_overflow_cuda(
const at::Tensor& grads,
float scale,
const at::Tensor& d_buf,
const at::Tensor& downscaled_grads);
void scale_check_overflow(
at::Tensor grads,
float scale,
at::Tensor overflow_buf,
at::Tensor downscaled_grads)
// const at::optional<at::Tensor> downscaled_grads)
{
AT_CHECK(grads.type().is_cuda(), "grads must be a CUDA tensor");
AT_CHECK(grads.is_contiguous(), "grads must be contiguous");
AT_CHECK(overflow_buf.type().is_cuda(), "overflow_buf must be a CUDA tensor");
AT_CHECK(overflow_buf.is_contiguous(), "overflow_buf must be contiguous");
AT_CHECK(downscaled_grads.type().is_cuda(), "downscaled_grads must be a CUDA tensor");
AT_CHECK(downscaled_grads.is_contiguous(), "downscaled_grads must be contiguous");
// Make sure we are downscaling the FP32 master grads
AT_CHECK(downscaled_grads.type().scalarType() == at::ScalarType::Float,
"The output grads supplied to scale_check_overflow should be fp32 (master grads).")
AT_CHECK(grads.numel() == downscaled_grads.numel(), "Input and output grads must be the same size.");
void multi_tensor_axpby_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
float a,
float b,
int arg_to_check);
scale_check_overflow_cuda(grads, scale, overflow_buf, downscaled_grads);
}
at::Tensor multi_tensor_l2norm_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists);
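// Each multi_tensor_* entry point operates on a vector of tensor lists: corresponding tensors
// across the inner lists are processed chunk_size elements at a time, and noop_flag is a
// device-side flag the kernels check to skip work (e.g., once an overflow has been recorded).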
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("scale_check_overflow", &scale_check_overflow, "Fused overflow check + scale for FP32 tensors");
m.def("multi_tensor_scale", &multi_tensor_scale_cuda,
"Fused overflow check + scale for a list of contiguous tensors");
m.def("multi_tensor_sgd", &multi_tensor_sgd_cuda,
"Fused SGD optimizer for list of contiguous tensors");
m.def("multi_tensor_axpby", &multi_tensor_axpby_cuda,
"out = a*x + b*y for a list of contiguous tensors");
m.def("multi_tensor_l2norm", &multi_tensor_l2norm_cuda,
"Computes L2 norm for a list of contiguous tensors");
}