Commit b84d7a90 authored by mohammad

moved module to model and removed fp16

parent 97ba5c0e
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fp16util import (
BN_convert_float,
network_to_half,
prep_param_lists,
model_grads_to_master_grads,
master_params_to_model_params,
tofp16,
to_python_float,
clip_grad_norm,
convert_module,
convert_network,
FP16Model,
)
from .fp16 import *
from .loss_scaler import *
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Stable version of apex FP16 Optimizer"""
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn.parameter import Parameter
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from .loss_scaler import DynamicLossScaler, LossScaler
from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
from megatron.module import MegatronModule
from megatron import mpu
FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
def conversion_helper(val, conversion):
"""Apply conversion to val. Recursively apply conversion if `val` is a nested tuple/list structure."""
if not isinstance(val, (tuple, list)):
return conversion(val)
rtn = [conversion_helper(v, conversion) for v in val]
if isinstance(val, tuple):
rtn = tuple(rtn)
return rtn
def fp32_to_fp16(val):
"""Convert fp32 `val` to fp16"""
def half_conversion(val):
val_typecheck = val
if isinstance(val_typecheck, (Parameter, Variable)):
val_typecheck = val.data
if isinstance(val_typecheck, FLOAT_TYPES):
val = val.half()
return val
return conversion_helper(val, half_conversion)
def fp16_to_fp32(val):
"""Convert fp16 `val` to fp32"""
def float_conversion(val):
val_typecheck = val
if isinstance(val_typecheck, (Parameter, Variable)):
val_typecheck = val.data
if isinstance(val_typecheck, HALF_TYPES):
val = val.float()
return val
return conversion_helper(val, float_conversion)
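# A minimal usage sketch (illustrative, not part of the original file):
# `conversion_helper` recurses through nested tuples/lists, so mixed
# structures convert in a single call and keep their shape.
#
#   vals = (torch.randn(2, 2).cuda(), [torch.randn(3).cuda()])
#   halves = fp32_to_fp16(vals)    # every fp32 tensor becomes fp16
#   floats = fp16_to_fp32(halves)  # round-trips back to fp32
#   assert floats[0].dtype == torch.float32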
class FP16_Module(MegatronModule):
def __init__(self, module):
super(FP16_Module, self).__init__()
self.add_module('module', module.half())
def forward(self, *inputs, **kwargs):
if mpu.is_pipeline_first_stage():
inputs = fp32_to_fp16(inputs)
outputs = self.module(*inputs, **kwargs)
if mpu.is_pipeline_last_stage():
outputs = fp16_to_fp32(outputs)
return outputs
def state_dict(self, destination=None, prefix='', keep_vars=False):
return self.module.state_dict(destination, prefix, keep_vars)
def state_dict_for_save_checkpoint(self, destination=None, prefix='',
keep_vars=False):
return self.module.state_dict_for_save_checkpoint(destination, prefix,
keep_vars)
def load_state_dict(self, state_dict, strict=True):
self.module.load_state_dict(state_dict, strict=strict)
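# Illustrative sketch of wrapping a model (assumes a single-stage pipeline,
# where this rank is both the first and last stage; the toy model is
# hypothetical):
#
#   model = torch.nn.Linear(16, 16).cuda()
#   model = FP16_Module(model)               # parameters are halved once
#   out = model(torch.randn(4, 16).cuda())   # fp32 in -> fp16 compute -> fp32 out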
# TODO: Update overflow check + downscale to use Carl's fused kernel.
class FP16_Optimizer(object):
"""
:class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer,
and manage static or dynamic loss scaling and master weights in a manner transparent to the user.
For standard use, only two lines must be changed: creating the :class:`FP16_Optimizer` instance,
and changing the call to ``backward``.
Example::
model = torch.nn.Linear(D_in, D_out).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
# Name the FP16_Optimizer instance to replace the existing optimizer
# (recommended but not required):
optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
...
# loss.backward() becomes:
optimizer.backward(loss)
...
Example with dynamic loss scaling::
...
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
# optional arg to control dynamic loss scaling behavior
# dynamic_loss_args={'scale_window' : 500})
# Usually, dynamic_loss_args is not necessary.
Args:
init_optimizer (torch.optim.optimizer): Existing optimizer created with the parameters to optimize. Internally, :class:`FP16_Optimizer` replaces the passed optimizer's fp16 parameters, if any, with fp32 master parameters copied from the original ones. :class:`FP16_Optimizer` also stores references to the original fp16 parameters, and updates these fp16 parameters from the master fp32 copy at the end of each :attr:`step`.
static_loss_scale (float, optional, default=1.0): Loss scale used internally to scale gradients computed by the model. Any fp16 gradients will be copied to fp32, then downscaled before being applied to the fp32 master params, so ``static_loss_scale`` should not affect learning rate.
dynamic_loss_scale (bool, optional, default=False): Use dynamic loss scaling. If True, this will override any ``static_loss_scale`` option.
dynamic_loss_args (dict, optional, default=None): Dict of kwargs that will be forwarded to the internal :class:`DynamicLossScaler` instance's constructor. Keys of this dict must match kwargs accepted by :class:`DynamicLossScaler`'s constructor. If ``dynamic_loss_args`` is unspecified, :class:`DynamicLossScaler`'s defaults will be used.
verbose (bool, optional, default=False): If True, FP16_Optimizer's constructor prints out the parameters and parameter groups it is ingesting, as a sanity check. This printing can be noisy for large models, so it is disabled by default. ``verbose=False`` does not disable printing when the loss scale is readjusted during dynamic loss scaling.
``init_optimizer`` is expected to have been constructed in the ordinary way.
It is recommended (although not required) that the newly constructed :class:`FP16_Optimizer` instance be
named to replace ``init_optimizer``, for two reasons:
First, it means that references to the same name
later in the file will not have to change.
Second, :class:`FP16_Optimizer` reserves the right (as an implementation detail) to
modify ``init_optimizer``. If you do choose a unique name for the new
:class:`FP16_Optimizer` instance, you should only work with this new instance,
because the preexisting optimizer might no longer behave as expected.
``init_optimizer`` may be any Pytorch optimizer.
It may contain a mixture of fp16 and fp32 parameters organized into any number of
``param_groups`` with different hyperparameters. The :class:`FP16_Optimizer` constructor will
ingest these ``param_groups`` and remember them.
Calls to ::
loss.backward()
must be replaced with ::
optimizer.backward(loss)
because :class:`FP16_Optimizer` requires ownership of the backward pass to implement
loss scaling and copies to master gradients.
.. note::
Loss scaling, either static or dynamic, is orthogonal to learning rate, because gradients
are downscaled before being applied. This means that adjusting the loss scale, or using
dynamic loss scaling, should not require retuning the learning rate or any other
hyperparameters.
**Advanced options**
**Closures**: :class:`FP16_Optimizer` can wrap a Pytorch optimizer that receives a closure.
See docstring for :attr:`step`.
**Gradient clipping**: Use :attr:`clip_master_grads`.
**Multiple losses**: If your model accumulates gradients from multiple losses,
this can be made more efficient by supplying ``update_master_grads=False``
to :attr:`backward`. See docstring for :attr:`backward`.
**Manually adjusting loss scale**: The current loss scale can be retrieved or set via ::
print(optimizer.loss_scale)
optimizer.loss_scale = new_loss_scale
For static loss scaling, manually adjusting the loss scale over time is a reasonable
thing to do. During later epochs, gradients may become smaller, and a
higher loss scale may be required, analogous to scheduling the learning rate. Dynamic loss
scaling is more subtle (see :class:`DynamicLossScaler`) and in this case, manually adjusting
the loss scale is not recommended.
**Multi-GPU training**: If the wrapped ``init_optimizer`` was created from a model wrapped in
Pytorch DistributedDataParallel or Apex DistributedDataParallel, :class:`FP16_Optimizer`
should still work as intended.
"""
def __init__(self,
init_optimizer,
static_loss_scale=1.0,
dynamic_loss_scale=False,
dynamic_loss_args=None,
verbose=False):
if not torch.cuda.is_available():
raise SystemError("Cannot use fp16 without CUDA.")
self.verbose = verbose
self.optimizer = init_optimizer
# init_state_dict sets up an alternative way to cast per-param state tensors.
# Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
# init_state_dict = init_optimizer.state_dict()
self.fp16_groups = []
self.fp32_from_fp16_groups = []
self.fp32_from_fp32_groups = []
for i, param_group in enumerate(self.optimizer.param_groups):
self.maybe_print("FP16_Optimizer processing param group {}:".format(i))
fp16_params_this_group = []
fp32_params_this_group = []
fp32_from_fp16_params_this_group = []
for i, param in enumerate(param_group['params']):
if param.requires_grad:
if param.type() == 'torch.cuda.HalfTensor':
self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
.format(param.size()))
fp16_params_this_group.append(param)
master_param = param.detach().clone().float()
master_param.requires_grad = True
# Copy the model parallel flag.
master_param.tensor_model_parallel = param.tensor_model_parallel
param_group['params'][i] = master_param
fp32_from_fp16_params_this_group.append(master_param)
# Reset existing state dict key to the new master param.
# We still need to recast per-param state tensors, if any, to FP32.
if param in self.optimizer.state:
self.optimizer.state[master_param] = self.optimizer.state.pop(param)
elif param.type() == 'torch.cuda.FloatTensor':
self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
.format(param.size()))
fp32_params_this_group.append(param)
param_group['params'][i] = param
else:
raise TypeError("Wrapped parameters must be either "
"torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
"Received {}".format(param.type()))
self.fp16_groups.append(fp16_params_this_group)
self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
self.fp32_from_fp32_groups.append(fp32_params_this_group)
# Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
self.optimizer.load_state_dict(self.optimizer.state_dict())
# alternative way to cast per-param state tensors:
# self.optimizer.load_state_dict(init_state_dict)
if dynamic_loss_scale:
self.dynamic_loss_scale = True
if dynamic_loss_args is not None:
self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
else:
self.loss_scaler = DynamicLossScaler()
else:
self.dynamic_loss_scale = False
self.loss_scaler = LossScaler(static_loss_scale)
self.overflow = False
self.first_closure_call_this_step = True
self.clip_grad_norm = clip_grad_norm
def maybe_print(self, msg):
if self.verbose:
print(msg)
def __getstate__(self):
raise RuntimeError("FP16_Optimizer should be serialized using state_dict().")
def __setstate__(self, state):
raise RuntimeError("FP16_Optimizer should be deserialized using load_state_dict().")
def zero_grad(self, set_grads_to_None=False):
"""
Zero fp32 and fp16 parameter grads.
"""
# In principle, only the .grad attributes of the model params need to be zeroed,
# because gradients are copied into the FP32 master params. However, we zero
# all gradients owned by the optimizer, just to be safe:
for group in self.optimizer.param_groups:
for p in group['params']:
if set_grads_to_None:
p.grad = None
else:
if p.grad is not None:
p.grad.detach_()
p.grad.zero_()
# Zero fp16 gradients owned by the model:
for fp16_group in self.fp16_groups:
for param in fp16_group:
if set_grads_to_None:
param.grad = None
else:
if param.grad is not None:
param.grad.detach_() # as in torch.optim.optimizer.zero_grad()
param.grad.zero_()
def _check_overflow(self):
params = []
for group in self.fp16_groups:
for param in group:
params.append(param)
for group in self.fp32_from_fp32_groups:
for param in group:
params.append(param)
self.overflow = self.loss_scaler.has_overflow(params)
def _update_scale(self, has_overflow=False):
self.loss_scaler.update_scale(has_overflow)
def _master_params_to_model_params(self):
for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
master_params_to_model_params(fp16_group, fp32_from_fp16_group)
def _model_params_to_master_params(self):
for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
master_params_to_model_params(fp32_from_fp16_group, fp16_group)
# To consider: Integrate distributed with this wrapper by registering a hook on each variable
# that does the overflow check, gradient copy + downscale, and fp32
# allreduce in a different stream.
def _model_grads_to_master_grads(self):
for fp16_group, fp32_from_fp16_group in zip(self.fp16_groups, self.fp32_from_fp16_groups):
model_grads_to_master_grads(fp16_group, fp32_from_fp16_group)
def _downscale_master(self):
if self.loss_scale != 1.0:
for group in self.optimizer.param_groups:
grads = [p.grad for p in group['params'] if p.grad is not None]
_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(amp_C.multi_tensor_scale,
_overflow_buf,
[grads, grads],
1./self.loss_scale)
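# For clarity, the fused downscale above is equivalent to the following
# unfused sketch (one kernel launch per gradient instead of a few in total):
#
#   for group in self.optimizer.param_groups:
#       for p in group['params']:
#           if p.grad is not None:
#               p.grad.data.mul_(1.0 / self.loss_scale)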
def clip_master_grads(self, max_norm, norm_type=2):
"""
Clips fp32 master gradients via ``torch.nn.utils.clip_grad_norm``.
Args:
max_norm (float or int): max norm of the gradients
norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
infinity norm.
Returns:
Total norm of the current fp32 gradients (viewed as a single vector).
.. warning::
Returns -1 if the most recently computed fp16 gradients overflowed (that is, if ``self.overflow`` is ``True``).
"""
if not self.overflow:
fp32_params = []
for param_group in self.optimizer.param_groups:
for param in param_group['params']:
fp32_params.append(param)
return self.clip_grad_norm(fp32_params, max_norm, norm_type)
else:
return -1
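# Typical call sequence (a sketch of the intended API): clip after the
# gradients have been copied and unscaled by backward(); -1 signals that
# the fp16 gradients overflowed and no clipping was performed.
#
#   optimizer.backward(loss)
#   grad_norm = optimizer.clip_master_grads(max_norm=1.0)
#   optimizer.step()   # step() itself skips the update on overflow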
def state_dict(self):
"""
Returns a dict containing the current state of this :class:`FP16_Optimizer` instance.
This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict
of the contained Pytorch optimizer.
Example::
checkpoint = {}
checkpoint['model'] = model.state_dict()
checkpoint['optimizer'] = optimizer.state_dict()
torch.save(checkpoint, "saved.pth")
"""
state_dict = {}
state_dict['loss_scaler'] = self.loss_scaler
state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
state_dict['overflow'] = self.overflow
state_dict['first_closure_call_this_step'] = self.first_closure_call_this_step
state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
state_dict['fp32_from_fp16'] = self.fp32_from_fp16_groups
return state_dict
def load_state_dict(self, state_dict):
"""
Loads a state_dict created by an earlier call to state_dict().
If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``,
whose parameters in turn came from ``model``, it is expected that the user
will call ``model.load_state_dict()`` before
``fp16_optimizer_instance.load_state_dict()`` is called.
Example::
model = torch.nn.Linear(D_in, D_out).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0)
...
checkpoint = torch.load("saved.pth")
model.load_state_dict(checkpoint['model'])
optimizer.load_state_dict(checkpoint['optimizer'])
"""
# I think it should actually be ok to reload the optimizer before the model.
self.loss_scaler = state_dict['loss_scaler']
self.dynamic_loss_scale = state_dict['dynamic_loss_scale']
self.overflow = state_dict['overflow']
self.first_closure_call_this_step = state_dict['first_closure_call_this_step']
self.optimizer.load_state_dict(state_dict['optimizer_state_dict'])
# At this point, the optimizer's references to the model's fp32 parameters are up to date.
# The optimizer's hyperparameters and internal buffers are also up to date.
# However, the fp32 master copies of the model's fp16 params stored by the optimizer are still
# out of date. There are two options.
# 1: Refresh the master params from the model's fp16 params.
# This requires less storage but incurs precision loss.
# 2: Save and restore the fp32 master copies separately.
# We choose option 2.
#
# Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device
# of their associated parameters, because it's possible those buffers might not exist yet in
# the current optimizer instance. In our case, as long as the current FP16_Optimizer has been
# constructed in the same way as the one whose state_dict we are loading, the same master params
# are guaranteed to exist, so we can just copy_() from the saved master params.
for current_group, saved_group in zip(
self.fp32_from_fp16_groups, state_dict['fp32_from_fp16']):
for current, saved in zip(current_group, saved_group):
current.data.copy_(saved.data)
def step(self, closure=None): # could add clip option.
"""
If no closure is supplied, :attr:`step` should be called after
``fp16_optimizer_obj.backward(loss)``.
:attr:`step` updates the fp32 master copy of parameters using the optimizer supplied to
:class:`FP16_Optimizer`'s constructor, then copies the updated fp32 params into the fp16 params
originally referenced by :class:`FP16_Optimizer`'s constructor, so the user may immediately run
another forward pass using their model.
If a closure is supplied, :attr:`step` may be called without a prior call to
:attr:`backward(loss)`.
This control flow is identical to `ordinary Pytorch optimizer use`_ with closures.
However, the user should take care that any ``loss.backward()`` call within the closure
has been replaced by ``fp16_optimizer_obj.backward(loss)``.
Args:
closure (optional): Closure that will be supplied to the underlying optimizer originally passed to :class:`FP16_Optimizer`'s constructor. closure should call :attr:`zero_grad()` on the :class:`FP16_Optimizer` object, compute the loss, call :attr:`backward(loss)`, and return the loss.
Example with closure::
# optimizer is assumed to be an FP16_Optimizer object, previously constructed from an
# existing pytorch optimizer.
for input, target in dataset:
def closure():
optimizer.zero_grad()
output = model(input)
loss = loss_fn(output, target)
# loss.backward() becomes:
optimizer.backward(loss)
return loss
optimizer.step(closure)
.. warning::
Currently, calling :attr:`step` with a closure is not compatible with dynamic loss scaling.
.. _`ordinary Pytorch optimizer use`:
http://pytorch.org/docs/master/optim.html#optimizer-step-closure
"""
scale = self.loss_scaler.loss_scale
self._update_scale(self.overflow)
if self.overflow:
self.maybe_print("OVERFLOW! Skipping step. Attempted loss scale: {}, reducing to {}"
.format(scale, self.loss_scale))
return
if closure is not None:
retval = self._step_with_closure(closure)
else:
retval = self.optimizer.step()
self._master_params_to_model_params()
return retval
def _step_with_closure(self, closure):
def wrapped_closure():
# helpful for debugging
# print("Calling wrapped_closure, first_closure_call_this_step = {}"
# .format(self.first_closure_call_this_step))
if self.first_closure_call_this_step:
# We expect that the fp16 params are initially fresh on entering self.step(),
# so _master_params_to_model_params() is unnecessary the first time wrapped_closure()
# is called within self.optimizer.step().
self.first_closure_call_this_step = False
else:
# If self.optimizer.step() internally calls wrapped_closure more than once,
# it may update the fp32 params after each call. However, self.optimizer
# doesn't know about the fp16 params at all. If the fp32 params get updated,
# we can't rely on self.optimizer to refresh the fp16 params. We need
# to handle that manually:
self._master_params_to_model_params()
# Our API expects the user to give us ownership of the backward() call by
# replacing all calls to loss.backward() with optimizer.backward(loss).
# This requirement holds whether or not the call to backward() is made within a closure.
# If the user is properly calling optimizer.backward(loss) within "closure,"
# calling closure() here will give the fp32 master params fresh gradients
# for the optimizer to play with, so all wrapped_closure needs to do is call
# closure() and return the loss.
temp_loss = closure()
while self.overflow:
scale = self.loss_scaler.loss_scale
self._update_scale(self.overflow)
self.maybe_print("OVERFLOW within closure! Skipping step. Attempted loss scale: {}, "
"reducing to {}".format(scale, self.loss_scale))
temp_loss = closure()
return temp_loss
retval = self.optimizer.step(wrapped_closure)
self.first_closure_call_this_step = True
return retval
def backward(self, output_tensor, update_master_grads=True, retain_graph=False,
output_tensor_grad=None):
"""
:attr:`backward` performs the following conceptual steps:
1. fp32_loss = loss.float() (see first Note below)
2. scaled_loss = fp32_loss*loss_scale
3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's leaves (which may be fp16, fp32, or a mixture, depending how your model was defined).
4. fp16 grads are then copied to the master params' ``.grad`` attributes (see second Note), which are guaranteed to be fp32.
5. Finally, master grads are divided by loss_scale.
In this way, after :attr:`backward`, the master params have fresh gradients,
and :attr:`step` may be called.
.. note::
:attr:`backward` internally converts the loss to fp32 before applying the loss scale.
This provides some additional safety against overflow if the user has supplied an
fp16 loss value.
However, for maximum overflow safety, the user should
compute the loss criterion (MSE, cross entropy, etc) in fp32 before supplying it to
:attr:`backward`.
.. warning::
The gradients found in a model's leaves after the call to
:attr:`backward` should not be regarded as valid in general,
because it's possible
they have been scaled (and in the case of dynamic loss scaling,
the scale factor may change over time).
If the user wants to inspect gradients after a call to :attr:`backward`,
only the master gradients should be regarded as valid. These can be retrieved via
:attr:`inspect_master_grad_data()`.
Args:
output_tensor: On the last pipeline stage, the loss output by the user's model; on earlier stages, the output tensor of that stage. The loss may be either float or half (but see first Note above).
update_master_grads (bool, optional, default=True): Option to copy fp16 grads to fp32 grads on this call. By setting this to False, the user can delay the copy, which is useful to eliminate redundant fp16->fp32 grad copies if :attr:`backward` is being called on multiple losses in one iteration. If set to False, the user becomes responsible for calling :attr:`update_master_grads` before calling :attr:`step`.
retain_graph (bool, optional, default=False): Forwards the usual ``retain_graph=True`` option to the internal call to ``loss.backward``. If ``retain_graph`` is being used to accumulate gradient values from multiple backward passes before calling ``optimizer.step``, passing ``update_master_grads=False`` is also recommended (see Example below).
output_tensor_grad (torch.Tensor, optional, default=None): Gradient with respect to ``output_tensor``, as received from the next pipeline stage. If ``None``, ``output_tensor`` is treated as the loss and is scaled by the loss scale before the backward pass.
Example::
# Ordinary operation:
optimizer.backward(loss)
# Naive operation with multiple losses (technically valid, but less efficient):
# fp32 grads will be correct after the second call, but
# the first call incurs an unnecessary fp16->fp32 grad copy.
optimizer.backward(loss1)
optimizer.backward(loss2)
# More efficient way to handle multiple losses:
# The fp16->fp32 grad copy is delayed until fp16 grads from all
# losses have been accumulated.
optimizer.backward(loss1, update_master_grads=False)
optimizer.backward(loss2, update_master_grads=False)
optimizer.update_master_grads()
"""
# To consider: try multiple backward passes using retain_graph=True to find
# a loss scale that works. After you find a loss scale that works, do a final dummy
# backward pass with retain_graph=False to tear down the graph. Doing this would avoid
# discarding the iteration, but probably wouldn't improve overall efficiency.
self.loss_scaler.backward(output_tensor, retain_graph=retain_graph,
output_tensor_grad=output_tensor_grad)
if update_master_grads:
self.update_master_grads()
def update_master_grads(self):
"""
Copy the ``.grad`` attribute from stored references to fp16 parameters to
the ``.grad`` attribute of the fp32 master parameters that are directly
updated by the optimizer. :attr:`update_master_grads` only needs to be called if
``fp16_optimizer_obj.backward`` was called with ``update_master_grads=False``.
"""
if self.dynamic_loss_scale:
self._check_overflow()
if self.overflow:
return
self._model_grads_to_master_grads()
self._downscale_master()
def inspect_master_grad_data(self):
"""
When running with :class:`FP16_Optimizer`,
``.grad`` attributes of a model's fp16 leaves should not be
regarded as truthful, because they might be scaled.
After a call to :attr:`fp16_optimizer_obj.backward(loss)`, if no overflow was encountered,
the fp32 master params' ``.grad``
attributes will contain valid gradients properly divided by the loss scale. However,
because :class:`FP16_Optimizer` flattens some parameters, accessing them may be
nonintuitive. :attr:`inspect_master_grad_data`
allows those gradients to be viewed with shapes corresponding to their associated model leaves.
Returns:
List of lists (one list for each parameter group). The list for each parameter group
is a list of the ``.grad.data`` attributes of the fp32 master params belonging to that group.
"""
if self.overflow:
print("Warning: calling FP16_Optimizer.inspect_master_grad_data while in an overflow state. "
"Gradients are currently invalid (may be inf, nan, or stale). Returning None.")
return None
else:
# The optimizer owns only references to master params.
master_grads_data = []
for param_group in self.optimizer.param_groups:
master_grads_this_group = []
for param in param_group['params']:
if param.grad is not None:
master_grads_this_group.append(param.grad.data)
else:
master_grads_this_group.append(None)
master_grads_data.append(master_grads_this_group)
return master_grads_data
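# Usage sketch: inspect unscaled master gradients after a backward pass.
#
#   optimizer.backward(loss)
#   master_grads = optimizer.inspect_master_grad_data()
#   if master_grads is not None:
#       first_group_grads = master_grads[0]   # list of .grad.data tensors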
# Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale"
def _get_loss_scale(self):
return self.loss_scaler.loss_scale
def _set_loss_scale(self, value):
self.loss_scaler.cur_scale = value
loss_scale = property(_get_loss_scale, _set_loss_scale)
# Promote state so it can be retrieved or set via "fp16_optimizer_instance.state"
def _get_state(self):
return self.optimizer.state
def _set_state(self, value):
self.optimizer.state = value
state = property(_get_state, _set_state)
# Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups"
# (for example, to adjust the learning rate)
def _get_param_groups(self):
return self.optimizer.param_groups
def _set_param_groups(self, value):
self.optimizer.param_groups = value
param_groups = property(_get_param_groups, _set_param_groups)
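# End-to-end sketch of a training loop with FP16_Optimizer (the names
# `model`, `loss_fn`, `loader`, and `batch` are hypothetical; a single
# pipeline stage is assumed):
#
#   model = FP16_Module(model.cuda())
#   optimizer = FP16_Optimizer(torch.optim.Adam(model.parameters(), lr=1e-4),
#                              dynamic_loss_scale=True)
#   for batch in loader:
#       optimizer.zero_grad()
#       loss = loss_fn(model(batch))
#       optimizer.backward(loss)          # replaces loss.backward()
#       optimizer.clip_master_grads(1.0)
#       optimizer.step()                  # skipped internally on overflow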
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
from megatron import mpu
class tofp16(nn.Module):
"""
Utility module that implements::
def forward(self, input):
return input.half()
"""
def __init__(self):
super(tofp16, self).__init__()
def forward(self, input):
return input.half()
def BN_convert_float(module):
"""
Utility function for network_to_half().
Retained for legacy purposes.
"""
if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
module.float()
for child in module.children():
BN_convert_float(child)
return module
def network_to_half(network):
"""
Convert model to half precision in a batchnorm-safe way.
Retained for legacy purposes. It is recommended to use FP16Model.
"""
return nn.Sequential(tofp16(), BN_convert_float(network.half()))
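# Usage sketch for the legacy helper: the tofp16() front-end casts inputs to
# half, while BN_convert_float keeps affine batchnorm layers in fp32.
#
#   net = MyConvNet().cuda()     # `MyConvNet` is a hypothetical model
#   net = network_to_half(net)   # weights halved, batchnorm stays fp32
#   out = net(torch.randn(1, 3, 32, 32).cuda())  # input halved by tofp16()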
def convert_module(module, dtype):
"""
Converts a module's immediate parameters and buffers to dtype.
"""
for param in module.parameters(recurse=False):
if param is not None:
if param.data.dtype.is_floating_point:
param.data = param.data.to(dtype=dtype)
if param._grad is not None and param._grad.data.dtype.is_floating_point:
param._grad.data = param._grad.data.to(dtype=dtype)
for buf in module.buffers(recurse=False):
if buf is not None and buf.data.dtype.is_floating_point:
buf.data = buf.data.to(dtype=dtype)
def convert_network(network, dtype):
"""
Converts a network's parameters and buffers to dtype.
"""
for module in network.modules():
if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
continue
convert_module(module, dtype)
return network
class FP16Model(nn.Module):
"""
Convert model to half precision in a batchnorm-safe way.
"""
def __init__(self, network):
super(FP16Model, self).__init__()
self.network = convert_network(network, dtype=torch.half)
def forward(self, *inputs):
inputs = tuple(t.half() for t in inputs)
return self.network(*inputs)
def backwards_debug_hook(grad):
raise RuntimeError("master_params recieved a gradient in the backward pass!")
def prep_param_lists(model, flat_master=False):
"""
Creates a list of FP32 master parameters for a given model, as in
`Training Neural Networks with Mixed Precision: Real Examples`_.
Args:
model (torch.nn.Module): Existing Pytorch model
flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization.
Returns:
A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master parameters. If ``flat_master=True``, ``master_params`` will be a list with one element.
Example::
model_params, master_params = prep_param_lists(model)
.. warning::
Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`.
.. _`Training Neural Networks with Mixed Precision: Real Examples`:
http://on-demand.gputechconf.com/gtc/2018/video/S81012/
"""
model_params = [param for param in model.parameters() if param.requires_grad]
if flat_master:
# Give the user some more useful error messages
try:
# flatten_dense_tensors returns a contiguous flat array.
# http://pytorch.org/docs/master/_modules/torch/_utils.html
master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
except BaseException:
print("Error in prep_param_lists: model may contain a mixture of parameters "
"of different types. Use flat_master=False, or use F16_Optimizer.")
raise
master_params = torch.nn.Parameter(master_params)
master_params.requires_grad = True
# master_params.register_hook(backwards_debug_hook)
if master_params.grad is None:
master_params.grad = master_params.new(*master_params.size())
return model_params, [master_params]
else:
master_params = [param.clone().float().detach() for param in model_params]
for param in master_params:
param.requires_grad = True
return model_params, master_params
def model_grads_to_master_grads(model_params, master_params, flat_master=False):
"""
Copy model gradients to master gradients.
Args:
model_params: List of model parameters created by :func:`prep_param_lists`.
master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`.
"""
if flat_master:
# The flattening may incur one more deep copy than is necessary.
master_params[0].grad.data.copy_(
_flatten_dense_tensors([p.grad.data for p in model_params]))
else:
for model, master in zip(model_params, master_params):
if model.grad is not None:
if master.grad is None:
master.grad = Variable(master.data.new(*master.data.size()))
else:
master.grad = None
model_grads = [p.grad for p in model_params if p.grad is not None]
master_grads = [p.grad for p in master_params if p.grad is not None]
_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(amp_C.multi_tensor_scale,
_overflow_buf,
[model_grads, master_grads],
1.0)
def master_params_to_model_params(model_params, master_params, flat_master=False):
"""
Copy master parameters to model parameters.
Args:
model_params: List of model parameters created by :func:`prep_param_lists`.
master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`.
"""
if flat_master:
for model, master in zip(model_params,
_unflatten_dense_tensors(master_params[0].data, model_params)):
model.data.copy_(master)
else:
for model, master in zip(model_params, master_params):
model.data.copy_(master.data)
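# Putting the three helpers together, a minimal master-weights training step
# looks like the following sketch (the names `model`, `criterion`, `x`, `y`,
# and `scale` are hypothetical):
#
#   model_params, master_params = prep_param_lists(model)
#   optimizer = torch.optim.SGD(master_params, lr=1e-3)
#   loss = criterion(model(x), y)
#   model.zero_grad()
#   (loss.float() * scale).backward()
#   model_grads_to_master_grads(model_params, master_params)
#   for p in master_params:
#       p.grad.data.mul_(1.0 / scale)      # undo the loss scale in fp32
#   optimizer.step()
#   master_params_to_model_params(model_params, master_params)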
# Backward compatibility fixes
def to_python_float(t):
if hasattr(t, 'item'):
return t.item()
else:
return t[0]
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
clip_grad_norm = None #mpu.clip_grad_norm
# elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
# clip_grad_norm = torch.nn.utils.clip_grad_norm
# else:
# clip_grad_norm = torch.nn.utils.clip_grad_norm_
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
from megatron import mpu
# item() is a recent addition, so this helps with backward compatibility.
def to_python_float(t):
if hasattr(t, 'item'):
return t.item()
else:
return t[0]
class LossScaler:
"""
Class that manages a static loss scale. This class is intended to interact with
:class:`FP16_Optimizer`, and should not be directly manipulated by the user.
Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
:class:`FP16_Optimizer`'s constructor.
Args:
scale (float, optional, default=1.0): The loss scale.
"""
def __init__(self, scale=1):
self.cur_scale = scale
# `params` is a list / generator of torch.Variable
def has_overflow(self, params):
return False
# `x` is a torch.Tensor
@staticmethod
def _has_inf_or_nan(x):
return False
def update_scale(self, overflow):
pass
@property
def loss_scale(self):
return self.cur_scale
def scale_gradient(self, module, grad_in, grad_out):
_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(amp_C.multi_tensor_scale,
_overflow_buf,
[grad_in, grad_in],
self.loss_scale)
return grad_in
def backward(self, output_tensor, retain_graph=False, output_tensor_grad=None):
# If output_tensor_grad is None, this is the last stage, and
# output_tensor is actually the loss and needs to be scaled.
# Otherwise, output_tensor does not need to be scaled again since
# output_tensor_grad is already scaled.
if output_tensor_grad is None:
scaled_output_tensor = output_tensor * self.loss_scale
else:
scaled_output_tensor = output_tensor
torch.autograd.backward(scaled_output_tensor, grad_tensors=output_tensor_grad,
retain_graph=retain_graph)
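# Sketch of how the two cases above map to pipeline stages (the `scaler`,
# `stage_output`, and `grad_from_next_stage` names are hypothetical):
#
#   # last stage: output_tensor is the loss; scale it before backward
#   scaler.backward(loss)
#   # intermediate stage: the incoming gradient is already scaled
#   scaler.backward(stage_output, output_tensor_grad=grad_from_next_stage)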
class DynamicLossScaler:
"""
Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler`
indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
:class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler`
operates, because the default options can be changed using
the ``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.
Loss scaling is designed to combat the problem of underflowing gradients encountered after
long periods of fp16 network training. Dynamic loss scaling begins by attempting a very high loss
scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are
encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
occurred.
:class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
If a certain number of iterations occur without overflowing gradients detected,
:class:`DynamicLossScaler` increases the loss scale once more.
In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
always using the highest loss scale possible without incurring overflow.
Args:
init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler`.
scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to ``loss_scale/scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to ``loss_scale*scale_factor``.
scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale.
"""
def __init__(self,
init_scale=2**32,
scale_factor=2.,
scale_window=1000,
min_scale=1,
delayed_shift=1,
consecutive_hysteresis=False):
self.cur_scale = init_scale
self.cur_iter = 0
self.last_overflow_iter = -1
self.scale_factor = scale_factor
self.scale_window = scale_window
self.min_scale = min_scale
self.delayed_shift = delayed_shift
self.cur_hysteresis = delayed_shift
self.consecutive_hysteresis = consecutive_hysteresis
# `params` is a list / generator of torch.Variable
def has_overflow_serial(self, params):
for p in params:
if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
return True
return False
def has_overflow(self, params):
overflow = self.has_overflow_serial(params)
# Since each model parallel GPU carries only part of the model,
# make sure overflow flag is synced across all the model parallel GPUs
overflow_gpu = torch.cuda.ByteTensor([overflow])
torch.distributed.all_reduce(overflow_gpu,
op=torch.distributed.ReduceOp.MAX,
group=mpu.get_model_parallel_group())
overflow = overflow_gpu[0].item()
return bool(overflow)
# `x` is a torch.Tensor
@staticmethod
def _has_inf_or_nan(x):
try:
# if x is half, the .float() incurs an additional deep copy, but it's necessary if
# Pytorch's .sum() creates a one-element tensor of the same type as x
# (which is true for some recent versions of Pytorch).
cpu_sum = float(x.float().sum())
# More efficient version that can be used if .sum() returns a Python scalar
# cpu_sum = float(x.sum())
except RuntimeError as instance:
# We want to check if inst is actually an overflow exception.
# RuntimeError could come from a different error.
# If so, we still want the exception to propagate.
if "value cannot be converted" not in instance.args[0]:
raise
return True
else:
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
return True
return False
# `overflow` is a boolean indicating whether the gradient overflowed
def update_scale(self, overflow):
if not hasattr(self, 'min_scale'):
self.min_scale = 1
if not hasattr(self, 'delayed_shift'):
self.delayed_shift = 1
if not hasattr(self, 'cur_hysteresis'):
self.cur_hysteresis = 1
if not hasattr(self, 'consecutive_hysteresis'):
self.consecutive_hysteresis = True
if overflow:
# self.cur_scale /= self.scale_factor
if self.delayed_shift == 1 or self.cur_hysteresis == 1:
self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale)
else:
self.cur_hysteresis -= 1
self.last_overflow_iter = self.cur_iter
else:
if self.consecutive_hysteresis:
self.cur_hysteresis = self.delayed_shift
if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
if not self.consecutive_hysteresis:
self.cur_hysteresis = self.delayed_shift
self.cur_scale *= self.scale_factor
self.cur_iter += 1
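# A tiny illustrative trace of update_scale (init_scale lowered to 2**4 for
# readability, scale_window=2, other arguments left at their defaults):
#
#   scaler = DynamicLossScaler(init_scale=2**4, scale_window=2)
#   scaler.update_scale(True)    # overflow: scale 16 -> 8
#   scaler.update_scale(False)   # clean iteration, window not yet elapsed
#   scaler.update_scale(False)   # two clean iters since overflow: 8 -> 16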
@property
def loss_scale(self):
return self.cur_scale
def scale_gradient(self, module, grad_in, grad_out):
_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(amp_C.multi_tensor_scale,
_overflow_buf,
[grad_in, grad_in],
self.loss_scale)
return grad_in
def backward(self, output_tensor, retain_graph=False, output_tensor_grad=None):
# If output_tensor_grad is None, this is the last stage, and
# output_tensor is actually the loss and needs to be scaled.
# Otherwise, output_tensor does not need to be scaled again since
# output_tensor_grad is already scaled.
if output_tensor_grad is None:
scaled_output_tensor = output_tensor * self.loss_scale
else:
scaled_output_tensor = output_tensor
torch.autograd.backward(scaled_output_tensor, grad_tensors=output_tensor_grad,
retain_graph=retain_graph)
##############################################################
# Example usage below here -- assuming it's in a separate file
##############################################################
"""
TO-DO separate out into an example.
if __name__ == "__main__":
import torch
from torch.autograd import Variable
from dynamic_loss_scaler import DynamicLossScaler
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in), requires_grad=False)
y = Variable(torch.randn(N, D_out), requires_grad=False)
w1 = Variable(torch.randn(D_in, H), requires_grad=True)
w2 = Variable(torch.randn(H, D_out), requires_grad=True)
parameters = [w1, w2]
learning_rate = 1e-6
optimizer = torch.optim.SGD(parameters, lr=learning_rate)
loss_scaler = DynamicLossScaler()
for t in range(500):
y_pred = x.mm(w1).clamp(min=0).mm(w2)
loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
print('Iter {} scaled loss: {}'.format(t, loss.data[0]))
print('Iter {} unscaled loss: {}'.format(t, loss.data[0] / loss_scaler.loss_scale))
# Run backprop
optimizer.zero_grad()
loss.backward()
# Check for overflow
has_overflow = loss_scaler.has_overflow(parameters)
# If no overflow, unscale grad and update as usual
if not has_overflow:
for param in parameters:
param.grad.data.mul_(1. / loss_scaler.loss_scale)
optimizer.step()
# Otherwise, don't do anything -- ie, skip iteration
else:
print('OVERFLOW!')
# Update loss scale for next iteration
loss_scaler.update_scale(has_overflow)
"""
@@ -30,9 +30,16 @@ def import_layernorm(fp32_residual_connection):
 from .distributed import *
-from .bert_model import BertModel, BertModelFirstStage, BertModelIntermediateStage, BertModelLastStage
-from .realm_model import ICTBertModel
-from .gpt2_model import GPT2Model, GPT2ModelFirstStage, GPT2ModelIntermediateStage, GPT2ModelLastStage
+from .bert_model import (BertModel,
+                         BertModelFirstStage,
+                         BertModelIntermediateStage,
+                         BertModelLastStage)
+from .gpt2_model import (GPT2Model,
+                         GPT2ModelFirstStage,
+                         GPT2ModelIntermediateStage,
+                         GPT2ModelLastStage)
 from .language_model import get_language_model
+from .module import FP16Module
+from .realm_model import ICTBertModel
@@ -26,7 +26,7 @@ from megatron.model.utils import openai_gelu, erf_gelu
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
 from megatron.model.utils import scaled_init_method_normal
-from megatron.module import MegatronModule, PipelinedMegatronModule
+from .module import MegatronModule
 def bert_attention_mask_func(attention_scores, attention_mask):
     attention_scores.masked_fill_(attention_mask, -10000.0)
@@ -127,7 +127,7 @@ def post_language_model_processing(lm_output, pooled_output,
     return lm_loss, binary_logits
-class BertModelBase(PipelinedMegatronModule):
+class BertModelBase(MegatronModule):
     """Bert Language model."""
     def __init__(self, num_tokentypes=2, add_binary_head=True,
...
@@ -24,10 +24,10 @@ from megatron.model.language_model import get_language_model
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
 from megatron.model.utils import scaled_init_method_normal
-from megatron.module import PipelinedMegatronModule
+from .module import MegatronModule
-class ClassificationBase(PipelinedMegatronModule):
+class ClassificationBase(MegatronModule):
     def __init__(self, num_classes, num_tokentypes=2):
         super(ClassificationBase, self).__init__(share_word_embeddings=False)
...
@@ -20,7 +20,7 @@ from torch.nn.modules import Module
 from torch.autograd import Variable
 from megatron import mpu
-from megatron.module import MegatronModule
+from .module import MegatronModule
 class DistributedDataParallel(MegatronModule):
...
File mode changed from 100755 to 100644
@@ -19,7 +19,7 @@ import torch
 from megatron import get_args
 from megatron import mpu
-from megatron.module import PipelinedMegatronModule
+from .module import MegatronModule
 from .language_model import parallel_lm_logits
 from .language_model import get_language_model
@@ -61,7 +61,7 @@ def post_language_model_processing(lm_output, labels, logit_weights,
     return loss
-class GPT2ModelBase(PipelinedMegatronModule):
+class GPT2ModelBase(MegatronModule):
     """GPT-2 Language model."""
     def __init__(self, num_tokentypes=0, parallel_output=True):
...
@@ -20,7 +20,7 @@ import torch.nn.functional as F
 from megatron import get_args
 from megatron import mpu
-from megatron.module import MegatronModule
+from .module import MegatronModule
 from megatron.model.transformer import ParallelTransformer
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal, scaled_init_method_normal
...
@@ -16,16 +16,25 @@
 """Megatron Module"""
 import torch
+from torch.autograd import Variable
+from torch.nn.parameter import Parameter
 from megatron import get_args
 from megatron import mpu
+_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
+_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
 class MegatronModule(torch.nn.Module):
-    """Megatron specific extensions of torch Module."""
+    """Megatron specific extensions of torch Module with support
+    for pipelining."""
-    def __init__(self):
+    def __init__(self, share_word_embeddings=True):
         super(MegatronModule, self).__init__()
+        self.share_word_embeddings = share_word_embeddings
     def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                        keep_vars=False):
@@ -34,53 +43,115 @@ class MegatronModule(torch.nn.Module):
         return self.state_dict(destination, prefix, keep_vars)
-class PipelinedMegatronModule(MegatronModule):
-    """Pipelining specific extensions of MegatronModule."""
-    def __init__(self, share_word_embeddings=True):
-        super(PipelinedMegatronModule, self).__init__()
-        args = get_args()
-        self.share_word_embeddings = share_word_embeddings
     def word_embeddings_weight(self):
         if mpu.is_pipeline_first_stage():
             return self.language_model.embedding.word_embeddings.weight
         if mpu.is_pipeline_last_stage():
             if not self.share_word_embeddings:
-                raise Exception('word_embeddings_weight() called for last stage, '
-                                'but share_word_embeddings is false')
+                raise Exception('word_embeddings_weight() called for last '
+                                'stage, but share_word_embeddings is false')
             return self.word_embeddings.weight
         raise Exception('word_embeddings_weight() should be '
                         'called for first and last stage only')
     def initialize_word_embeddings(self, init_method_normal):
         args = get_args()
         if not self.share_word_embeddings:
             raise Exception('initialize_word_embeddings() was called but '
                             'share_word_embeddings is false')
-        # Parameters are shared between the word embeddings layer, and the heads at
-        # the end of the model. In a pipelined setup with more than one stage, the
-        # initial embedding layer and the head are on different workers, so we do
-        # the following:
-        # 1. Create a second copy of word_embeddings on the last stage, with initial
-        # parameters of 0.0.
-        # 2. Do an all-reduce between the first and last stage to ensure that the
-        # two copies of word_embeddings start off with the same parameter values.
-        # 3. In the training loop, before an all-reduce between the grads of the two
-        # word_embeddings layers to ensure that every applied weight update is the
-        # same on both stages.
+        # Parameters are shared between the word embeddings layer, and the
+        # heads at the end of the model. In a pipelined setup with more than
+        # one stage, the initial embedding layer and the head are on different
+        # workers, so we do the following:
+        # 1. Create a second copy of word_embeddings on the last stage, with
+        # initial parameters of 0.0.
+        # 2. Do an all-reduce between the first and last stage to ensure that
+        # the two copies of word_embeddings start off with the same
+        # parameter values.
+        # 3. In the training loop, before an all-reduce between the grads of
+        # the two word_embeddings layers to ensure that every applied weight
+        # update is the same on both stages.
         if mpu.is_pipeline_last_stage():
             if not mpu.is_pipeline_first_stage():
                 self._word_embeddings_for_head_key = 'word_embeddings_for_head'
-                # If first and last stages are different, set word_embeddings
-                # weights to 0 here, then copy first stage's weights using all_reduce
-                # below.
+                # If first and last stages are different, set word_embeddings
+                # weights to 0 here, then copy first stage's weights using
+                # all_reduce below.
                 self.word_embeddings = mpu.VocabParallelEmbedding(
                     args.padded_vocab_size, args.hidden_size,
                     init_method=init_method_normal(args.init_method_std))
                 self.word_embeddings.weight.data.fill_(0)
                 self.word_embeddings.weight.shared = True
-        # Ensure that first and last stages have the same initial parameter values.
+        # Ensure that first and last stages have the same initial parameter
+        # values.
         if mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage():
             torch.distributed.all_reduce(self.word_embeddings_weight().data,
                                          group=mpu.get_embedding_group())
+def conversion_helper(val, conversion):
+    """Apply conversion to val. Recursively apply conversion if `val`
+    is a nested tuple/list structure."""
+    if not isinstance(val, (tuple, list)):
+        return conversion(val)
+    rtn = [conversion_helper(v, conversion) for v in val]
+    if isinstance(val, tuple):
+        rtn = tuple(rtn)
+    return rtn
+def fp32_to_fp16(val):
+    """Convert fp32 `val` to fp16"""
+    def half_conversion(val):
+        val_typecheck = val
+        if isinstance(val_typecheck, (Parameter, Variable)):
+            val_typecheck = val.data
+        if isinstance(val_typecheck, _FLOAT_TYPES):
+            val = val.half()
+        return val
+    return conversion_helper(val, half_conversion)
+def fp16_to_fp32(val):
+    """Convert fp16 `val` to fp32"""
+    def float_conversion(val):
+        val_typecheck = val
+        if isinstance(val_typecheck, (Parameter, Variable)):
+            val_typecheck = val.data
+        if isinstance(val_typecheck, _HALF_TYPES):
+            val = val.float()
+        return val
+    return conversion_helper(val, float_conversion)
+class FP16Module(MegatronModule):
+    def __init__(self, module):
+        super(FP16Module, self).__init__()
+        self.add_module('module', module.half())
+    def forward(self, *inputs, **kwargs):
+        if mpu.is_pipeline_first_stage():
+            inputs = fp32_to_fp16(inputs)
+        outputs = self.module(*inputs, **kwargs)
+        if mpu.is_pipeline_last_stage():
+            outputs = fp16_to_fp32(outputs)
+        return outputs
+    def state_dict(self, destination=None, prefix='', keep_vars=False):
+        return self.module.state_dict(destination, prefix, keep_vars)
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        return self.module.state_dict_for_save_checkpoint(destination, prefix,
+                                                          keep_vars)
+    def load_state_dict(self, state_dict, strict=True):
+        self.module.load_state_dict(state_dict, strict=strict)
@@ -24,10 +24,10 @@ from megatron.model.language_model import get_language_model
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
 from megatron.model.utils import scaled_init_method_normal
-from megatron.module import PipelinedMegatronModule
+from .module import MegatronModule
-class MultipleChoiceBase(PipelinedMegatronModule):
+class MultipleChoiceBase(MegatronModule):
     def __init__(self, num_tokentypes=2):
         super(MultipleChoiceBase, self).__init__(share_word_embeddings=False)
...
@@ -4,7 +4,7 @@ import torch
 from megatron import get_args, print_rank_0
 from megatron.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name
 from megatron.model import BertModel
-from megatron.module import MegatronModule
+from .module import MegatronModule
 from megatron import mpu
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
...
@@ -21,7 +21,7 @@ import torch.nn.functional as F
 from megatron import get_args
 from megatron import mpu
-from megatron.module import MegatronModule
+from .module import MegatronModule
 from megatron.checkpointing import get_checkpoint_version
 from megatron.model import import_layernorm
 from megatron.model.fused_softmax import FusedScaleMaskSoftmax
...
@@ -37,7 +37,7 @@ from megatron import print_rank_0
 from megatron import print_rank_last
 from megatron.checkpointing import load_checkpoint
 from megatron.checkpointing import save_checkpoint
-from megatron.fp16 import FP16_Module
+from megatron.model import FP16Module
 from megatron.optimizer import get_megatron_optimizer
 from megatron.initialize import initialize_megatron
@@ -199,7 +199,7 @@ def get_model(model_provider_func):
     # Fp16 conversion.
     if args.fp16:
-        model = FP16_Module(model)
+        model = FP16Module(model)
     if args.DDP_impl == 'torch':
         i = torch.cuda.current_device()
@@ -264,7 +264,7 @@ def setup_model_and_optimizer(model_provider_func):
     model = get_model(model_provider_func)
     unwrapped_model = model
-    while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16_Module)):
+    while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16Module)):
         unwrapped_model = unwrapped_model.module
     optimizer = get_megatron_optimizer(unwrapped_model)
@@ -588,7 +588,7 @@ def train_step(forward_step_func, data_iterator,
     if (mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()) and \
             mpu.get_pipeline_model_parallel_world_size() > 1:
         unwrapped_model = model
-        while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16_Module)):
+        while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16Module)):
            unwrapped_model = unwrapped_model.module
         if unwrapped_model.share_word_embeddings:
...
@@ -24,7 +24,6 @@ from megatron import print_rank_0
 from megatron import get_adlr_autoresume
 from megatron import mpu
 from megatron.checkpointing import save_checkpoint
-from megatron.fp16 import FP16_Optimizer
 def average_losses_across_data_parallel_group(losses):
@@ -47,11 +46,13 @@ def report_memory(name):
                               torch.cuda.memory_allocated() / mega_bytes)
     string += ' | max allocated: {}'.format(
         torch.cuda.max_memory_allocated() / mega_bytes)
-    string += ' | reserved: {}'.format(torch.cuda.memory_reserved() / mega_bytes)
+    string += ' | reserved: {}'.format(
+        torch.cuda.memory_reserved() / mega_bytes)
     string += ' | max reserved: {}'.format(
         torch.cuda.max_memory_reserved() / mega_bytes)
     if mpu.get_data_parallel_rank() == 0:
-        print("[Rank {}] {}".format(torch.distributed.get_rank(), string), flush=True)
+        print("[Rank {}] {}".format(torch.distributed.get_rank(), string),
+              flush=True)
 def print_params_min_max_norm(optimizer, iteration):
@@ -59,9 +60,7 @@ def print_params_min_max_norm(optimizer, iteration):
     index = 0
     rank = torch.distributed.get_rank()
     string = 'iteration, rank, index, tensor-model-parallel, min, max, norm\n'
-    optimizer_ = optimizer
-    if isinstance(optimizer, FP16_Optimizer):
-        optimizer_ = optimizer.optimizer
+    optimizer_ = optimizer.optimizer
     for param_group in optimizer_.param_groups:
         for param in param_group['params']:
             index += 1
...