Commit b84d7a90 authored by mohammad

moved module to model and removed fp16

parent 97ba5c0e
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fp16util import (
BN_convert_float,
network_to_half,
prep_param_lists,
model_grads_to_master_grads,
master_params_to_model_params,
tofp16,
to_python_float,
clip_grad_norm,
convert_module,
convert_network,
FP16Model,
)
from .fp16 import *
from .loss_scaler import *
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
from megatron import mpu
class tofp16(nn.Module):
"""
Utility module that implements::
def forward(self, input):
return input.half()
"""
def __init__(self):
super(tofp16, self).__init__()
def forward(self, input):
return input.half()
def BN_convert_float(module):
"""
Utility function for network_to_half().
Retained for legacy purposes.
"""
if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
module.float()
for child in module.children():
BN_convert_float(child)
return module
def network_to_half(network):
"""
Convert model to half precision in a batchnorm-safe way.
Retained for legacy purposes. It is recommended to use FP16Model.
"""
return nn.Sequential(tofp16(), BN_convert_float(network.half()))
def convert_module(module, dtype):
"""
Converts a module's immediate parameters and buffers to dtype.
"""
for param in module.parameters(recurse=False):
if param is not None:
if param.data.dtype.is_floating_point:
param.data = param.data.to(dtype=dtype)
if param._grad is not None and param._grad.data.dtype.is_floating_point:
param._grad.data = param._grad.data.to(dtype=dtype)
for buf in module.buffers(recurse=False):
if buf is not None and buf.data.dtype.is_floating_point:
buf.data = buf.data.to(dtype=dtype)
def convert_network(network, dtype):
"""
Converts a network's parameters and buffers to dtype.
"""
for module in network.modules():
if isinstance(module, torch.nn.modules.batchnorm._BatchNorm) and module.affine is True:
continue
convert_module(module, dtype)
return network
class FP16Model(nn.Module):
"""
Convert model to half precision in a batchnorm-safe way.
"""
def __init__(self, network):
super(FP16Model, self).__init__()
self.network = convert_network(network, dtype=torch.half)
def forward(self, *inputs):
inputs = tuple(t.half() for t in inputs)
return self.network(*inputs)
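# --- Illustrative usage (editor's sketch, not part of the original file),
# assuming a CUDA device is available:
#
#   import torch
#   import torch.nn as nn
#
#   net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU()).cuda()
#   fp16_net = FP16Model(net)
#   out = fp16_net(torch.randn(4, 3, 32, 32).cuda())
#   out.dtype             # torch.float16 -- inputs and compute are cast to half
#   net[1].weight.dtype   # torch.float32 -- affine batchnorm is kept in fp32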
def backwards_debug_hook(grad):
raise RuntimeError("master_params received a gradient in the backward pass!")
def prep_param_lists(model, flat_master=False):
"""
Creates a list of FP32 master parameters for a given model, as in
`Training Neural Networks with Mixed Precision: Real Examples`_.
Args:
model (torch.nn.Module): Existing Pytorch model
flat_master (bool, optional, default=False): Flatten the master parameters into a single tensor, as a performance optimization.
Returns:
A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`. ``master_params`` is a list of FP32 master parameters. If ``flat_master=True``, ``master_params`` will be a list with one element.
Example::
model_params, master_params = prep_param_lists(model)
.. warning::
Currently, if ``flat_master=True``, all the model's parameters must be the same type. If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`.
.. _`Training Neural Networks with Mixed Precision: Real Examples`:
http://on-demand.gputechconf.com/gtc/2018/video/S81012/
"""
model_params = [param for param in model.parameters() if param.requires_grad]
if flat_master:
# Give the user some more useful error messages
try:
# flatten_dense_tensors returns a contiguous flat array.
# http://pytorch.org/docs/master/_modules/torch/_utils.html
master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
except BaseException:
print("Error in prep_param_lists: model may contain a mixture of parameters "
"of different types. Use flat_master=False, or use F16_Optimizer.")
raise
master_params = torch.nn.Parameter(master_params)
master_params.requires_grad = True
# master_params.register_hook(backwards_debug_hook)
if master_params.grad is None:
master_params.grad = master_params.new(*master_params.size())
return model_params, [master_params]
else:
master_params = [param.clone().float().detach() for param in model_params]
for param in master_params:
param.requires_grad = True
return model_params, master_params
def model_grads_to_master_grads(model_params, master_params, flat_master=False):
"""
Copy model gradients to master gradients.
Args:
model_params: List of model parameters created by :func:`prep_param_lists`.
master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`.
"""
if flat_master:
# The flattening may incur one more deep copy than is necessary.
master_params[0].grad.data.copy_(
_flatten_dense_tensors([p.grad.data for p in model_params]))
else:
for model, master in zip(model_params, master_params):
if model.grad is not None:
if master.grad is None:
master.grad = Variable(master.data.new(*master.data.size()))
else:
master.grad = None
model_grads = [p.grad for p in model_params if p.grad is not None]
master_grads = [p.grad for p in master_params if p.grad is not None]
_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(amp_C.multi_tensor_scale,
_overflow_buf,
[model_grads, master_grads],
1.0)
def master_params_to_model_params(model_params, master_params, flat_master=False):
"""
Copy master parameters to model parameters.
Args:
model_params: List of model parameters created by :func:`prep_param_lists`.
master_params: List of FP32 master parameters created by :func:`prep_param_lists`. If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`master_params_to_model_params`.
"""
if flat_master:
for model, master in zip(model_params,
_unflatten_dense_tensors(master_params[0].data, model_params)):
model.data.copy_(master)
else:
for model, master in zip(model_params, master_params):
model.data.copy_(master.data)
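# --- Illustrative usage (editor's sketch, not part of the original file): the
# typical fp16 "master weights" step built from the three helpers above.
# `compute_loss`, `batch`, and `loss_scale` are hypothetical placeholders.
#
#   model = network_to_half(model)
#   model_params, master_params = prep_param_lists(model)
#   optimizer = torch.optim.SGD(master_params, lr=1e-3)
#
#   model.zero_grad()
#   loss = compute_loss(model, batch)
#   (loss * loss_scale).backward()                     # fp16 grads on model_params
#   model_grads_to_master_grads(model_params, master_params)
#   for p in master_params:
#       p.grad.data.mul_(1.0 / loss_scale)             # unscale in fp32
#   optimizer.step()                                   # update fp32 master weights
#   master_params_to_model_params(model_params, master_params)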
# Backward compatibility fixes
def to_python_float(t):
if hasattr(t, 'item'):
return t.item()
else:
return t[0]
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
clip_grad_norm = None #mpu.clip_grad_norm
# elif TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
# clip_grad_norm = torch.nn.utils.clip_grad_norm
# else:
# clip_grad_norm = torch.nn.utils.clip_grad_norm_
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from apex.multi_tensor_apply import multi_tensor_applier
import amp_C
from megatron import mpu
# item() is a recent addition, so this helps with backward compatibility.
def to_python_float(t):
if hasattr(t, 'item'):
return t.item()
else:
return t[0]
class LossScaler:
"""
Class that manages a static loss scale. This class is intended to interact with
:class:`FP16_Optimizer`, and should not be directly manipulated by the user.
Use of :class:`LossScaler` is enabled via the ``static_loss_scale`` argument to
:class:`FP16_Optimizer`'s constructor.
Args:
scale (float, optional, default=1.0): The loss scale.
"""
def __init__(self, scale=1):
self.cur_scale = scale
# `params` is a list / generator of torch.Variable
def has_overflow(self, params):
return False
# `x` is a torch.Tensor
def _has_inf_or_nan(x):
return False
def update_scale(self, overflow):
pass
@property
def loss_scale(self):
return self.cur_scale
def scale_gradient(self, module, grad_in, grad_out):
_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(amp_C.multi_tensor_scale,
_overflow_buf,
[grad_in, grad_in],
self.loss_scale)
return grad_in
def backward(self, output_tensor, retain_graph=False, output_tensor_grad=None):
# If output_tensor_grad is None, this is the last stage, and
# output_tensor is actually the loss and needs to be scaled.
# Otherwise, output_tensor does not need to be scaled again since
# output_tensor_grad is already scaled.
if output_tensor_grad is None:
scaled_output_tensor = output_tensor * self.loss_scale
else:
scaled_output_tensor = output_tensor
torch.autograd.backward(scaled_output_tensor, grad_tensors=output_tensor_grad,
retain_graph=retain_graph)
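# --- Illustrative note (editor's sketch, not part of the original file): with a
# static scale only the last pipeline stage multiplies the loss; earlier stages
# receive an already-scaled output_tensor_grad and pass it through unchanged.
#
#   scaler = LossScaler(scale=1024.0)
#   scaler.backward(loss)                                    # last stage: scale the loss
#   scaler.backward(output_tensor,
#                   output_tensor_grad=output_tensor_grad)   # earlier stages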
class DynamicLossScaler:
"""
Class that manages dynamic loss scaling. It is recommended to use :class:`DynamicLossScaler`
indirectly, by supplying ``dynamic_loss_scale=True`` to the constructor of
:class:`FP16_Optimizer`. However, it's important to understand how :class:`DynamicLossScaler`
operates, because the default options can be changed using the
``dynamic_loss_args`` argument to :class:`FP16_Optimizer`'s constructor.
Loss scaling is designed to combat the problem of underflowing gradients encountered late
in fp16 training. Dynamic loss scaling begins by attempting a very high loss
scale. Ironically, this may result in OVERflowing gradients. If overflowing gradients are
encountered, :class:`DynamicLossScaler` informs :class:`FP16_Optimizer` that an overflow has
occurred.
:class:`FP16_Optimizer` then skips the update step for this particular iteration/minibatch,
and :class:`DynamicLossScaler` adjusts the loss scale to a lower value.
If a certain number of iterations occur without overflowing gradients detected,
:class:`DynamicLossScaler` increases the loss scale once more.
In this way :class:`DynamicLossScaler` attempts to "ride the edge" of
always using the highest loss scale possible without incurring overflow.
Args:
init_scale (float, optional, default=2**32): Initial loss scale attempted by :class:`DynamicLossScaler.`
scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``.
scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale.
"""
def __init__(self,
init_scale=2**32,
scale_factor=2.,
scale_window=1000,
min_scale=1,
delayed_shift=1,
consecutive_hysteresis=False):
self.cur_scale = init_scale
self.cur_iter = 0
self.last_overflow_iter = -1
self.scale_factor = scale_factor
self.scale_window = scale_window
self.min_scale = min_scale
self.delayed_shift = delayed_shift
self.cur_hysteresis = delayed_shift
self.consecutive_hysteresis = consecutive_hysteresis
# `params` is a list / generator of torch.Variable
def has_overflow_serial(self, params):
for p in params:
if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
return True
return False
def has_overflow(self, params):
overflow = self.has_overflow_serial(params)
# Since each model parallel GPU carries only part of the model,
# make sure overflow flag is synced across all the model parallel GPUs
overflow_gpu = torch.cuda.ByteTensor([overflow])
torch.distributed.all_reduce(overflow_gpu,
op=torch.distributed.ReduceOp.MAX,
group=mpu.get_model_parallel_group())
overflow = overflow_gpu[0].item()
return bool(overflow)
# `x` is a torch.Tensor
def _has_inf_or_nan(x):
try:
# if x is half, the .float() incurs an additional deep copy, but it's necessary if
# Pytorch's .sum() creates a one-element tensor of the same type as x
# (which is true for some recent version of pytorch).
cpu_sum = float(x.float().sum())
# More efficient version that can be used if .sum() returns a Python scalar
# cpu_sum = float(x.sum())
except RuntimeError as instance:
# We want to check if instance is actually an overflow exception.
# RuntimeError could come from a different error.
# If so, we still want the exception to propagate.
if "value cannot be converted" not in instance.args[0]:
raise
return True
else:
if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
return True
return False
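# --- Illustrative checks (editor's sketch): the sum-based test above flags any
# tensor containing inf or NaN, mirroring how has_overflow_serial() uses it.
#
#   DynamicLossScaler._has_inf_or_nan(torch.tensor([1.0, 2.0]))           # False
#   DynamicLossScaler._has_inf_or_nan(torch.tensor([1.0, float('inf')]))  # True
#   DynamicLossScaler._has_inf_or_nan(torch.tensor([float('nan')]))       # True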
# `overflow` is boolean indicating whether the gradient overflowed
def update_scale(self, overflow):
if not hasattr(self, 'min_scale'):
self.min_scale = 1
if not hasattr(self, 'delayed_shift'):
self.delayed_shift = 1
if not hasattr(self, 'cur_hysteresis'):
self.cur_hysteresis = 1
if not hasattr(self, 'consecutive_hysteresis'):
self.consecutive_hysteresis = True
if overflow:
# self.cur_scale /= self.scale_factor
if self.delayed_shift == 1 or self.cur_hysteresis == 1:
self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale)
else:
self.cur_hysteresis -= 1
self.last_overflow_iter = self.cur_iter
else:
if self.consecutive_hysteresis:
self.cur_hysteresis = self.delayed_shift
if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
if not self.consecutive_hysteresis:
self.cur_hysteresis = self.delayed_shift
self.cur_scale *= self.scale_factor
self.cur_iter += 1
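# --- Illustrative dynamics (editor's sketch), assuming the defaults
# init_scale=2**32, scale_factor=2.0, scale_window=1000, delayed_shift=1:
#
#   scaler = DynamicLossScaler()
#   scaler.update_scale(True)     # overflow: scale halves, 2**32 -> 2**31
#   for _ in range(1000):         # 1000 clean iterations after the overflow
#       scaler.update_scale(False)
#   scaler.loss_scale             # doubled back to 2**32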
@property
def loss_scale(self):
return self.cur_scale
def scale_gradient(self, module, grad_in, grad_out):
_overflow_buf = torch.cuda.IntTensor([0])
multi_tensor_applier(amp_C.multi_tensor_scale,
_overflow_buf,
[grad_in, grad_in],
self.loss_scale)
return grad_in
def backward(self, output_tensor, retain_graph=False, output_tensor_grad=None):
# If output_tensor_grad is None, this is the last stage, and
# output_tensor is actually the loss and needs to be scaled.
# Otherwise, output_tensor does not need to be scaled again since
# output_tensor_grad is already scaled.
if output_tensor_grad is None:
scaled_output_tensor = output_tensor * self.loss_scale
else:
scaled_output_tensor = output_tensor
torch.autograd.backward(scaled_output_tensor, grad_tensors=output_tensor_grad,
retain_graph=retain_graph)
##############################################################
# Example usage below here -- assuming it's in a separate file
##############################################################
"""
TODO: separate out into an example.
if __name__ == "__main__":
import torch
from torch.autograd import Variable
from dynamic_loss_scaler import DynamicLossScaler
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in), requires_grad=False)
y = Variable(torch.randn(N, D_out), requires_grad=False)
w1 = Variable(torch.randn(D_in, H), requires_grad=True)
w2 = Variable(torch.randn(H, D_out), requires_grad=True)
parameters = [w1, w2]
learning_rate = 1e-6
optimizer = torch.optim.SGD(parameters, lr=learning_rate)
loss_scaler = DynamicLossScaler()
for t in range(500):
y_pred = x.mm(w1).clamp(min=0).mm(w2)
loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
print('Iter {} scaled loss: {}'.format(t, loss.item()))
print('Iter {} unscaled loss: {}'.format(t, loss.item() / loss_scaler.loss_scale))
# Run backprop
optimizer.zero_grad()
loss.backward()
# Check for overflow
has_overflow = loss_scaler.has_overflow(parameters)
# If no overflow, unscale grad and update as usual
if not has_overflow:
for param in parameters:
param.grad.data.mul_(1. / loss_scaler.loss_scale)
optimizer.step()
# Otherwise, don't do anything -- ie, skip iteration
else:
print('OVERFLOW!')
# Update loss scale for next iteration
loss_scaler.update_scale(has_overflow)
"""
@@ -30,9 +30,16 @@ def import_layernorm(fp32_residual_connection):
 from .distributed import *
-from .bert_model import BertModel, BertModelFirstStage, BertModelIntermediateStage, BertModelLastStage
-from .realm_model import ICTBertModel
-from .gpt2_model import GPT2Model, GPT2ModelFirstStage, GPT2ModelIntermediateStage, GPT2ModelLastStage
+from .bert_model import (BertModel,
+                         BertModelFirstStage,
+                         BertModelIntermediateStage,
+                         BertModelLastStage)
+from .gpt2_model import (GPT2Model,
+                         GPT2ModelFirstStage,
+                         GPT2ModelIntermediateStage,
+                         GPT2ModelLastStage)
 from .language_model import get_language_model
+from .module import FP16Module
+from .realm_model import ICTBertModel
@@ -26,7 +26,7 @@ from megatron.model.utils import openai_gelu, erf_gelu
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
 from megatron.model.utils import scaled_init_method_normal
-from megatron.module import MegatronModule, PipelinedMegatronModule
+from .module import MegatronModule
 def bert_attention_mask_func(attention_scores, attention_mask):
     attention_scores.masked_fill_(attention_mask, -10000.0)
@@ -127,7 +127,7 @@ def post_language_model_processing(lm_output, pooled_output,
     return lm_loss, binary_logits
-class BertModelBase(PipelinedMegatronModule):
+class BertModelBase(MegatronModule):
     """Bert Language model."""
     def __init__(self, num_tokentypes=2, add_binary_head=True,
...
@@ -24,10 +24,10 @@ from megatron.model.language_model import get_language_model
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
 from megatron.model.utils import scaled_init_method_normal
-from megatron.module import PipelinedMegatronModule
+from .module import MegatronModule
-class ClassificationBase(PipelinedMegatronModule):
+class ClassificationBase(MegatronModule):
     def __init__(self, num_classes, num_tokentypes=2):
         super(ClassificationBase, self).__init__(share_word_embeddings=False)
...
@@ -20,7 +20,7 @@ from torch.nn.modules import Module
 from torch.autograd import Variable
 from megatron import mpu
-from megatron.module import MegatronModule
+from .module import MegatronModule
 class DistributedDataParallel(MegatronModule):
...
File mode changed from 100755 to 100644
@@ -19,7 +19,7 @@ import torch
 from megatron import get_args
 from megatron import mpu
-from megatron.module import PipelinedMegatronModule
+from .module import MegatronModule
 from .language_model import parallel_lm_logits
 from .language_model import get_language_model
@@ -61,7 +61,7 @@ def post_language_model_processing(lm_output, labels, logit_weights,
     return loss
-class GPT2ModelBase(PipelinedMegatronModule):
+class GPT2ModelBase(MegatronModule):
     """GPT-2 Language model."""
     def __init__(self, num_tokentypes=0, parallel_output=True):
...
@@ -20,7 +20,7 @@ import torch.nn.functional as F
 from megatron import get_args
 from megatron import mpu
-from megatron.module import MegatronModule
+from .module import MegatronModule
 from megatron.model.transformer import ParallelTransformer
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal, scaled_init_method_normal
...
@@ -16,16 +16,25 @@
 """Megatron Module"""
 import torch
+from torch.autograd import Variable
+from torch.nn.parameter import Parameter
 from megatron import get_args
 from megatron import mpu
+_FLOAT_TYPES = (torch.FloatTensor, torch.cuda.FloatTensor)
+_HALF_TYPES = (torch.HalfTensor, torch.cuda.HalfTensor)
 class MegatronModule(torch.nn.Module):
-    """Megatron specific extensions of torch Module."""
+    """Megatron specific extensions of torch Module with support
+    for pipelining."""
-    def __init__(self):
+    def __init__(self, share_word_embeddings=True):
         super(MegatronModule, self).__init__()
+        self.share_word_embeddings = share_word_embeddings
     def state_dict_for_save_checkpoint(self, destination=None, prefix='',
                                        keep_vars=False):
@@ -34,53 +43,115 @@ class MegatronModule(torch.nn.Module):
         return self.state_dict(destination, prefix, keep_vars)
-class PipelinedMegatronModule(MegatronModule):
-    """Pipelining specific extensions of MegatronModule."""
-    def __init__(self, share_word_embeddings=True):
-        super(PipelinedMegatronModule, self).__init__()
-        args = get_args()
-        self.share_word_embeddings = share_word_embeddings
     def word_embeddings_weight(self):
         if mpu.is_pipeline_first_stage():
             return self.language_model.embedding.word_embeddings.weight
         if mpu.is_pipeline_last_stage():
             if not self.share_word_embeddings:
-                raise Exception('word_embeddings_weight() called for last stage, '
-                                'but share_word_embeddings is false')
+                raise Exception('word_embeddings_weight() called for last '
+                                'stage, but share_word_embeddings is false')
             return self.word_embeddings.weight
         raise Exception('word_embeddings_weight() should be '
                         'called for first and last stage only')
     def initialize_word_embeddings(self, init_method_normal):
         args = get_args()
         if not self.share_word_embeddings:
             raise Exception('initialize_word_embeddings() was called but '
                             'share_word_embeddings is false')
-        # Parameters are shared between the word embeddings layer, and the heads at
-        # the end of the model. In a pipelined setup with more than one stage, the
-        # initial embedding layer and the head are on different workers, so we do
-        # the following:
-        # 1. Create a second copy of word_embeddings on the last stage, with initial
-        #    parameters of 0.0.
-        # 2. Do an all-reduce between the first and last stage to ensure that the
-        #    two copies of word_embeddings start off with the same parameter values.
-        # 3. In the training loop, before an all-reduce between the grads of the two
-        #    word_embeddings layers to ensure that every applied weight update is the
-        #    same on both stages.
+        # Parameters are shared between the word embeddings layer, and the
+        # heads at the end of the model. In a pipelined setup with more than
+        # one stage, the initial embedding layer and the head are on different
+        # workers, so we do the following:
+        # 1. Create a second copy of word_embeddings on the last stage, with
+        #    initial parameters of 0.0.
+        # 2. Do an all-reduce between the first and last stage to ensure that
+        #    the two copies of word_embeddings start off with the same
+        #    parameter values.
+        # 3. In the training loop, before an all-reduce between the grads of
+        #    the two word_embeddings layers to ensure that every applied weight
+        #    update is the same on both stages.
         if mpu.is_pipeline_last_stage():
             if not mpu.is_pipeline_first_stage():
                 self._word_embeddings_for_head_key = 'word_embeddings_for_head'
                 # If first and last stages are different, set word_embeddings
-                # weights to 0 here, then copy first stage's weights using all_reduce
-                # below.
+                # weights to 0 here, then copy first stage's weights using
+                # all_reduce below.
                 self.word_embeddings = mpu.VocabParallelEmbedding(
                     args.padded_vocab_size, args.hidden_size,
                     init_method=init_method_normal(args.init_method_std))
                 self.word_embeddings.weight.data.fill_(0)
                 self.word_embeddings.weight.shared = True
-        # Ensure that first and last stages have the same initial parameter values.
+        # Ensure that first and last stages have the same initial parameter
+        # values.
         if mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage():
             torch.distributed.all_reduce(self.word_embeddings_weight().data,
                                          group=mpu.get_embedding_group())
+def conversion_helper(val, conversion):
+    """Apply conversion to val. Recursively apply conversion if `val`
+    is a nested tuple/list structure."""
+    if not isinstance(val, (tuple, list)):
+        return conversion(val)
+    rtn = [conversion_helper(v, conversion) for v in val]
+    if isinstance(val, tuple):
+        rtn = tuple(rtn)
+    return rtn
+def fp32_to_fp16(val):
+    """Convert fp32 `val` to fp16"""
+    def half_conversion(val):
+        val_typecheck = val
+        if isinstance(val_typecheck, (Parameter, Variable)):
+            val_typecheck = val.data
+        if isinstance(val_typecheck, _FLOAT_TYPES):
+            val = val.half()
+        return val
+    return conversion_helper(val, half_conversion)
+def fp16_to_fp32(val):
+    """Convert fp16 `val` to fp32"""
+    def float_conversion(val):
+        val_typecheck = val
+        if isinstance(val_typecheck, (Parameter, Variable)):
+            val_typecheck = val.data
+        if isinstance(val_typecheck, _HALF_TYPES):
+            val = val.float()
+        return val
+    return conversion_helper(val, float_conversion)
+class FP16Module(MegatronModule):
+    def __init__(self, module):
+        super(FP16Module, self).__init__()
+        self.add_module('module', module.half())
+    def forward(self, *inputs, **kwargs):
+        if mpu.is_pipeline_first_stage():
+            inputs = fp32_to_fp16(inputs)
+        outputs = self.module(*inputs, **kwargs)
+        if mpu.is_pipeline_last_stage():
+            outputs = fp16_to_fp32(outputs)
+        return outputs
+    def state_dict(self, destination=None, prefix='', keep_vars=False):
+        return self.module.state_dict(destination, prefix, keep_vars)
+    def state_dict_for_save_checkpoint(self, destination=None, prefix='',
+                                       keep_vars=False):
+        return self.module.state_dict_for_save_checkpoint(destination, prefix,
+                                                          keep_vars)
+    def load_state_dict(self, state_dict, strict=True):
+        self.module.load_state_dict(state_dict, strict=strict)
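# --- Illustrative usage (editor's sketch, not part of this diff): FP16Module is
# what training.py wraps the model with when args.fp16 is set (see the
# get_model() hunk further below). Parameters are halved once at construction;
# conversion_helper() casts (possibly nested) inputs fp32 -> fp16 on the first
# pipeline stage and outputs fp16 -> fp32 on the last, so callers keep passing
# and receiving fp32 tensors. Argument names here are placeholders.
#
#   model = model_provider_func()      # built in fp32
#   if args.fp16:
#       model = FP16Module(model)
#   output = model(*fp32_inputs)       # fp32 in, fp32 out; fp16 compute inside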
@@ -24,10 +24,10 @@ from megatron.model.language_model import get_language_model
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
 from megatron.model.utils import scaled_init_method_normal
-from megatron.module import PipelinedMegatronModule
+from .module import MegatronModule
-class MultipleChoiceBase(PipelinedMegatronModule):
+class MultipleChoiceBase(MegatronModule):
     def __init__(self, num_tokentypes=2):
         super(MultipleChoiceBase, self).__init__(share_word_embeddings=False)
...
@@ -4,7 +4,7 @@ import torch
 from megatron import get_args, print_rank_0
 from megatron.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name
 from megatron.model import BertModel
-from megatron.module import MegatronModule
+from .module import MegatronModule
 from megatron import mpu
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
...
@@ -21,7 +21,7 @@ import torch.nn.functional as F
 from megatron import get_args
 from megatron import mpu
-from megatron.module import MegatronModule
+from .module import MegatronModule
 from megatron.checkpointing import get_checkpoint_version
 from megatron.model import import_layernorm
 from megatron.model.fused_softmax import FusedScaleMaskSoftmax
...
@@ -37,7 +37,7 @@ from megatron import print_rank_0
 from megatron import print_rank_last
 from megatron.checkpointing import load_checkpoint
 from megatron.checkpointing import save_checkpoint
-from megatron.fp16 import FP16_Module
+from megatron.model import FP16Module
 from megatron.optimizer import get_megatron_optimizer
 from megatron.initialize import initialize_megatron
@@ -199,7 +199,7 @@ def get_model(model_provider_func):
     # Fp16 conversion.
     if args.fp16:
-        model = FP16_Module(model)
+        model = FP16Module(model)
     if args.DDP_impl == 'torch':
         i = torch.cuda.current_device()
@@ -264,7 +264,7 @@ def setup_model_and_optimizer(model_provider_func):
     model = get_model(model_provider_func)
     unwrapped_model = model
-    while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16_Module)):
+    while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16Module)):
         unwrapped_model = unwrapped_model.module
     optimizer = get_megatron_optimizer(unwrapped_model)
@@ -588,7 +588,7 @@ def train_step(forward_step_func, data_iterator,
     if (mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()) and \
             mpu.get_pipeline_model_parallel_world_size() > 1:
         unwrapped_model = model
-        while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16_Module)):
+        while isinstance(unwrapped_model, (torchDDP, LocalDDP, FP16Module)):
             unwrapped_model = unwrapped_model.module
         if unwrapped_model.share_word_embeddings:
...
@@ -24,7 +24,6 @@ from megatron import print_rank_0
 from megatron import get_adlr_autoresume
 from megatron import mpu
 from megatron.checkpointing import save_checkpoint
-from megatron.fp16 import FP16_Optimizer
 def average_losses_across_data_parallel_group(losses):
@@ -47,11 +46,13 @@ def report_memory(name):
         torch.cuda.memory_allocated() / mega_bytes)
     string += ' | max allocated: {}'.format(
         torch.cuda.max_memory_allocated() / mega_bytes)
-    string += ' | reserved: {}'.format(torch.cuda.memory_reserved() / mega_bytes)
+    string += ' | reserved: {}'.format(
+        torch.cuda.memory_reserved() / mega_bytes)
     string += ' | max reserved: {}'.format(
         torch.cuda.max_memory_reserved() / mega_bytes)
     if mpu.get_data_parallel_rank() == 0:
-        print("[Rank {}] {}".format(torch.distributed.get_rank(), string), flush=True)
+        print("[Rank {}] {}".format(torch.distributed.get_rank(), string),
+              flush=True)
 def print_params_min_max_norm(optimizer, iteration):
@@ -59,9 +60,7 @@ def print_params_min_max_norm(optimizer, iteration):
     index = 0
     rank = torch.distributed.get_rank()
     string = 'iteration, rank, index, tensor-model-parallel, min, max, norm\n'
-    optimizer_ = optimizer
-    if isinstance(optimizer, FP16_Optimizer):
-        optimizer_ = optimizer.optimizer
+    optimizer_ = optimizer.optimizer
     for param_group in optimizer_.param_groups:
         for param in param_group['params']:
             index += 1
...