Merge branch 'fp32_residual_conn' into 'main'

Option to run residual connections in fp32 See merge request ADLR/megatron-lm!195

Merge branch 'fp32_residual_conn' into 'main'
Option to run residual connections in fp32 See merge request ADLR/megatron-lm!195
c84f0752 · Deepak Narayanan · 9b174da8 · 83671bbf · c84f0752 · c84f0752
Commit c84f0752 authored Dec 21, 2020 by Deepak Narayanan
14 changed files
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -183,6 +183,9 @@ def parse_args(extra_args_provider=None, defaults={},
    # Mixed precision checks.
    if args.fp16_lm_cross_entropy:
        assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
+    if args.fp32_residual_connection:
+        assert args.fp16, \
+            'residual connection in fp32 only supported when using fp16.'
    # Activation checkpointing.
    if args.distribute_checkpointed_activations:
        assert args.checkpoint_activations, \
@@ -197,6 +200,10 @@ def parse_args(extra_args_provider=None, defaults={},
    if args.scaled_masked_softmax_fusion:
        fused_kernels.load_scaled_masked_softmax_fusion_kernel()

+    # Load mixed precision fused layer norm.
+    if args.fp32_residual_connection:
+        fused_kernels.load_fused_mix_prec_layer_norm_kernel()
+
    _print_args(args)
    return args

@@ -435,6 +442,8 @@ def _add_mixed_precision_args(parser):

    group.add_argument('--fp16', action='store_true',
                       help='Run model in fp16 mode.')
+    group.add_argument('--fp32-residual-connection', action='store_true',
+                       help='Move residual connections to fp32.')
    group.add_argument('--apply-query-key-layer-scaling', action='store_true',
                       help='Scale Q * K^T by 1 / layer-number. If this flag '
                       'is set, then it will automatically set '

--- a/megatron/fused_kernels/__init__.py
+++ b/megatron/fused_kernels/__init__.py
@@ -98,3 +98,29 @@ def load_scaled_masked_softmax_fusion_kernel():
                           '--expt-relaxed-constexpr',
                           '--expt-extended-lambda',
                           '--use_fast_math'] + cc_flag)
+
+
+def load_fused_mix_prec_layer_norm_kernel():
+
+    # Check, if CUDA11 is installed for compute capability 8.0
+    cc_flag = []
+    _, bare_metal_major, _ = get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
+    if int(bare_metal_major) >= 11:
+        cc_flag.append('-gencode')
+        cc_flag.append('arch=compute_80,code=sm_80')
+
+    srcpath = pathlib.Path(__file__).parent.absolute()
+    buildpath = srcpath / 'build'
+
+    create_build_dir(buildpath)
+
+    fused_mix_prec_layer_norm_cuda = cpp_extension.load(
+        name='fused_mix_prec_layer_norm_cuda',
+        sources=[srcpath / 'layer_norm_cuda.cpp',
+                 srcpath / 'layer_norm_cuda_kernel.cu'],
+        build_directory=buildpath,
+        extra_cflags=['-O3'],
+        extra_cuda_cflags=['-O3',
+                           '-gencode', 'arch=compute_70,code=sm_70',
+                           '-maxrregcount=50',
+                           '--use_fast_math'] + cc_flag)
--- a/megatron/fused_kernels/compat.h
+++ b/megatron/fused_kernels/compat.h
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*This code is copied fron NVIDIA apex:
+ *     https://github.com/NVIDIA/apex
+ *     with minor changes. */
+
+
+
+#ifndef TORCH_CHECK
+#define TORCH_CHECK AT_CHECK
+#endif
+
+#ifdef VERSION_GE_1_3
+#define DATA_PTR data_ptr
+#else
+#define DATA_PTR data
+#endif
--- a/megatron/fused_kernels/layer_norm_cuda.cpp
+++ b/megatron/fused_kernels/layer_norm_cuda.cpp
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*This code is copied fron NVIDIA apex:
+ *     https://github.com/NVIDIA/apex
+ *     with minor changes. */
+
+#include <torch/extension.h>
+#include <vector>
+#include <cassert>
+#include "compat.h"
+
+namespace {
+void compute_n1_n2(
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    int& n1,
+    int& n2)
+{
+    int idiff = input.ndimension() - normalized_shape.size();
+    n2 = 1;
+    for (int i = 0;  i < (int)normalized_shape.size();  ++i) {
+	    assert( input.sizes()[i+idiff] == normalized_shape[i] );
+	    n2 *= normalized_shape[i];
+    }
+    n1 = 1;
+    for (int i = 0;  i < idiff;  ++i) {
+	    n1 *= input.sizes()[i];
+    }
+}
+
+void check_args(
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor gamma,
+    at::Tensor beta
+    )
+{
+    TORCH_CHECK(!gamma.defined() || gamma.sizes().equals(normalized_shape));
+    TORCH_CHECK(!beta.defined() || beta.sizes().equals(normalized_shape));
+}
+
+void check_args(
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    int& n1,
+    int& n2
+    )
+{
+    int64_t normalized_ndim = normalized_shape.size();
+
+    if (normalized_ndim < 1) {
+      std::stringstream ss;
+      ss << "Expected normalized_shape to be at least 1-dimensional, i.e., "
+         << "containing at least one element, but got normalized_shape="
+         << normalized_shape;
+      throw std::runtime_error(ss.str());
+    }
+
+    auto input_shape = input.sizes();
+    auto input_ndim = input.dim();
+
+    if (input_ndim < normalized_ndim ||
+        !input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) {
+      std::stringstream ss;
+      ss << "Given normalized_shape=" << normalized_shape
+         << ", expected input with shape [*";
+      for (auto size : normalized_shape) {
+        ss << ", " << size;
+      }
+      ss << "], but got input of size" << input_shape;
+      throw std::runtime_error(ss.str());
+    }
+
+    compute_n1_n2(input,normalized_shape,n1,n2);
+}
+
+
+void check_args(
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor gamma,
+    at::Tensor beta,
+    int& n1,
+    int& n2
+    )
+{
+    check_args(input,normalized_shape,n1,n2);
+    check_args(normalized_shape,gamma,beta);
+}
+}
+
+void cuda_layer_norm(
+    at::Tensor* output,
+    at::Tensor* mean,
+    at::Tensor* invvar,
+    at::Tensor* input,
+    int n1,
+    int n2,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor* gamma,
+    at::Tensor* beta,
+    double epsilon);
+
+#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+std::vector<at::Tensor> layer_norm(
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    double epsilon) {
+  CHECK_INPUT(input);
+  int n1,n2;
+  check_args(input,normalized_shape,n1,n2);
+  at::Tensor output = at::empty_like(input);
+  at::Tensor mean = at::empty({n1}, input.options().dtype(input.scalar_type()==at::ScalarType::Half ? at::ScalarType::Float : input.scalar_type()));
+  at::Tensor invvar = at::empty_like(mean);
+  cuda_layer_norm(&output,&mean,&invvar,&input,n1,n2,
+      normalized_shape,NULL,NULL,epsilon);
+  return {output, mean, invvar};
+}
+std::vector<at::Tensor> layer_norm_affine(
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor gamma,
+    at::Tensor beta,
+    double epsilon) {
+  CHECK_INPUT(input);
+  CHECK_INPUT(gamma);
+  CHECK_INPUT(beta);
+  int n1,n2;
+  check_args(input,normalized_shape,gamma,beta,n1,n2);
+  at::Tensor output = at::empty_like(input, input.options().dtype(at::ScalarType::Half));
+  at::Tensor mean = at::empty({n1}, input.options().dtype(input.scalar_type()==at::ScalarType::Half ? at::ScalarType::Float : input.scalar_type()));
+  at::Tensor invvar = at::empty_like(mean);
+  cuda_layer_norm(&output,&mean,&invvar,&input,n1,n2,
+      normalized_shape,&gamma,&beta,epsilon);
+  return {output, mean, invvar};
+}
+
+void cuda_layer_norm_gradient(
+    at::Tensor* dout,
+    at::Tensor* mean,
+    at::Tensor* invvar,
+    at::Tensor* input,
+    int n1,
+    int n2,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor* gamma,
+    at::Tensor* beta,
+    double epsilon,
+    at::Tensor* grad_input,
+    at::Tensor* grad_gamma,
+    at::Tensor* grad_beta
+    );
+
+at::Tensor layer_norm_gradient(
+    at::Tensor dout,
+    at::Tensor mean,
+    at::Tensor invvar,
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    double epsilon) {
+  CHECK_INPUT(dout);
+  CHECK_INPUT(mean);
+  CHECK_INPUT(invvar);
+  CHECK_INPUT(input);
+  int n1,n2;
+  check_args(input,normalized_shape,n1,n2);
+  at::Tensor grad_input = at::empty_like(input);
+  cuda_layer_norm_gradient(&dout,&mean,&invvar,&input,n1,n2,
+      normalized_shape,NULL,NULL,epsilon,
+      &grad_input,NULL,NULL);
+  return grad_input;
+}
+std::vector<at::Tensor> layer_norm_gradient_affine(
+    at::Tensor dout,
+    at::Tensor mean,
+    at::Tensor invvar,
+    at::Tensor input,
+    #ifdef VERSION_GE_1_1
+    at::IntArrayRef normalized_shape,
+    #else
+    at::IntList normalized_shape,
+    #endif
+    at::Tensor gamma,
+    at::Tensor beta,
+    double epsilon) {
+  CHECK_INPUT(dout);
+  CHECK_INPUT(mean);
+  CHECK_INPUT(invvar);
+  CHECK_INPUT(input);
+  CHECK_INPUT(gamma);
+  CHECK_INPUT(beta);
+  int n1,n2;
+  check_args(input,normalized_shape,gamma,beta,n1,n2);
+  at::Tensor grad_input = at::empty_like(input);
+  at::Tensor grad_gamma = at::empty_like(gamma);
+  at::Tensor grad_beta = at::empty_like(beta);
+  cuda_layer_norm_gradient(&dout,&mean,&invvar,&input,n1,n2,
+      normalized_shape,&gamma,&beta,epsilon,
+      &grad_input,&grad_gamma,&grad_beta);
+  return {grad_input, grad_gamma, grad_beta};
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward_affine", &layer_norm_affine, "LayerNorm forward (CUDA)");
+  m.def("forward", &layer_norm, "LayerNorm forward (CUDA)");
+  m.def("backward_affine", &layer_norm_gradient_affine, "LayerNorm backward (CUDA)");
+  m.def("backward", &layer_norm_gradient, "LayerNorm backward (CUDA)");
+}
--- a/megatron/fused_kernels/layer_norm_cuda_kernel.cu
+++ b/megatron/fused_kernels/layer_norm_cuda_kernel.cu
--- a/megatron/fused_kernels/type_shim.h
+++ b/megatron/fused_kernels/type_shim.h
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*This code is copied fron NVIDIA apex:
+ *     https://github.com/NVIDIA/apex
+ *     with minor changes. */
+
+#include <ATen/ATen.h>
+#include "compat.h"
+
+// Forward/backward compatiblity hack around
+// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
+// pending more future-proof guidance from upstream.
+// struct TypeShim
+// {
+//   const at::Type& payload;
+//   TypeShim(const at::Type& type) : payload(type) {}
+//   // Enable trivial conversion to a const at::Type& for pre-3aeb78
+//   operator const at::Type&(){ return payload; };
+//   // Enable dispatch switch statements to take *this directly for  post-3aeb78
+//   //operator at::ScalarType(){ return payload.; };
+// };
+
+#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
+  switch(TYPE) \
+  { \
+    case at::ScalarType::Float: \
+    { \
+      using scalar_t_##LEVEL = float; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    case at::ScalarType::Half: \
+    { \
+      using scalar_t_##LEVEL = at::Half; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    default: \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");  \
+  }
+
+
+#define DISPATCH_FLOAT_HALF_AND_BYTE(TYPE, LEVEL, NAME, ...) \
+  switch(TYPE) \
+  { \
+    case at::ScalarType::Float: \
+    { \
+      using scalar_t_##LEVEL = float; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    case at::ScalarType::Half: \
+    { \
+      using scalar_t_##LEVEL = at::Half; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    case at::ScalarType::Byte: \
+    { \
+      using scalar_t_##LEVEL = uint8_t; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    default: \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");  \
+  }
+
+
+#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \
+  switch(TYPE) \
+  { \
+    case at::ScalarType::Double: \
+    { \
+      using scalar_t_##LEVEL = double; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    case at::ScalarType::Float: \
+    { \
+      using scalar_t_##LEVEL = float; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    case at::ScalarType::Half: \
+    { \
+      using scalar_t_##LEVEL = at::Half; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    default: \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");  \
+  }
+
+
+  #define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...) \
+  switch(TYPE) \
+  { \
+    case at::ScalarType::Double: \
+    { \
+      using scalar_t_##LEVEL = double; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    case at::ScalarType::Float: \
+    { \
+      using scalar_t_##LEVEL = float; \
+      __VA_ARGS__; \
+      break; \
+    } \
+    default: \
+      AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'");  \
+  }
+
+
+template<typename T>
+__device__ __forceinline__ T reduce_block_into_lanes
+  (T *x,
+   T val,
+   int lanes=1,
+   bool share_result=false) // lanes is intended to be <= 32.
+{
+  int tid = threadIdx.x + threadIdx.y*blockDim.x;
+  int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32.
+
+  if(blockSize >= 64)
+  {
+    x[tid] = val;
+    __syncthreads();
+  }
+
+  #pragma unroll
+  for(int i = (blockSize >> 1); i >= 64; i >>= 1)
+  {
+    if(tid < i)
+      x[tid] = x[tid] + x[tid+i];
+    __syncthreads();
+  }
+
+  T final;
+
+  if(tid < 32)
+  {
+    if(blockSize >= 64)
+      final = x[tid] + x[tid+32];
+    else
+      final = val;
+    // __SYNCWARP();
+
+    #pragma unroll
+    for(int i = 16; i >= lanes; i >>= 1)
+      final = final + __shfl_down_sync(0xffffffff, final, i);
+  }
+
+  if(share_result)
+  {
+    if(tid < lanes)
+      x[tid] = final; // EpilogueOp
+    // Make sure the smem result is visible to all warps.
+    __syncthreads();
+  }
+
+  return final;
+}
+
+template<typename T>
+__device__ __forceinline__ T reduce_block_into_lanes_max_op
+  (T *x,
+   T val,
+   int lanes=1,
+   bool share_result=false) // lanes is intended to be <= 32.
+{
+  int tid = threadIdx.x + threadIdx.y*blockDim.x;
+  int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32.
+
+  if(blockSize >= 64)
+  {
+    x[tid] = val;
+    __syncthreads();
+  }
+
+  #pragma unroll
+  for(int i = (blockSize >> 1); i >= 64; i >>= 1)
+  {
+    if(tid < i)
+      x[tid] = fmaxf(fabsf(x[tid]), fabsf(x[tid+i]));
+    __syncthreads();
+  }
+
+  T final;
+
+  if(tid < 32)
+  {
+    if(blockSize >= 64)
+      final = fmaxf(fabsf(x[tid]), fabsf(x[tid+32]));
+    else
+      final = val;
+    // __SYNCWARP();
+
+    #pragma unroll
+    for(int i = 16; i >= lanes; i >>= 1)
+      final = fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
+  }
+
+  if(share_result)
+  {
+    if(tid < lanes)
+      x[tid] = final; // EpilogueOp
+    // Make sure the smem result is visible to all warps.
+    __syncthreads();
+  }
+
+  return final;
+}
--- a/megatron/model/__init__.py
+++ b/megatron/model/__init__.py
@@ -13,9 +13,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+_LAYER_NORM = None
+
+
+def import_layernorm(fp32_residual_connection):
+
+    global _LAYER_NORM
+    if not _LAYER_NORM:
+        if fp32_residual_connection:
+            from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm
+        else:
+            from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
+        _LAYER_NORM = LayerNorm
+            
+    return _LAYER_NORM
+
+
 from .distributed import *
 from .bert_model import BertModel, BertModelFirstStage, BertModelIntermediateStage, BertModelLastStage
 from .realm_model import ICTBertModel
 from .gpt2_model import GPT2Model, GPT2ModelFirstStage, GPT2ModelIntermediateStage, GPT2ModelLastStage
 from .utils import get_params_for_weight_decay_optimization
 from .language_model import get_language_model
+
+
--- a/megatron/model/bert_model.py
+++ b/megatron/model/bert_model.py
@@ -21,7 +21,7 @@ from megatron import get_args
 from megatron import mpu
 from megatron.model.language_model import parallel_lm_logits
 from megatron.model.language_model import get_language_model
-from megatron.model.transformer import LayerNorm
+from megatron.model import import_layernorm
 from megatron.model.utils import openai_gelu, erf_gelu
 from megatron.model.utils import get_linear_layer
 from megatron.model.utils import init_method_normal
@@ -83,6 +83,7 @@ class BertLMHead(MegatronModule):
        self.parallel_output = parallel_output

        self.dense = get_linear_layer(hidden_size, hidden_size, init_method)
+        LayerNorm = import_layernorm(args.fp32_residual_connection)
        self.layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
        self.gelu = torch.nn.functional.gelu
        if args.openai_gelu:

--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This code is copied fron NVIDIA apex:
+      https://github.com/NVIDIA/apex
+   with minor changes. """
+
+
+import math
+import torch
+import numbers
+from torch.nn.parameter import Parameter
+from torch.nn import init
+from torch.nn import functional as F
+import importlib
+
+global fused_layer_norm_cuda
+fused_layer_norm_cuda = None
+global fused_mix_prec_layer_norm_cuda
+fused_mix_prec_layer_norm_cuda = None
+
+class FusedLayerNormAffineFunction(torch.autograd.Function):
+
+  @staticmethod
+  def forward(ctx, input, weight, bias, normalized_shape, eps):
+    global fused_mix_prec_layer_norm_cuda
+    if fused_mix_prec_layer_norm_cuda is None:
+        fused_mix_prec_layer_norm_cuda = importlib.import_module("fused_mix_prec_layer_norm_cuda")
+    ctx.normalized_shape = normalized_shape
+    ctx.eps = eps
+    input_ = input.contiguous()
+    weight_ = weight.contiguous()
+    bias_ = bias.contiguous()
+    output, mean, invvar = fused_mix_prec_layer_norm_cuda.forward_affine(
+        input_, ctx.normalized_shape, weight_, bias_, ctx.eps)
+    ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
+    return output
+
+  @staticmethod
+  def backward(ctx, grad_output):
+    input_, weight_, bias_, mean, invvar = ctx.saved_tensors
+    grad_input = grad_weight = grad_bias = None
+    grad_input, grad_weight, grad_bias = fused_mix_prec_layer_norm_cuda.backward_affine(
+        grad_output.contiguous(), mean, invvar,
+        input_, ctx.normalized_shape,
+        weight_, bias_, ctx.eps)
+    return grad_input, grad_weight, grad_bias, None, None
+
+class FusedLayerNormFunction(torch.autograd.Function):
+
+  @staticmethod
+  def forward(ctx, input, normalized_shape, eps):
+    global fused_layer_norm_cuda
+    if fused_layer_norm_cuda is None:
+        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
+    ctx.normalized_shape = normalized_shape
+    ctx.eps = eps
+    input_ = input.contiguous()
+    output, mean, invvar = fused_layer_norm_cuda.forward(
+        input_, ctx.normalized_shape, ctx.eps)
+    ctx.save_for_backward(input_, mean, invvar)
+    return output
+
+  @staticmethod
+  def backward(ctx, grad_output):
+    input_, mean, invvar = ctx.saved_tensors
+    grad_input = None
+    grad_input = fused_layer_norm_cuda.backward(
+        grad_output.contiguous(), mean, invvar,
+        input_, ctx.normalized_shape,
+        ctx.eps)
+    return grad_input, None, None
+
+def fused_layer_norm_affine(input, normalized_shape, weight, bias, eps=1e-6):
+    return FusedLayerNormAffineFunction.apply(input, weight, bias, normalized_shape, eps)
+
+def fused_layer_norm(input, normalized_shape, eps=1e-6):
+    return FusedLayerNormFunction.apply(input, normalized_shape, eps)
+
+class MixedFusedLayerNorm(torch.nn.Module):
+    r"""Applies Layer Normalization over a mini-batch of inputs as described in
+    the paper `Layer Normalization`_ .
+    Currently only runs on cuda() tensors.
+    .. math::
+        y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
+    The mean and standard-deviation are calculated separately over the last
+    certain number dimensions which have to be of the shape specified by
+    :attr:`normalized_shape`.
+    :math:`\gamma` and :math:`\beta` are learnable affine transform parameters of
+    :attr:`normalized_shape` if :attr:`elementwise_affine` is ``True``.
+    .. note::
+        Unlike Batch Normalization and Instance Normalization, which applies
+        scalar scale and bias for each entire channel/plane with the
+        :attr:`affine` option, Layer Normalization applies per-element scale and
+        bias with :attr:`elementwise_affine`.
+    This layer uses statistics computed from input data in both training and
+    evaluation modes.
+    Args:
+        normalized_shape (int or list or torch.Size): input shape from an expected input
+            of size
+            .. math::
+                [* \times \text{normalized}\_\text{shape}[0] \times \text{normalized}\_\text{shape}[1]
+                    \times \ldots \times \text{normalized}\_\text{shape}[-1]]
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps: a value added to the denominator for numerical stability. Default: 1e-5
+        elementwise_affine: a boolean value that when set to ``True``, this module
+            has learnable per-element affine parameters initialized to ones (for weights)
+            and zeros (for biases). Default: ``True``.
+    Shape:
+        - Input: :math:`(N, *)`
+        - Output: :math:`(N, *)` (same shape as input)
+    Examples::
+        >>> input = torch.randn(20, 5, 10, 10)
+        >>> # With Learnable Parameters
+        >>> m = apex.normalization.FusedLayerNorm(input.size()[1:])
+        >>> # Without Learnable Parameters
+        >>> m = apex.normalization.FusedLayerNorm(input.size()[1:], elementwise_affine=False)
+        >>> # Normalize over last two dimensions
+        >>> m = apex.normalization.FusedLayerNorm([10, 10])
+        >>> # Normalize over last dimension of size 10
+        >>> m = apex.normalization.FusedLayerNorm(10)
+        >>> # Activating the module
+        >>> output = m(input)
+    .. _`Layer Normalization`: https://arxiv.org/abs/1607.06450
+    """
+    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
+        super(MixedFusedLayerNorm, self).__init__()
+
+        global fused_layer_norm_cuda
+        fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda")
+        global fused_mix_prec_layer_norm_cuda
+        fused_mix_prec_layer_norm_cuda = importlib.import_module("fused_mix_prec_layer_norm_cuda")
+
+
+        if isinstance(normalized_shape, numbers.Integral):
+            normalized_shape = (normalized_shape,)
+        self.normalized_shape = torch.Size(normalized_shape)
+        self.eps = eps
+        self.elementwise_affine = elementwise_affine
+        if self.elementwise_affine:
+            self.weight = Parameter(torch.Tensor(*normalized_shape))
+            self.bias = Parameter(torch.Tensor(*normalized_shape))
+        else:
+            self.register_parameter('weight', None)
+            self.register_parameter('bias', None)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        if self.elementwise_affine:
+            init.ones_(self.weight)
+            init.zeros_(self.bias)
+
+    def forward(self, input):
+        if not input.is_cuda:
+            return  F.layer_norm(
+                input, self.normalized_shape, self.weight, self.bias, self.eps)
+        
+        if self.elementwise_affine:
+          return FusedLayerNormAffineFunction.apply(
+              input, self.weight, self.bias, self.normalized_shape,self.eps)
+        else:
+          return FusedLayerNormFunction.apply(input, self.normalized_shape, self.eps) 
+
+    def extra_repr(self):
+        return '{normalized_shape}, eps={eps}, ' \
+            'elementwise_affine={elementwise_affine}'.format(**self.__dict__)
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -21,9 +21,9 @@ import torch.nn.functional as F

 from megatron import get_args
 from megatron import mpu
-from megatron.mpu import LayerNorm
 from megatron.module import MegatronModule
 from megatron.checkpointing import get_checkpoint_version
+from megatron.model import import_layernorm
 from megatron.model.fused_softmax import FusedScaleMaskSoftmax
 from megatron.model.fused_bias_gelu import bias_gelu_impl
 from megatron.model.utils import openai_gelu, erf_gelu
@@ -404,6 +404,7 @@ class ParallelTransformerLayer(MegatronModule):
            = args.apply_residual_connection_post_layernorm

        # Layernorm on the input data.
+        LayerNorm = import_layernorm(args.fp32_residual_connection)
        self.input_layernorm = LayerNorm(
            args.hidden_size,
            eps=args.layernorm_epsilon)
@@ -500,6 +501,8 @@ class ParallelTransformer(MegatronModule):
        super(ParallelTransformer, self).__init__()
        args = get_args()

+        self.fp32_residual_connection = args.fp32_residual_connection
+
        # Store activation checkpoiting flag.
        self.checkpoint_activations = args.checkpoint_activations
        self.checkpoint_num_layers = args.checkpoint_num_layers
@@ -520,6 +523,7 @@ class ParallelTransformer(MegatronModule):

        if mpu.is_pipeline_last_stage():
            # Final layer norm before output.
+            LayerNorm = import_layernorm(args.fp32_residual_connection)
            self.final_layernorm = LayerNorm(
                args.hidden_size,
                eps=args.layernorm_epsilon)
@@ -564,7 +568,12 @@ class ParallelTransformer(MegatronModule):

        if mpu.is_pipeline_first_stage():
            # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
-            hidden_states = hidden_states.transpose(0, 1).contiguous()
+            # If the input flag for fp32 residual connection is set, convert for float.
+            if self.fp32_residual_connection:
+                hidden_states = hidden_states.transpose(0, 1).contiguous().float()
+            # Otherwise, leave it as is.
+            else:
+                hidden_states = hidden_states.transpose(0, 1).contiguous()

        if self.checkpoint_activations:
            hidden_states = self._checkpointed_forward(hidden_states,

--- a/megatron/model/utils.py
+++ b/megatron/model/utils.py
@@ -19,8 +19,8 @@ import math

 import torch

-from .transformer import LayerNorm
-
+from megatron import get_args
+from megatron.model import import_layernorm

 def init_method_normal(sigma):
    """Init method based on N(0, sigma)."""
@@ -65,6 +65,10 @@ def get_params_for_weight_decay_optimization(module):
    """Divide params into with-weight-decay and without-weight-decay groups.
    Layernorms and baises will have no weight decay but the rest will.
    """
+
+    args = get_args()
+    LayerNorm = import_layernorm(args.fp32_residual_connection)
+    
    weight_decay_params = {'params': []}
    no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
    for module_ in module.modules():

--- a/megatron/mpu/__init__.py
+++ b/megatron/mpu/__init__.py
@@ -41,7 +41,6 @@ from .initialize import get_pipeline_model_parallel_world_size, set_pipeline_mod
 from .initialize import initialize_model_parallel
 from .initialize import model_parallel_is_initialized

-from .layers import LayerNorm
 from .layers import ColumnParallelLinear
 from .layers import RowParallelLinear
 from .layers import VocabParallelEmbedding

--- a/megatron/mpu/layers.py
+++ b/megatron/mpu/layers.py
@@ -25,16 +25,6 @@ import torch.nn.functional as F
 import torch.nn.init as init
 from torch.nn.parameter import Parameter

-try:
-    from apex.normalization.fused_layer_norm import FusedLayerNorm as LayerNorm
-    # Try to use FusedLayerNorm from Apex - this will trigger an error.
-    _ = LayerNorm(8, eps=1e-5)
-
-except Exception as e:
-    print('WARNING: APEX is not installed, using torch.nn.LayerNorm '
-          'instead of apex.normalization.FusedLayerNorm!')
-    from torch.nn import LayerNorm
-
 from .initialize import get_tensor_model_parallel_rank
 from .initialize import get_tensor_model_parallel_world_size
 from .mappings import copy_to_tensor_model_parallel_region

--- a/megatron/training.py
+++ b/megatron/training.py
@@ -333,16 +333,19 @@ def communicate(tensor_send_next, tensor_send_prev, recv_forward, recv_backward)
    tensor_recv_prev = None
    tensor_recv_next = None
    tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
+    dtype = args.params_dtype
+    if args.fp32_residual_connection:
+        dtype = torch.float
    if recv_forward:
        tensor_recv_prev = torch.empty(tensor_shape,
                                       requires_grad=True,
                                       device=torch.cuda.current_device(),
-                                       dtype=args.params_dtype)
+                                       dtype=dtype)
    if recv_backward:
        tensor_recv_next = torch.empty(tensor_shape,
                                       requires_grad=True,
                                       device=torch.cuda.current_device(),
-                                       dtype=args.params_dtype)
+                                       dtype=dtype)

    # Send tensors in both the forward and backward directions as appropriate.
    torch.distributed.ring_exchange(tensor_send_prev=tensor_send_prev,