Add Fast Multihead Attention to APEX Contrib (#697)

* Adding C++ Multihead Attention implementation to contrib. * Add reference test that at least works for forward. * Remove CublasLt support from multihead attention. * Add new Python version of self attention. * Update python model of MHA with backward pass. * Fixed Output Linear connection in MHA. * Clean up compiles and add documentation to PySelfAttention. * Add Encdec Python version of multihead attention. Cleanup files. * Tests for self and encdec multihead attention. * Add reference pytorch implementation of attention with norm and add. * Add cutlass branch definition. * Add cutlass download to compile. * Add norm/add tests. * Add biases to pytorch python versions. * Add tests and fix issues with python version of attention masking. * Create README.md * Update README.md * Update README.md * Update perf test parameters. * Update README.md * Update README.md * Update README.md * Add files via upload * Update README.md * Update README.md * Update README.md * Fix matmul1 output tensor size. Fix tests that missed issue.

Add Fast Multihead Attention to APEX Contrib (#697)
* Adding C++ Multihead Attention implementation to contrib. * Add reference test that at least works for forward. * Remove CublasLt support from multihead attention. * Add new Python version of self attention. * Update python model of MHA with backward pass. * Fixed Output Linear connection in MHA. * Clean up compiles and add documentation to PySelfAttention. * Add Encdec Python version of multihead attention. Cleanup files. * Tests for self and encdec multihead attention. * Add reference pytorch implementation of attention with norm and add. * Add cutlass branch definition. * Add cutlass download to compile. * Add norm/add tests. * Add biases to pytorch python versions. * Add tests and fix issues with python version of attention masking. * Create README.md * Update README.md * Update README.md * Update perf test parameters. * Update README.md * Update README.md * Update README.md * Add files via upload * Update README.md * Update README.md * Update README.md * Fix matmul1 output tensor size. Fix tests that missed issue.
3f94528e · Kevin Stephano · GitHub · 494f8ab3 · 3f94528e · ed2ed4d6
Unverified Commit 3f94528e authored Feb 06, 2020 by Kevin Stephano Committed by GitHub Feb 06, 2020
20 changed files
--- a/.gitmodules
+++ b/.gitmodules
+[submodule "apex/contrib/csrc/multihead_attn/cutlass"]
+	path = apex/contrib/csrc/multihead_attn/cutlass
+	url = https://github.com/NVIDIA/cutlass.git
+	branch = v1.2.0
--- a/cutlass @ ed2ed4d6
+++ b/cutlass @ ed2ed4d6
+Subproject commit ed2ed4d667ce95e1371bd62db32b6a114e774336
--- a/apex/contrib/csrc/multihead_attn/dropout.h
+++ b/apex/contrib/csrc/multihead_attn/dropout.h
+#include <ATen/ATen.h>
+#include <ATen/CUDAGenerator.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <curand_kernel.h>
+
+#include <THC/THCGeneral.h>
+
+// Number of elements each thread processes per loop iteration in the kernels
+// below. The dropout kernels draw one float4 of random numbers per iteration
+// and index its lanes with the same counter, so this value must stay at 4.
+const int UNROLL = 4;
+
+// Dropout forward pass over a flat buffer of `totalElements` values.
+//
+// An element is kept when its uniform random draw is below `p` (so `p` acts as
+// the keep threshold) and is then scaled by 1/p; otherwise the output is zero.
+// The keep/drop decision is also stored in `mask` as 1/0.
+//
+//   inputs, outputs, mask : buffers indexed in [0, totalElements)
+//   p                     : keep threshold; must be non-zero since 1/p is
+//                           computed unconditionally
+//   seeds                 : (seed, offset) pair handed to curand_init
+//
+// Launch layout: 1-D grid of 1-D blocks. The loop bound is rounded up to a
+// whole number of grid sweeps, so every thread runs the same number of
+// iterations and the in-loop __syncthreads() is reached uniformly.
+template <typename scalar_t, typename accscalar_t, typename IndexType>
+__global__ void apex_fused_dropout_kernel(scalar_t const                *inputs,
+                                          scalar_t                      *outputs,
+                                          uint8_t                       *mask,
+                                          IndexType                      totalElements,
+                                          accscalar_t                    p,
+                                          std::pair<uint64_t, uint64_t>  seeds)
+{
+  const accscalar_t inv_keep = accscalar_t(1) / p;
+  const IndexType   tid      = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // One Philox stream per thread: the global thread id selects the subsequence.
+  curandStatePhilox4_32_10_t rng;
+  curand_init(seeds.first, tid, seeds.second, &rng);
+
+  const IndexType padded_total =
+      ((totalElements - 1) / (blockDim.x * gridDim.x * UNROLL) + 1) * blockDim.x * gridDim.x * UNROLL;
+
+  IndexType base = tid;
+  while (base < padded_total) {
+    // One float4 draw supplies the UNROLL (== 4) decisions of this iteration.
+    const float4 draw = curand_uniform4(&rng);
+    float keep[UNROLL];
+    keep[0] = draw.x < p;
+    keep[1] = draw.y < p;
+    keep[2] = draw.z < p;
+    keep[3] = draw.w < p;
+
+    // Issue all loads first, then all stores.
+    scalar_t vals[UNROLL];
+    for (int k = 0; k < UNROLL; ++k) {
+      const IndexType pos = base + blockDim.x * gridDim.x * k;
+      if (pos < totalElements) {
+        vals[k] = inputs[pos];
+      }
+    }
+    for (int k = 0; k < UNROLL; ++k) {
+      const IndexType pos = base + blockDim.x * gridDim.x * k;
+      if (pos < totalElements) {
+        outputs[pos] = vals[k] * static_cast<scalar_t>(keep[k] * inv_keep);
+        mask[pos]    = (uint8_t)keep[k];
+      }
+    }
+    __syncthreads();
+
+    base += gridDim.x * blockDim.x * UNROLL;
+  }
+}
+
+// Fused dropout + residual add over a flat buffer of `totalElements` values:
+//   outputs[i] = add_inputs[i] + keep_i * inputs[i] * (1/p)
+// where keep_i is 1 when the element's uniform random draw is below `p` and 0
+// otherwise. keep_i is also stored in `mask`. The arithmetic is carried out in
+// accscalar_t and converted to scalar_t only for the final store.
+//
+//   p     : keep threshold; must be non-zero since 1/p is computed
+//           unconditionally
+//   seeds : (seed, offset) pair handed to curand_init
+//
+// Launch layout: 1-D grid of 1-D blocks. The loop bound is rounded up to a
+// whole number of grid sweeps, so every thread runs the same number of
+// iterations and the in-loop __syncthreads() is reached uniformly.
+template <typename scalar_t, typename accscalar_t, typename IndexType>
+__global__ void apex_dropout_add_kernel(scalar_t const                *inputs,
+                                        scalar_t const                *add_inputs,
+                                        scalar_t                      *outputs,
+                                        uint8_t                       *mask,
+                                        IndexType                      totalElements,
+                                        accscalar_t                    p,
+                                        std::pair<uint64_t, uint64_t>  seeds)
+{
+  const accscalar_t inv_keep = accscalar_t(1) / p;
+  const IndexType   tid      = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // One Philox stream per thread: the global thread id selects the subsequence.
+  curandStatePhilox4_32_10_t rng;
+  curand_init(seeds.first, tid, seeds.second, &rng);
+
+  const IndexType padded_total =
+      ((totalElements - 1) / (blockDim.x * gridDim.x * UNROLL) + 1) * blockDim.x * gridDim.x * UNROLL;
+
+  IndexType base = tid;
+  while (base < padded_total) {
+    // One float4 draw supplies the UNROLL (== 4) decisions of this iteration.
+    const float4 draw = curand_uniform4(&rng);
+    float keep[UNROLL];
+    keep[0] = draw.x < p;
+    keep[1] = draw.y < p;
+    keep[2] = draw.z < p;
+    keep[3] = draw.w < p;
+
+    // Issue all loads first, then all stores.
+    scalar_t vals[UNROLL];
+    scalar_t residual[UNROLL];
+    for (int k = 0; k < UNROLL; ++k) {
+      const IndexType pos = base + blockDim.x * gridDim.x * k;
+      if (pos < totalElements) {
+        vals[k]     = inputs[pos];
+        residual[k] = add_inputs[pos];
+      }
+    }
+    for (int k = 0; k < UNROLL; ++k) {
+      const IndexType pos = base + blockDim.x * gridDim.x * k;
+      if (pos < totalElements) {
+        const accscalar_t kept   = static_cast<accscalar_t>(keep[k]) * static_cast<accscalar_t>(vals[k]);
+        const accscalar_t scaled = kept * inv_keep;
+        outputs[pos] = static_cast<scalar_t>(static_cast<accscalar_t>(residual[k]) + scaled);
+        mask[pos]    = (uint8_t)keep[k];
+      }
+    }
+    __syncthreads();
+
+    base += gridDim.x * blockDim.x * UNROLL;
+  }
+}
+
+// Element-wise sum of two flat buffers:
+//   outputs[i] = inputs[i] + add_inputs[i]   for i in [0, totalElements)
+// The addition is performed directly in scalar_t; accscalar_t is part of the
+// template signature but is not used by the body.
+//
+// Launch layout: 1-D grid of 1-D blocks. The loop bound is rounded up to a
+// whole number of grid sweeps, so every thread runs the same number of
+// iterations and the in-loop __syncthreads() is reached uniformly.
+template <typename scalar_t, typename accscalar_t, typename IndexType>
+__global__ void apex_add_kernel(scalar_t const *inputs,
+                                scalar_t const *add_inputs,
+                                scalar_t       *outputs,
+                                IndexType       totalElements)
+{
+  const IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const IndexType padded_total =
+      ((totalElements - 1) / (blockDim.x * gridDim.x * UNROLL) + 1) * blockDim.x * gridDim.x * UNROLL;
+
+  IndexType base = tid;
+  while (base < padded_total) {
+    // Issue all loads first, then all stores.
+    scalar_t lhs[UNROLL];
+    scalar_t rhs[UNROLL];
+    for (int k = 0; k < UNROLL; ++k) {
+      const IndexType pos = base + blockDim.x * gridDim.x * k;
+      if (pos < totalElements) {
+        lhs[k] = inputs[pos];
+        rhs[k] = add_inputs[pos];
+      }
+    }
+    for (int k = 0; k < UNROLL; ++k) {
+      const IndexType pos = base + blockDim.x * gridDim.x * k;
+      if (pos < totalElements) {
+        outputs[pos] = lhs[k] + rhs[k];
+      }
+    }
+    __syncthreads();
+
+    base += gridDim.x * blockDim.x * UNROLL;
+  }
+}
+
+// Applies a stored byte mask and a scale factor to a flat buffer:
+//   outputs[i] = (inputs[i] * scale) * mask[i]   for i in [0, totalElements)
+// `scale` is converted to scalar_t before use, so the whole computation runs
+// in scalar_t precision.
+//
+// Launch layout: 1-D grid of 1-D blocks; each thread handles UNROLL elements
+// per iteration, spaced one grid width apart.
+template <typename scalar_t, typename accscalar_t, typename IndexType>
+__global__ void apex_masked_scale_kernel(scalar_t const *inputs,
+                                         scalar_t       *outputs,
+                                         uint8_t const  *mask,
+                                         IndexType       totalElements,
+                                         accscalar_t     scale)
+{
+  const IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const IndexType padded_total =
+      ((totalElements - 1) / (blockDim.x * gridDim.x * UNROLL) + 1) * blockDim.x * gridDim.x * UNROLL;
+  const scalar_t factor = static_cast<scalar_t>(scale);
+
+  IndexType base = tid;
+  while (base < padded_total) {
+    // Issue all loads first, then all stores.
+    scalar_t vals[UNROLL];
+    scalar_t keep[UNROLL];
+    for (int k = 0; k < UNROLL; ++k) {
+      const IndexType pos = base + blockDim.x * gridDim.x * k;
+      if (pos < totalElements) {
+        vals[k] = inputs[pos];
+        keep[k] = static_cast<scalar_t>(mask[pos]);
+      }
+    }
+    for (int k = 0; k < UNROLL; ++k) {
+      const IndexType pos = base + blockDim.x * gridDim.x * k;
+      if (pos < totalElements) {
+        outputs[pos] = static_cast<scalar_t>(vals[k] * factor) * keep[k];
+      }
+    }
+
+    base += gridDim.x * blockDim.x * UNROLL;
+  }
+}
+
+// Host launcher for apex_fused_dropout_kernel on the current CUDA stream.
+//
+//   inputs, outputs, mask : device buffers with `totalElements` entries
+//   totalElements         : number of elements to process; 0 is a no-op
+//   p                     : keep threshold forwarded to the kernel
+//
+// Uses 256-thread blocks and caps the grid at
+// multiProcessorCount * (maxThreadsPerMultiProcessor / 256) blocks; the kernel
+// loops to cover the remaining elements. Philox counter space is reserved from
+// the default CUDA generator under its mutex. Launch errors are reported via
+// THCudaCheck.
+template <typename scalar_t, typename accscalar_t, typename IndexType>
+void apex_fused_dropout_cuda(scalar_t const *inputs,
+                             scalar_t       *outputs,
+                             uint8_t        *mask,
+                             IndexType       totalElements,
+                             accscalar_t     p)
+{
+  // An empty tensor would yield grid.x == 0, which makes the counter_offset
+  // computation below divide by zero and is not a valid launch configuration.
+  if (totalElements == 0) {
+    return;
+  }
+
+  auto gen = at::cuda::detail::getDefaultCUDAGenerator();
+
+  int block_size = 256;
+  dim3 dim_block(block_size);
+  dim3 grid((totalElements + block_size - 1) / block_size);
+  unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size;
+  grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * blocks_per_sm, grid.x);
+
+  // Number of random values generated per thread, used to advance the Philox
+  // counter held in the generator state.
+  int64_t counter_offset = ((totalElements - 1) / (block_size * grid.x * UNROLL) + 1) * UNROLL;
+  std::pair<uint64_t, uint64_t> rng_engine_inputs;
+  {
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    rng_engine_inputs = gen->philox_engine_inputs(counter_offset);
+  }
+
+  apex_fused_dropout_kernel<scalar_t, accscalar_t, IndexType><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(inputs, outputs, mask, totalElements, p, rng_engine_inputs);
+  THCudaCheck(cudaGetLastError());
+}
+
+// Host launcher for apex_dropout_add_kernel on the current CUDA stream.
+//
+//   inputs, add_inputs, outputs, mask : device buffers with `totalElements`
+//                                       entries
+//   totalElements                     : number of elements to process; 0 is a
+//                                       no-op
+//   p                                 : keep threshold forwarded to the kernel
+//
+// Uses 256-thread blocks and caps the grid at
+// multiProcessorCount * (maxThreadsPerMultiProcessor / 256) blocks; the kernel
+// loops to cover the remaining elements. Philox counter space is reserved from
+// the default CUDA generator under its mutex. Launch errors are reported via
+// THCudaCheck.
+template <typename scalar_t, typename accscalar_t, typename IndexType>
+void apex_dropout_add_cuda(scalar_t const *inputs,
+                           scalar_t const *add_inputs,
+                           scalar_t       *outputs,
+                           uint8_t        *mask,
+                           IndexType       totalElements,
+                           accscalar_t     p)
+{
+  // An empty tensor would yield grid.x == 0, which makes the counter_offset
+  // computation below divide by zero and is not a valid launch configuration.
+  if (totalElements == 0) {
+    return;
+  }
+
+  auto gen = at::cuda::detail::getDefaultCUDAGenerator();
+
+  int block_size = 256;
+  dim3 dim_block(block_size);
+  dim3 grid((totalElements + block_size - 1) / block_size);
+  unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size;
+  grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * blocks_per_sm, grid.x);
+
+  // Number of random values generated per thread, used to advance the Philox
+  // counter held in the generator state.
+  int64_t counter_offset = ((totalElements - 1) / (block_size * grid.x * UNROLL) + 1) * UNROLL;
+  std::pair<uint64_t, uint64_t> rng_engine_inputs;
+  {
+    // See Note [Acquire lock when using random generators]
+    std::lock_guard<std::mutex> lock(gen->mutex_);
+    rng_engine_inputs = gen->philox_engine_inputs(counter_offset);
+  }
+
+  apex_dropout_add_kernel<scalar_t, accscalar_t, IndexType><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(inputs, add_inputs, outputs, mask, totalElements, p, rng_engine_inputs);
+  THCudaCheck(cudaGetLastError());
+}
+
+// Host launcher for apex_add_kernel on the current CUDA stream.
+//
+//   inputs, add_inputs, outputs : device buffers with `totalElements` entries
+//   totalElements               : number of elements to process; 0 is a no-op
+//
+// Uses 256-thread blocks and caps the grid at
+// multiProcessorCount * (maxThreadsPerMultiProcessor / 256) blocks; the kernel
+// loops to cover the remaining elements. Launch errors are reported via
+// THCudaCheck.
+template <typename scalar_t, typename accscalar_t, typename IndexType>
+void apex_add_cuda(scalar_t const *inputs,
+                   scalar_t const *add_inputs,
+                   scalar_t       *outputs,
+                   IndexType       totalElements)
+{
+  // An empty tensor would yield a zero-block grid, which is not a valid launch
+  // configuration; there is nothing to compute, so return early.
+  if (totalElements == 0) {
+    return;
+  }
+
+  int block_size = 256;
+  dim3 dim_block(block_size);
+  dim3 grid((totalElements + block_size - 1) / block_size);
+  unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size;
+  grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * blocks_per_sm, grid.x);
+
+  apex_add_kernel<scalar_t, accscalar_t, IndexType><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(inputs, add_inputs, outputs, totalElements);
+  THCudaCheck(cudaGetLastError());
+}
+
+// Host launcher for apex_masked_scale_kernel on the current CUDA stream.
+//
+//   inputs, outputs, mask : device buffers with `totalElements` entries
+//   totalElements         : number of elements to process; 0 is a no-op
+//   scale                 : factor applied to every element before masking
+//
+// Uses 256-thread blocks and caps the grid at
+// multiProcessorCount * (maxThreadsPerMultiProcessor / 256) blocks; the kernel
+// loops to cover the remaining elements. Launch errors are reported via
+// THCudaCheck.
+template <typename scalar_t, typename accscalar_t, typename IndexType>
+void apex_masked_scale_cuda(scalar_t const *inputs,
+                            scalar_t       *outputs,
+                            uint8_t const  *mask,
+                            IndexType       totalElements,
+                            accscalar_t     scale)
+{
+  // An empty tensor would yield a zero-block grid, which is not a valid launch
+  // configuration; there is nothing to compute, so return early.
+  if (totalElements == 0) {
+    return;
+  }
+
+  int block_size = 256;
+  dim3 dim_block(block_size);
+  dim3 grid((totalElements + block_size - 1) / block_size);
+  unsigned int blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size;
+  grid.x = std::min((unsigned int)at::cuda::getCurrentDeviceProperties()->multiProcessorCount * blocks_per_sm, grid.x);
+
+  apex_masked_scale_kernel<scalar_t, accscalar_t, IndexType><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(inputs, outputs, mask, totalElements, scale);
+  THCudaCheck(cudaGetLastError());
+}
+
+
--- a/apex/contrib/csrc/multihead_attn/encdec_multihead_attn.cpp
+++ b/apex/contrib/csrc/multihead_attn/encdec_multihead_attn.cpp
+#include <torch/extension.h>
+#include <vector>
+
+namespace multihead_attn {
+namespace encdec {
+namespace cublas_gemmex {
+
+// Forward pass implementation; only declared here, the definition is not part
+// of this file. `pad_mask` is a raw byte pointer rather than a tensor: the
+// fwd() wrapper below passes nullptr when no mask is in use.
+std::vector<torch::Tensor> fwd_cuda(
+                               bool                 use_time_mask,  
+                               bool                 is_training,
+                               int                  heads,
+                               torch::Tensor const& inputs_q, 
+                               torch::Tensor const& inputs_kv, 
+                               torch::Tensor const& input_weights_q,
+                               torch::Tensor const& input_weights_kv,
+                               torch::Tensor const& output_weights,
+                               const uint8_t*       pad_mask,
+                               float                dropout_prob
+                                                  );
+// Backward pass implementation; only declared here, the definition is not part
+// of this file. Takes the upstream gradient together with intermediate results,
+// inputs, weights and the dropout mask, and returns a vector of tensors
+// (presumably the gradients -- confirm against the definition).
+std::vector<torch::Tensor> bwd_cuda(
+                               int                  heads,
+                               torch::Tensor const& output_grads, 
+                               torch::Tensor const& matmul2_results,
+                               torch::Tensor const& dropout_results,
+                               torch::Tensor const& softmax_results,
+                               torch::Tensor const& input_lin_q_results,
+                               torch::Tensor const& input_lin_kv_results,
+                               torch::Tensor const& inputs_q, 
+                               torch::Tensor const& inputs_kv, 
+                               torch::Tensor const& input_weights_q,
+                               torch::Tensor const& input_weights_kv,
+                               torch::Tensor const& output_weights,
+                               torch::Tensor const& dropout_mask,
+                               float                dropout_prob
+                                                  );
+
+// C++ interface
+
+#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+// Python-facing forward entry point for encoder-decoder multihead attention.
+//
+// Validates tensor ranks and dtypes, then forwards everything to fwd_cuda and
+// returns its result unchanged.
+//
+//   use_mask      : when true, pad_mask is validated and its data pointer is
+//                   passed on; when false, pad_mask is ignored and nullptr is
+//                   passed instead
+//   use_time_mask, is_training, heads, dropout_prob : forwarded as-is
+//
+// Fails via AT_ASSERTM if a rank or dtype check does not hold.
+//
+// NOTE(review): device placement and contiguity are not checked here (the
+// CHECK_* macros defined above are not used in this function), yet a raw
+// pointer into pad_mask is handed to fwd_cuda -- confirm that callers always
+// pass contiguous CUDA tensors.
+std::vector<torch::Tensor> fwd(
+                               bool                 use_mask,
+                               bool                 use_time_mask,
+                               bool                 is_training,
+                               int                  heads,
+                               torch::Tensor const& inputs_q, 
+                               torch::Tensor const& inputs_kv, 
+                               torch::Tensor const& input_weights_q,
+                               torch::Tensor const& input_weights_kv,
+                               torch::Tensor const& output_weights,
+                               torch::Tensor const& pad_mask,
+                               float                dropout_prob
+                                                 )
+{
+  // Rank checks: activations are 3D, weight matrices are 2D.
+  AT_ASSERTM(inputs_q.dim()         == 3, "expected 3D tensor");
+  AT_ASSERTM(inputs_kv.dim()        == 3, "expected 3D tensor");
+  AT_ASSERTM(input_weights_q.dim()  == 2, "expected 2D tensor");
+  AT_ASSERTM(input_weights_kv.dim() == 2, "expected 2D tensor");
+  AT_ASSERTM(output_weights.dim()   == 2, "expected 2D tensor");
+
+  // Dtype checks: only half precision is accepted for activations and weights.
+  AT_ASSERTM(inputs_q.type().scalarType()         == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(inputs_kv.type().scalarType()        == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_weights_q.type().scalarType()  == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_weights_kv.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(output_weights.type().scalarType()   == at::ScalarType::Half, "Only HALF is supported");
+  
+  // The mask is only inspected when the caller says it is in use.
+  if (use_mask) {
+  	AT_ASSERTM(pad_mask.dim()                     == 2,                    "expected 2D tensor");
+  	AT_ASSERTM(pad_mask.type().scalarType()       == at::ScalarType::Byte, "Only BYTE is supported");
+  }
+  
+  return fwd_cuda(
+                                 use_time_mask,
+                                 is_training,
+                                 heads, 
+                                 inputs_q, 
+                                 inputs_kv, 
+                                 input_weights_q, 
+                                 input_weights_kv, 
+                                 output_weights, 
+                                 use_mask ? static_cast<const uint8_t*>(pad_mask.data_ptr()) : nullptr, 
+                                 dropout_prob
+                                );
+}
+
+// Python-facing backward entry point for encoder-decoder multihead attention.
+//
+// Validates tensor ranks and dtypes, then forwards all arguments to bwd_cuda in
+// the same order and returns its result unchanged.
+//
+// Fails via AT_ASSERTM if a rank or dtype check does not hold.
+//
+// NOTE(review): device placement and contiguity are not checked here (the
+// CHECK_* macros defined above are not used in this function) -- confirm that
+// callers always pass contiguous CUDA tensors.
+std::vector<torch::Tensor> bwd(
+                               int                  heads,
+                               torch::Tensor const& output_grads, 
+                               torch::Tensor const& matmul2_results,
+                               torch::Tensor const& dropout_results,
+                               torch::Tensor const& softmax_results,
+                               torch::Tensor const& input_lin_q_results,
+                               torch::Tensor const& input_lin_kv_results,
+                               torch::Tensor const& inputs_q, 
+                               torch::Tensor const& inputs_kv, 
+                               torch::Tensor const& input_weights_q,
+                               torch::Tensor const& input_weights_kv,
+                               torch::Tensor const& output_weights,
+                               torch::Tensor const& dropout_mask,
+                               float                dropout_prob
+                                                  )
+{
+  // Rank checks: weight matrices are 2D, every other tensor is 3D.
+  AT_ASSERTM(output_grads.dim()         == 3, "expected 3D tensor");
+  AT_ASSERTM(matmul2_results.dim()      == 3, "expected 3D tensor");
+  AT_ASSERTM(dropout_results.dim()      == 3, "expected 3D tensor");
+  AT_ASSERTM(softmax_results.dim()      == 3, "expected 3D tensor");
+  AT_ASSERTM(input_lin_q_results.dim()  == 3, "expected 3D tensor");
+  AT_ASSERTM(input_lin_kv_results.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(inputs_q.dim()             == 3, "expected 3D tensor");
+  AT_ASSERTM(inputs_kv.dim()            == 3, "expected 3D tensor");
+  AT_ASSERTM(input_weights_q.dim()      == 2, "expected 2D tensor");
+  AT_ASSERTM(input_weights_kv.dim()     == 2, "expected 2D tensor");
+  AT_ASSERTM(output_weights.dim()       == 2, "expected 2D tensor");
+  AT_ASSERTM(dropout_mask.dim()         == 3, "expected 3D tensor");
+  
+  // Dtype checks: everything is half precision except the byte dropout mask.
+  AT_ASSERTM(output_grads.type().scalarType()         == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(matmul2_results.type().scalarType()      == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(dropout_results.type().scalarType()      == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(softmax_results.type().scalarType()      == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_lin_q_results.type().scalarType()  == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_lin_kv_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(inputs_q.type().scalarType()             == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(inputs_kv.type().scalarType()            == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_weights_q.type().scalarType()      == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_weights_kv.type().scalarType()     == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(output_weights.type().scalarType()       == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(dropout_mask.type().scalarType()         == at::ScalarType::Byte, "Only BYTE is supported");
+  
+  return bwd_cuda(
+                                 heads, 
+                                 output_grads,
+                                 matmul2_results,
+                                 dropout_results,
+                                 softmax_results, 
+                                 input_lin_q_results, 
+                                 input_lin_kv_results, 
+                                 inputs_q, 
+                                 inputs_kv, 
+                                 input_weights_q,
+                                 input_weights_kv,
+                                 output_weights,
+                                 dropout_mask, 
+                                 dropout_prob
+                                );
+}
+
+} // end namespace cublas_gemmex
+} // end namespace encdec 
+} // end namespace multihead_attn
+
+// Python bindings: exposes the validating wrappers as `forward` and `backward`
+// on the extension module.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  namespace impl = multihead_attn::encdec::cublas_gemmex;
+
+  m.def("forward",  &impl::fwd, "Encdec Multihead Attention Forward.");
+  m.def("backward", &impl::bwd, "Encdec Multihead Attention Backward.");
+}
--- a/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu
+++ b/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu
--- a/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add.cpp
+++ b/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add.cpp
+#include <torch/extension.h>
+#include <vector>
+
+namespace multihead_attn {
+namespace encdec_norm_add {
+namespace cublas_gemmex {
+
+// Forward pass implementation for the layer-norm + residual-add variant; only
+// declared here, the definition is not visible in this part of the file.
+// `pad_mask` is a raw byte pointer rather than a tensor: the fwd() wrapper
+// below passes nullptr when no mask is in use.
+std::vector<torch::Tensor> fwd_cuda(
+                               bool                 use_time_mask,  
+                               bool                 is_training,
+                               int                  heads,
+                               torch::Tensor const& inputs_q, 
+                               torch::Tensor const& inputs_kv, 
+                               torch::Tensor const& lyr_nrm_gamma_weights,
+                               torch::Tensor const& lyr_nrm_beta_weights,
+                               torch::Tensor const& input_weights_q,
+                               torch::Tensor const& input_weights_kv,
+                               torch::Tensor const& output_weights,
+                               const uint8_t*       pad_mask,
+                               float                dropout_prob
+                                                  );
+
+// Backward pass implementation for the layer-norm + residual-add variant; only
+// declared here, the definition is not visible in this part of the file.
+// Compared with the plain encdec variant it additionally receives the layer
+// norm results/statistics, the layer norm weights and a second dropout mask
+// (`dropout_add_mask`). Returns a vector of tensors (presumably the gradients
+// -- confirm against the definition).
+std::vector<torch::Tensor> bwd_cuda(
+                               int                  heads,
+                               torch::Tensor const& output_grads, 
+                               torch::Tensor const& matmul2_results,
+                               torch::Tensor const& dropout_results,
+                               torch::Tensor const& softmax_results,
+                               torch::Tensor const& input_lin_q_results,
+                               torch::Tensor const& input_lin_kv_results,
+                               torch::Tensor const& lyr_nrm_results,
+                               torch::Tensor const& lyr_nrm_mean,
+                               torch::Tensor const& lyr_nrm_invvar,
+                               torch::Tensor const& inputs_q, 
+                               torch::Tensor const& inputs_kv, 
+                               torch::Tensor const& lyr_nrm_gamma_weights,
+                               torch::Tensor const& lyr_nrm_beta_weights,
+                               torch::Tensor const& input_weights_q,
+                               torch::Tensor const& input_weights_kv,
+                               torch::Tensor const& output_weights,
+                               torch::Tensor const& dropout_mask,
+                               torch::Tensor const& dropout_add_mask,
+                               float                dropout_prob
+                                                  );
+
+// C++ interface
+
+#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+// Python-facing forward entry point for the layer-norm + residual-add variant
+// of encoder-decoder multihead attention.
+//
+// Validates tensor ranks and dtypes, then forwards everything to fwd_cuda and
+// returns its result unchanged.
+//
+//   use_mask      : when true, pad_mask is validated and its data pointer is
+//                   passed on; when false, pad_mask is ignored and nullptr is
+//                   passed instead
+//   use_time_mask, is_training, heads, dropout_prob : forwarded as-is
+//
+// Fails via AT_ASSERTM if a rank or dtype check does not hold.
+//
+// NOTE(review): device placement and contiguity are not checked here (the
+// CHECK_* macros defined above are not used in this function), yet a raw
+// pointer into pad_mask is handed to fwd_cuda -- confirm that callers always
+// pass contiguous CUDA tensors.
+std::vector<torch::Tensor> fwd(
+                               bool                 use_mask,
+                               bool                 use_time_mask,
+                               bool                 is_training,
+                               int                  heads,
+                               torch::Tensor const& inputs_q, 
+                               torch::Tensor const& inputs_kv, 
+                               torch::Tensor const& lyr_nrm_gamma_weights,
+                               torch::Tensor const& lyr_nrm_beta_weights,
+                               torch::Tensor const& input_weights_q,
+                               torch::Tensor const& input_weights_kv,
+                               torch::Tensor const& output_weights,
+                               torch::Tensor const& pad_mask,
+                               float                dropout_prob
+                                                 )
+{
+  // Rank checks: activations are 3D, layer norm weights 1D, weight matrices 2D.
+  AT_ASSERTM(inputs_q.dim()               == 3, "expected 3D tensor");
+  AT_ASSERTM(inputs_kv.dim()              == 3, "expected 3D tensor");
+  AT_ASSERTM(lyr_nrm_gamma_weights.dim()  == 1, "expected 1D tensor");
+  AT_ASSERTM(lyr_nrm_beta_weights.dim()   == 1, "expected 1D tensor");
+  AT_ASSERTM(input_weights_q.dim()        == 2, "expected 2D tensor");
+  AT_ASSERTM(input_weights_kv.dim()       == 2, "expected 2D tensor");
+  AT_ASSERTM(output_weights.dim()         == 2, "expected 2D tensor");
+
+  // Dtype checks: only half precision is accepted for activations and weights.
+  AT_ASSERTM(inputs_q.type().scalarType()              == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(inputs_kv.type().scalarType()             == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(lyr_nrm_gamma_weights.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(lyr_nrm_beta_weights.type().scalarType()  == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_weights_q.type().scalarType()       == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_weights_kv.type().scalarType()      == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(output_weights.type().scalarType()        == at::ScalarType::Half, "Only HALF is supported");
+  
+  // The mask is only inspected when the caller says it is in use.
+  if (use_mask) {
+    AT_ASSERTM(pad_mask.dim()                     == 2,                    "expected 2D tensor");
+    AT_ASSERTM(pad_mask.type().scalarType()       == at::ScalarType::Byte, "Only BYTE is supported");
+  }
+  
+  return fwd_cuda(
+                                 use_time_mask,
+                                 is_training,
+                                 heads, 
+                                 inputs_q, 
+                                 inputs_kv,
+								 lyr_nrm_gamma_weights,
+								 lyr_nrm_beta_weights,
+                                 input_weights_q, 
+                                 input_weights_kv, 
+                                 output_weights, 
+                                 use_mask ? static_cast<const uint8_t*>(pad_mask.data_ptr()) : nullptr, 
+                                 dropout_prob
+                                );
+}
+
+std::vector<torch::Tensor> bwd(
+                               int                  heads,
+                               torch::Tensor const& output_grads, 
+                               torch::Tensor const& matmul2_results,
+                               torch::Tensor const& dropout_results,
+                               torch::Tensor const& softmax_results,
+                               torch::Tensor const& input_lin_q_results,
+                               torch::Tensor const& input_lin_kv_results,
+                               torch::Tensor const& lyr_nrm_results,
+                               torch::Tensor const& lyr_nrm_mean,
+                               torch::Tensor const& lyr_nrm_invvar,
+                               torch::Tensor const& inputs_q, 
+                               torch::Tensor const& inputs_kv, 
+							   torch::Tensor const& lyr_nrm_gamma_weights,
+							   torch::Tensor const& lyr_nrm_beta_weights,
+                               torch::Tensor const& input_weights_q,
+                               torch::Tensor const& input_weights_kv,
+                               torch::Tensor const& output_weights,
+                               torch::Tensor const& dropout_mask,
+                               torch::Tensor const& dropout_add_mask,
+                               float                dropout_prob
+                                                  )
+{
+  AT_ASSERTM(output_grads.dim()          == 3, "expected 3D tensor");
+  AT_ASSERTM(matmul2_results.dim()       == 3, "expected 3D tensor");
+  AT_ASSERTM(dropout_results.dim()       == 3, "expected 3D tensor");
+  AT_ASSERTM(softmax_results.dim()       == 3, "expected 3D tensor");
+  AT_ASSERTM(input_lin_q_results.dim()   == 3, "expected 3D tensor");
+  AT_ASSERTM(input_lin_kv_results.dim()  == 3, "expected 3D tensor");
+  AT_ASSERTM(lyr_nrm_results.dim()       == 3, "expected 3D tensor");
+  AT_ASSERTM(lyr_nrm_mean.dim()          == 1, "expected 1D tensor");
+  AT_ASSERTM(lyr_nrm_invvar.dim()        == 1, "expected 1D tensor");
+  AT_ASSERTM(inputs_q.dim()              == 3, "expected 3D tensor");
+  AT_ASSERTM(inputs_kv.dim()             == 3, "expected 3D tensor");
+  AT_ASSERTM(lyr_nrm_gamma_weights.dim() == 1, "expected 1D tensor");
+  AT_ASSERTM(lyr_nrm_beta_weights.dim()  == 1, "expected 1D tensor");
+  AT_ASSERTM(input_weights_q.dim()       == 2, "expected 2D tensor");
+  AT_ASSERTM(input_weights_kv.dim()      == 2, "expected 2D tensor");
+  AT_ASSERTM(output_weights.dim()        == 2, "expected 2D tensor");
+  AT_ASSERTM(dropout_mask.dim()          == 3, "expected 3D tensor");
+  AT_ASSERTM(dropout_add_mask.dim()      == 3, "expected 3D tensor");
+  
+  AT_ASSERTM(output_grads.type().scalarType()          == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(matmul2_results.type().scalarType()       == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(dropout_results.type().scalarType()       == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(softmax_results.type().scalarType()       == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(input_lin_q_results.type().scalarType()   == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(input_lin_kv_results.type().scalarType()  == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(lyr_nrm_results.type().scalarType()       == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(lyr_nrm_mean.type().scalarType()          == at::ScalarType::Float, "Only FLOAT is supported");
+  AT_ASSERTM(lyr_nrm_invvar.type().scalarType()        == at::ScalarType::Float, "Only FLOAT is supported");
+  AT_ASSERTM(inputs_q.type().scalarType()              == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(inputs_kv.type().scalarType()             == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(lyr_nrm_gamma_weights.type().scalarType() == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(lyr_nrm_beta_weights.type().scalarType()  == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(input_weights_q.type().scalarType()       == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(input_weights_kv.type().scalarType()      == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(output_weights.type().scalarType()        == at::ScalarType::Half,  "Only HALF is supported");
+  AT_ASSERTM(dropout_mask.type().scalarType()          == at::ScalarType::Byte,  "Only BYTE is supported");
+  AT_ASSERTM(dropout_add_mask.type().scalarType()      == at::ScalarType::Byte,  "Only BYTE is supported");
+  
+  return bwd_cuda(
+                                 heads, 
+                                 output_grads,
+                                 matmul2_results,
+                                 dropout_results,
+                                 softmax_results, 
+                                 input_lin_q_results, 
+                                 input_lin_kv_results, 
+                                 lyr_nrm_results,
+                                 lyr_nrm_mean,
+                                 lyr_nrm_invvar,
+                                 inputs_q, 
+                                 inputs_kv, 
+								 lyr_nrm_gamma_weights,
+								 lyr_nrm_beta_weights,
+                                 input_weights_q,
+                                 input_weights_kv,
+                                 output_weights,
+                                 dropout_mask,
+                                 dropout_add_mask,
+                                 dropout_prob
+                                );
+}
+
+} // end namespace cublas_gemmex
+} // end namespace encdec_norm_add 
+} // end namespace multihead_attn
+
+// Python bindings for this extension: registers the fwd/bwd wrappers from
+// multihead_attn::encdec_norm_add::cublas_gemmex as "forward" and "backward".
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  namespace impl = multihead_attn::encdec_norm_add::cublas_gemmex;
+
+  m.def("forward",
+        &impl::fwd,
+        "Encdec Multihead Attention Plus Layer Norm and Residual Add Forward.");
+  m.def("backward",
+        &impl::bwd,
+        "Encdec Multihead Attention Plus Layer Norm and Residual Add Backward.");
+}
+
--- a/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu
+++ b/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu
--- a/apex/contrib/csrc/multihead_attn/layer_norm.h
+++ b/apex/contrib/csrc/multihead_attn/layer_norm.h
--- a/apex/contrib/csrc/multihead_attn/self_multihead_attn.cpp
+++ b/apex/contrib/csrc/multihead_attn/self_multihead_attn.cpp
+#include <torch/extension.h>
+#include <vector>
+
+namespace multihead_attn {
+namespace self {
+namespace cublas_gemmex {
+
+// Forward declaration of the forward-pass implementation. It is not defined in
+// this file (presumably it lives in the companion CUDA translation unit --
+// confirm against the build).
+//
+// pad_mask is a raw byte pointer rather than a tensor; the fwd() wrapper in
+// this file passes nullptr when masking is disabled.
+std::vector<torch::Tensor> fwd_cuda(
+    bool use_time_mask,
+    bool is_training,
+    int heads,
+    torch::Tensor const& inputs,
+    torch::Tensor const& input_weights,
+    torch::Tensor const& output_weights,
+    const uint8_t* pad_mask,
+    float dropout_prob);
+
+// Forward declaration of the backward-pass implementation. It is not defined
+// in this file (presumably it lives in the companion CUDA translation unit --
+// confirm against the build). The bwd() wrapper in this file validates the
+// arguments and then calls this function with them unchanged.
+std::vector<torch::Tensor> bwd_cuda(
+    int heads,
+    torch::Tensor const& output_grads,
+    torch::Tensor const& matmul2_results,
+    torch::Tensor const& dropout_results,
+    torch::Tensor const& softmax_results,
+    torch::Tensor const& input_lin_results,
+    torch::Tensor const& inputs,
+    torch::Tensor const& input_weights,
+    torch::Tensor const& output_weights,
+    torch::Tensor const& dropout_mask,
+    float dropout_prob);
+
+// C++ interface
+
+#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+// Python-facing forward entry point for self multihead attention.
+//
+// Validates the tensor arguments and then delegates to fwd_cuda().
+//
+// Checks performed (each failure trips AT_ASSERTM):
+//   - inputs is 3D; input_weights and output_weights are 2D.
+//   - inputs, input_weights and output_weights are Half.
+//   - when use_mask is true: pad_mask is a 2D, Byte, CUDA, contiguous tensor.
+//
+// When use_mask is false pad_mask is never inspected and fwd_cuda() receives
+// nullptr in its place. All other arguments are forwarded unchanged.
+//
+// Returns whatever fwd_cuda() returns.
+std::vector<torch::Tensor> fwd(
+                               bool                 use_mask,
+                               bool                 use_time_mask,
+                               bool                 is_training,
+                               int                  heads,
+                               torch::Tensor const& inputs,
+                               torch::Tensor const& input_weights,
+                               torch::Tensor const& output_weights,
+                               torch::Tensor const& pad_mask,
+                               float                dropout_prob
+                                                 )
+{
+  AT_ASSERTM(inputs.dim()         == 3, "expected 3D tensor");
+  AT_ASSERTM(input_weights.dim()  == 2, "expected 2D tensor");
+  AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
+
+  AT_ASSERTM(inputs.type().scalarType()         == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_weights.type().scalarType()  == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+
+  if (use_mask) {
+    AT_ASSERTM(pad_mask.dim()                     == 2,                    "expected 2D tensor");
+    AT_ASSERTM(pad_mask.type().scalarType()       == at::ScalarType::Byte, "Only BYTE is supported");
+    // The mask is handed to fwd_cuda() as a bare pointer, which carries no
+    // device or stride information. Reject host-resident or strided masks
+    // here instead of letting the callee walk the wrong memory.
+    AT_ASSERTM(pad_mask.type().is_cuda(),                                  "pad_mask must be a CUDA tensor");
+    AT_ASSERTM(pad_mask.is_contiguous(),                                   "pad_mask must be contiguous");
+  }
+
+  return fwd_cuda(
+                                 use_time_mask,
+                                 is_training,
+                                 heads,
+                                 inputs,
+                                 input_weights,
+                                 output_weights,
+                                 use_mask ? static_cast<const uint8_t*>(pad_mask.data_ptr()) : nullptr,
+                                 dropout_prob
+                                );
+}
+
+// Python-facing backward entry point for self multihead attention.
+//
+// Verifies the rank and dtype of every tensor argument, then hands all
+// arguments, unchanged and in the same order, to bwd_cuda() and returns its
+// result.
+//
+// Rank: input_weights and output_weights must be 2D; all other tensors 3D.
+// Dtype: dropout_mask must be Byte; all other tensors Half.
+// A violated requirement trips the corresponding AT_ASSERTM.
+std::vector<torch::Tensor> bwd(
+    int heads,
+    torch::Tensor const& output_grads,
+    torch::Tensor const& matmul2_results,
+    torch::Tensor const& dropout_results,
+    torch::Tensor const& softmax_results,
+    torch::Tensor const& input_lin_results,
+    torch::Tensor const& inputs,
+    torch::Tensor const& input_weights,
+    torch::Tensor const& output_weights,
+    torch::Tensor const& dropout_mask,
+    float dropout_prob)
+{
+  // Rank checks.
+  AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(matmul2_results.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(dropout_results.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(input_lin_results.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(inputs.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(input_weights.dim() == 2, "expected 2D tensor");
+  AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
+  AT_ASSERTM(dropout_mask.dim() == 3, "expected 3D tensor");
+
+  // Element type checks.
+  AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(matmul2_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(dropout_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(softmax_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_lin_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(inputs.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_weights.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(dropout_mask.type().scalarType() == at::ScalarType::Byte, "Only BYTE is supported");
+
+  return bwd_cuda(heads,
+                  output_grads,
+                  matmul2_results,
+                  dropout_results,
+                  softmax_results,
+                  input_lin_results,
+                  inputs,
+                  input_weights,
+                  output_weights,
+                  dropout_mask,
+                  dropout_prob);
+}
+
+} // end namespace cublas_gemmex
+} // end namespace self
+} // end namespace multihead_attn
+
+// Python bindings for this extension: registers the fwd/bwd wrappers from
+// multihead_attn::self::cublas_gemmex as "forward" and "backward".
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  namespace impl = multihead_attn::self::cublas_gemmex;
+
+  m.def("forward",
+        &impl::fwd,
+        "Self Multihead Attention Forward.");
+  m.def("backward",
+        &impl::bwd,
+        "Self Multihead Attention Backward.");
+}
+
--- a/apex/contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu
+++ b/apex/contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu
--- a/apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add.cpp
+++ b/apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add.cpp
+#include <torch/extension.h>
+#include <vector>
+
+namespace multihead_attn {
+namespace self_norm_add {
+namespace cublas_gemmex {
+
+// Forward declaration of the forward-pass implementation. It is not defined in
+// this file (presumably it lives in the companion CUDA translation unit --
+// confirm against the build).
+//
+// pad_mask is a raw byte pointer rather than a tensor; the fwd() wrapper in
+// this file passes nullptr when masking is disabled.
+std::vector<torch::Tensor> fwd_cuda(
+    bool use_time_mask,
+    bool is_training,
+    int heads,
+    torch::Tensor const& inputs,
+    torch::Tensor const& lyr_nrm_gamma_weights,
+    torch::Tensor const& lyr_nrm_beta_weights,
+    torch::Tensor const& input_weights,
+    torch::Tensor const& output_weights,
+    const uint8_t* pad_mask,
+    float dropout_prob);
+
+// Forward declaration of the backward-pass implementation. It is not defined
+// in this file (presumably it lives in the companion CUDA translation unit --
+// confirm against the build). The bwd() wrapper in this file validates the
+// arguments and then calls this function with them unchanged.
+std::vector<torch::Tensor> bwd_cuda(
+    int heads,
+    torch::Tensor const& output_grads,
+    torch::Tensor const& matmul2_results,
+    torch::Tensor const& dropout_results,
+    torch::Tensor const& softmax_results,
+    torch::Tensor const& input_lin_results,
+    torch::Tensor const& lyr_nrm_results,
+    torch::Tensor const& lyr_nrm_mean,
+    torch::Tensor const& lyr_nrm_invvar,
+    torch::Tensor const& inputs,
+    torch::Tensor const& lyr_nrm_gamma_weights,
+    torch::Tensor const& lyr_nrm_beta_weights,
+    torch::Tensor const& input_weights,
+    torch::Tensor const& output_weights,
+    torch::Tensor const& dropout_mask,
+    torch::Tensor const& dropout_add_mask,
+    float dropout_prob);
+
+// C++ interface
+
+#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+// Python-facing forward entry point for self multihead attention with layer
+// norm and residual add.
+//
+// Validates the tensor arguments and then delegates to fwd_cuda().
+//
+// Checks performed (each failure trips AT_ASSERTM):
+//   - inputs is 3D; lyr_nrm_gamma_weights and lyr_nrm_beta_weights are 1D;
+//     input_weights and output_weights are 2D.
+//   - all five of those tensors are Half.
+//   - when use_mask is true: pad_mask is a 2D, Byte, CUDA, contiguous tensor.
+//
+// When use_mask is false pad_mask is never inspected and fwd_cuda() receives
+// nullptr in its place. All other arguments are forwarded unchanged.
+//
+// Returns whatever fwd_cuda() returns.
+std::vector<torch::Tensor> fwd(
+                               bool                 use_mask,
+                               bool                 use_time_mask,
+                               bool                 is_training,
+                               int                  heads,
+                               torch::Tensor const& inputs,
+                               torch::Tensor const& lyr_nrm_gamma_weights,
+                               torch::Tensor const& lyr_nrm_beta_weights,
+                               torch::Tensor const& input_weights,
+                               torch::Tensor const& output_weights,
+                               torch::Tensor const& pad_mask,
+                               float                dropout_prob
+                                                 )
+{
+  AT_ASSERTM(inputs.dim()                 == 3, "expected 3D tensor");
+  AT_ASSERTM(lyr_nrm_gamma_weights.dim()  == 1, "expected 1D tensor");
+  AT_ASSERTM(lyr_nrm_beta_weights.dim()   == 1, "expected 1D tensor");
+  AT_ASSERTM(input_weights.dim()          == 2, "expected 2D tensor");
+  AT_ASSERTM(output_weights.dim()         == 2, "expected 2D tensor");
+
+  AT_ASSERTM(inputs.type().scalarType()                == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(lyr_nrm_gamma_weights.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(lyr_nrm_beta_weights.type().scalarType()  == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_weights.type().scalarType()         == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(output_weights.type().scalarType()        == at::ScalarType::Half, "Only HALF is supported");
+
+  if (use_mask) {
+    AT_ASSERTM(pad_mask.dim()                     == 2,                    "expected 2D tensor");
+    AT_ASSERTM(pad_mask.type().scalarType()       == at::ScalarType::Byte, "Only BYTE is supported");
+    // The mask is handed to fwd_cuda() as a bare pointer, which carries no
+    // device or stride information. Reject host-resident or strided masks
+    // here instead of letting the callee walk the wrong memory.
+    AT_ASSERTM(pad_mask.type().is_cuda(),                                  "pad_mask must be a CUDA tensor");
+    AT_ASSERTM(pad_mask.is_contiguous(),                                   "pad_mask must be contiguous");
+  }
+
+  return fwd_cuda(
+                                 use_time_mask,
+                                 is_training,
+                                 heads,
+                                 inputs,
+                                 lyr_nrm_gamma_weights,
+                                 lyr_nrm_beta_weights,
+                                 input_weights,
+                                 output_weights,
+                                 use_mask ? static_cast<const uint8_t*>(pad_mask.data_ptr()) : nullptr,
+                                 dropout_prob
+                                );
+}
+
+
+// Python-facing backward entry point for self multihead attention with layer
+// norm and residual add.
+//
+// Verifies the rank and dtype of every tensor argument, then hands all
+// arguments, unchanged and in the same order, to bwd_cuda() and returns its
+// result.
+//
+// Rank: lyr_nrm_mean, lyr_nrm_invvar, lyr_nrm_gamma_weights and
+//       lyr_nrm_beta_weights must be 1D; input_weights and output_weights
+//       must be 2D; all other tensors 3D.
+// Dtype: lyr_nrm_mean and lyr_nrm_invvar must be Float; dropout_mask and
+//        dropout_add_mask must be Byte; all other tensors Half.
+// A violated requirement trips the corresponding AT_ASSERTM.
+std::vector<torch::Tensor> bwd(
+    int heads,
+    torch::Tensor const& output_grads,
+    torch::Tensor const& matmul2_results,
+    torch::Tensor const& dropout_results,
+    torch::Tensor const& softmax_results,
+    torch::Tensor const& input_lin_results,
+    torch::Tensor const& lyr_nrm_results,
+    torch::Tensor const& lyr_nrm_mean,
+    torch::Tensor const& lyr_nrm_invvar,
+    torch::Tensor const& inputs,
+    torch::Tensor const& lyr_nrm_gamma_weights,
+    torch::Tensor const& lyr_nrm_beta_weights,
+    torch::Tensor const& input_weights,
+    torch::Tensor const& output_weights,
+    torch::Tensor const& dropout_mask,
+    torch::Tensor const& dropout_add_mask,
+    float dropout_prob)
+{
+  // Rank checks.
+  AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(matmul2_results.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(dropout_results.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(input_lin_results.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(lyr_nrm_results.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(lyr_nrm_mean.dim() == 1, "expected 1D tensor");
+  AT_ASSERTM(lyr_nrm_invvar.dim() == 1, "expected 1D tensor");
+  AT_ASSERTM(inputs.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(lyr_nrm_gamma_weights.dim() == 1, "expected 1D tensor");
+  AT_ASSERTM(lyr_nrm_beta_weights.dim() == 1, "expected 1D tensor");
+  AT_ASSERTM(input_weights.dim() == 2, "expected 2D tensor");
+  AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
+  AT_ASSERTM(dropout_mask.dim() == 3, "expected 3D tensor");
+  AT_ASSERTM(dropout_add_mask.dim() == 3, "expected 3D tensor");
+
+  // Element type checks.
+  AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(matmul2_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(dropout_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(softmax_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_lin_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(lyr_nrm_results.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(lyr_nrm_mean.type().scalarType() == at::ScalarType::Float, "Only FLOAT is supported");
+  AT_ASSERTM(lyr_nrm_invvar.type().scalarType() == at::ScalarType::Float, "Only FLOAT is supported");
+  AT_ASSERTM(inputs.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(lyr_nrm_gamma_weights.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(lyr_nrm_beta_weights.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(input_weights.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half, "Only HALF is supported");
+  AT_ASSERTM(dropout_mask.type().scalarType() == at::ScalarType::Byte, "Only BYTE is supported");
+  AT_ASSERTM(dropout_add_mask.type().scalarType() == at::ScalarType::Byte, "Only BYTE is supported");
+
+  return bwd_cuda(heads,
+                  output_grads,
+                  matmul2_results,
+                  dropout_results,
+                  softmax_results,
+                  input_lin_results,
+                  lyr_nrm_results,
+                  lyr_nrm_mean,
+                  lyr_nrm_invvar,
+                  inputs,
+                  lyr_nrm_gamma_weights,
+                  lyr_nrm_beta_weights,
+                  input_weights,
+                  output_weights,
+                  dropout_mask,
+                  dropout_add_mask,
+                  dropout_prob);
+}
+
+} // end namespace cublas_gemmex
+} // end namespace self_norm_add 
+} // end namespace multihead_attn
+
+// Python bindings for this extension: registers the fwd/bwd wrappers from
+// multihead_attn::self_norm_add::cublas_gemmex as "forward" and "backward".
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  namespace impl = multihead_attn::self_norm_add::cublas_gemmex;
+
+  m.def("forward",
+        &impl::fwd,
+        "Self Multihead Attention Plus Layer Norm and Residual Add Forward.");
+  m.def("backward",
+        &impl::bwd,
+        "Self Multihead Attention Plus Layer Norm and Residual Add Backward.");
+}
+
--- a/apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu
+++ b/apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu
--- a/apex/contrib/csrc/multihead_attn/softmax.h
+++ b/apex/contrib/csrc/multihead_attn/softmax.h
--- a/apex/contrib/csrc/multihead_attn/strided_batched_gemm.h
+++ b/apex/contrib/csrc/multihead_attn/strided_batched_gemm.h
--- a/apex/contrib/examples/multihead_attn/perf_test_multihead_attn.py
+++ b/apex/contrib/examples/multihead_attn/perf_test_multihead_attn.py
--- a/apex/contrib/multihead_attn/MHA_bwd.png
+++ b/apex/contrib/multihead_attn/MHA_bwd.png
--- a/apex/contrib/multihead_attn/MHA_fwd.png
+++ b/apex/contrib/multihead_attn/MHA_fwd.png
--- a/apex/contrib/multihead_attn/README.md
+++ b/apex/contrib/multihead_attn/README.md
--- a/apex/contrib/multihead_attn/__init__.py
+++ b/apex/contrib/multihead_attn/__init__.py
+from .self_multihead_attn import SelfMultiheadAttn
+from .encdec_multihead_attn import EncdecMultiheadAttn
--- a/apex/contrib/multihead_attn/encdec_multihead_attn.py
+++ b/apex/contrib/multihead_attn/encdec_multihead_attn.py