Unverified commit 7ec8ed67 authored by Masaki Kozuki, committed by GitHub

Faster `--fast_multihead_attn` build (#1245)

* merge .so files

* odr

* fix build

* update import

* apply psf/black with max line length of 120

* update

* fix

* update

* build fixed again but undefined symbol again

* fix 2, still layer norm grad is undefined

* remove unused cpp files

* without layer_norm.cuh, import works

* import fast_multihead_attn works...

But why? Was the unnecessary `#include "layer_norm.cuh"` the culprit that kept the shared objects from being able to link `HostApplyLayerNorm` and
`HostLayerNormGradient`?

* clean up layer norm
parent ed94d0bb
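The "odr" and "undefined symbol" commits above come down to one pattern: once several translation units that used to be built as separate extensions are linked into a single fast_multihead_attn shared object, file-local helpers and constants defined with external linkage collide, while helpers that a header hides inside an unnamed namespace become private to every includer and can no longer be resolved across translation units (one plausible reading of why the stray layer_norm.cuh include left `HostApplyLayerNorm` and `HostLayerNormGradient` undefined; the commit message itself only speculates). A minimal sketch of the internal-linkage pattern the hunks below apply, with illustrative names that are not from this commit:

// Helpers local to one .cu file of the merged extension.
namespace {                       // unnamed namespace => internal linkage
constexpr int UNROLL = 4;         // same idea as the UNROLL hunk further down
__device__ float inv_keep_prob(float p) {
  // Would become a duplicate symbol if two .cu files of the merged .so
  // defined it with external linkage.
  return 1.0f / (1.0f - p);
}
} // namespace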
@@ -27,7 +27,7 @@
#pragma once
-#include <multihead_attn/philox.h>
+#include <multihead_attn/philox.cuh>
#include <fmha.h>
#include <fmha/utils.h>
......
#include <cuda_fp16.h>
#include <torch/extension.h>
#include <vector>
namespace multihead_attn {
namespace fused_softmax {
namespace additive_mask_softmax_dropout {
std::vector<torch::Tensor> fwd_cuda(bool is_training, int heads,
torch::Tensor const &input,
const half *pad_mask, float dropout_prob);
torch::Tensor bwd_cuda(int heads, torch::Tensor const &output_grads,
torch::Tensor const &softmax_results,
torch::Tensor const &dropout_mask, float dropout_prob);
// C++ interface
#define CHECK_CUDA(x) \
AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
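The CHECK_* macros above are defined here, but the bindings below validate their arguments with AT_ASSERTM directly. For reference, a small hypothetical example of how CHECK_INPUT is meant to be applied to a tensor argument (the function name is illustrative, not from the diff):

// CHECK_INPUT(x) expands to a CUDA-placement check followed by a contiguity check.
torch::Tensor checked_identity(torch::Tensor const &input) {
  CHECK_INPUT(input);  // fails with "input must be a CUDA tensor" / "input must be contiguous"
  return input;
}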
std::vector<torch::Tensor> fwd(bool use_mask, bool is_training, int heads,
torch::Tensor const &input,
torch::Tensor const &pad_mask,
float dropout_prob) {
AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
if (use_mask) {
AT_ASSERTM(pad_mask.dim() == 2, "expected 2D tensor");
AT_ASSERTM(pad_mask.type().scalarType() == at::ScalarType::Half,
"Only BYTE is supported");
}
return fwd_cuda(is_training, heads, input,
use_mask ? static_cast<const half *>(pad_mask.data_ptr())
: nullptr,
dropout_prob);
}
torch::Tensor bwd(bool use_mask, int heads, torch::Tensor const &output_grads,
torch::Tensor const &softmax_results,
torch::Tensor const &dropout_mask, float dropout_prob) {
AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(dropout_mask.dim() == 3, "expected 3D tensor");
AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(softmax_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
// AT_ASSERTM(dropout_mask.type().scalarType() == at::ScalarType::Byte,
// "Only BYTE is supported");
return bwd_cuda(heads, output_grads, softmax_results, dropout_mask,
dropout_prob);
}
} // namespace additive_mask_softmax_dropout
} // end namespace fused_softmax
} // end namespace multihead_attn
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward",
&multihead_attn::fused_softmax::additive_mask_softmax_dropout::fwd,
"Self Multihead Attention masked softmax dropout -- Forward.");
m.def("backward",
&multihead_attn::fused_softmax::additive_mask_softmax_dropout::bwd,
"Self Multihead Attention masked softmax dropout -- Backward.");
}
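Since the point of the PR is to merge many per-op .so files into one fast_multihead_attn extension, per-file module blocks like the one above presumably end up registered on a single TORCH_EXTENSION_NAME module. A sketch of one possible consolidated binding under that assumption; the exported names here are illustrative, not necessarily the ones the commit uses:

// Hypothetical consolidated binding: each op family registers its forward and
// backward entry points on the same module so only one shared object is built.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("additive_mask_softmax_dropout_forward",
        &multihead_attn::fused_softmax::additive_mask_softmax_dropout::fwd);
  m.def("additive_mask_softmax_dropout_backward",
        &multihead_attn::fused_softmax::additive_mask_softmax_dropout::bwd);
  m.def("encdec_multihead_attn_forward",
        &multihead_attn::encdec::cublas_gemmex::fwd);
  m.def("encdec_multihead_attn_backward",
        &multihead_attn::encdec::cublas_gemmex::bwd);
  // ... the remaining op families register their fwd/bwd pairs the same way.
}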
@@ -11,8 +11,8 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
-#include "dropout.h"
-#include "softmax.h"
+#include "dropout.cuh"
+#include "softmax.cuh"
// symbol to be automatically resolved by PyTorch libs
@@ -27,7 +27,7 @@ std::vector<torch::Tensor> fwd_cuda(bool is_training, int heads,
const int sequences = attn_batches / heads;
const int q_seq_len = input.size(1);
const int k_seq_len = q_seq_len;
-const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
+// const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
// There is no reason to use more than one stream as every kernel is
// sequentially dependent
@@ -86,7 +86,7 @@ torch::Tensor bwd_cuda(int heads, torch::Tensor const &output_grads,
const int attn_batches = output_grads.size(0);
const int q_seq_len = output_grads.size(1);
const int k_seq_len = q_seq_len;
-const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
+// const int dropout_elems = attn_batches * q_seq_len * k_seq_len;
// TODO: Streams can be used in Backprop but I haven't added more than one
// in my first attempt to create the code
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
......
#pragma once
#include <ATen/ATen.h>
#ifdef OLD_GENERATOR
@@ -9,7 +10,9 @@
#include <ATen/cuda/CUDAContext.h>
#include <curand_kernel.h>
-const int UNROLL = 4;
+namespace {
+constexpr int UNROLL = 4;
+} // namespace
template <typename scalar_t, typename accscalar_t, typename IndexType>
__global__ void
......
#include <torch/extension.h>
#include <vector>
namespace multihead_attn {
namespace encdec {
namespace cublas_gemmex {
std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training,
int heads, torch::Tensor const &inputs_q,
torch::Tensor const &inputs_kv,
torch::Tensor const &input_weights_q,
torch::Tensor const &input_weights_kv,
torch::Tensor const &output_weights,
const uint8_t *pad_mask,
float dropout_prob);
std::vector<torch::Tensor> bwd_cuda(
int heads, torch::Tensor const &output_grads,
torch::Tensor const &matmul2_results, torch::Tensor const &dropout_results,
torch::Tensor const &softmax_results,
torch::Tensor const &input_lin_q_results,
torch::Tensor const &input_lin_kv_results, torch::Tensor const &inputs_q,
torch::Tensor const &inputs_kv, torch::Tensor const &input_weights_q,
torch::Tensor const &input_weights_kv, torch::Tensor const &output_weights,
torch::Tensor const &dropout_mask, float dropout_prob);
// C++ interface
#define CHECK_CUDA(x) \
AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor>
fwd(bool use_mask, bool use_time_mask, bool is_training, int heads,
torch::Tensor const &inputs_q, torch::Tensor const &inputs_kv,
torch::Tensor const &input_weights_q, torch::Tensor const &input_weights_kv,
torch::Tensor const &output_weights, torch::Tensor const &pad_mask,
float dropout_prob) {
AT_ASSERTM(inputs_q.dim() == 3, "expected 3D tensor");
AT_ASSERTM(inputs_kv.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_weights_q.dim() == 2, "expected 2D tensor");
AT_ASSERTM(input_weights_kv.dim() == 2, "expected 2D tensor");
AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(inputs_q.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(inputs_kv.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights_q.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights_kv.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
if (use_mask) {
AT_ASSERTM(pad_mask.dim() == 2, "expected 2D tensor");
AT_ASSERTM(pad_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
}
return fwd_cuda(use_time_mask, is_training, heads, inputs_q, inputs_kv,
input_weights_q, input_weights_kv, output_weights,
use_mask ? static_cast<const uint8_t *>(pad_mask.data_ptr())
: nullptr,
dropout_prob);
}
std::vector<torch::Tensor>
bwd(int heads, torch::Tensor const &output_grads,
torch::Tensor const &matmul2_results, torch::Tensor const &dropout_results,
torch::Tensor const &softmax_results,
torch::Tensor const &input_lin_q_results,
torch::Tensor const &input_lin_kv_results, torch::Tensor const &inputs_q,
torch::Tensor const &inputs_kv, torch::Tensor const &input_weights_q,
torch::Tensor const &input_weights_kv, torch::Tensor const &output_weights,
torch::Tensor const &dropout_mask, float dropout_prob) {
AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
AT_ASSERTM(matmul2_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(dropout_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_lin_q_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_lin_kv_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(inputs_q.dim() == 3, "expected 3D tensor");
AT_ASSERTM(inputs_kv.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_weights_q.dim() == 2, "expected 2D tensor");
AT_ASSERTM(input_weights_kv.dim() == 2, "expected 2D tensor");
AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(dropout_mask.dim() == 3, "expected 3D tensor");
AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(matmul2_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(dropout_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(softmax_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_lin_q_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_lin_kv_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(inputs_q.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(inputs_kv.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights_q.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights_kv.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(dropout_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
return bwd_cuda(heads, output_grads, matmul2_results, dropout_results,
softmax_results, input_lin_q_results, input_lin_kv_results,
inputs_q, inputs_kv, input_weights_q, input_weights_kv,
output_weights, dropout_mask, dropout_prob);
}
} // end namespace cublas_gemmex
} // end namespace encdec
} // end namespace multihead_attn
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &multihead_attn::encdec::cublas_gemmex::fwd,
"Encdec Multihead Attention Forward.");
m.def("backward", &multihead_attn::encdec::cublas_gemmex::bwd,
"Encdec Multihead Attention Backward.");
}
@@ -11,10 +11,9 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
-#include "dropout.h"
-#include "layer_norm.h"
-#include "softmax.h"
-#include "strided_batched_gemm.h"
+#include "dropout.cuh"
+#include "softmax.cuh"
+#include "strided_batched_gemm.cuh"
namespace multihead_attn {
namespace encdec {
......
#include <torch/extension.h>
#include <vector>
namespace multihead_attn {
namespace encdec_norm_add {
namespace cublas_gemmex {
std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training,
int heads, torch::Tensor const &inputs_q,
torch::Tensor const &inputs_kv,
torch::Tensor const &lyr_nrm_gamma_weights,
torch::Tensor const &lyr_nrm_beta_weights,
torch::Tensor const &input_weights_q,
torch::Tensor const &input_weights_kv,
torch::Tensor const &output_weights,
const uint8_t *pad_mask,
float dropout_prob);
std::vector<torch::Tensor> bwd_cuda(
int heads, torch::Tensor const &output_grads,
torch::Tensor const &matmul2_results, torch::Tensor const &dropout_results,
torch::Tensor const &softmax_results,
torch::Tensor const &input_lin_q_results,
torch::Tensor const &input_lin_kv_results,
torch::Tensor const &lyr_nrm_results, torch::Tensor const &lyr_nrm_mean,
torch::Tensor const &lyr_nrm_invvar, torch::Tensor const &inputs_q,
torch::Tensor const &inputs_kv, torch::Tensor const &lyr_nrm_gamma_weights,
torch::Tensor const &lyr_nrm_beta_weights,
torch::Tensor const &input_weights_q, torch::Tensor const &input_weights_kv,
torch::Tensor const &output_weights, torch::Tensor const &dropout_mask,
torch::Tensor const &dropout_add_mask, float dropout_prob);
// C++ interface
#define CHECK_CUDA(x) \
AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor>
fwd(bool use_mask, bool use_time_mask, bool is_training, int heads,
torch::Tensor const &inputs_q, torch::Tensor const &inputs_kv,
torch::Tensor const &lyr_nrm_gamma_weights,
torch::Tensor const &lyr_nrm_beta_weights,
torch::Tensor const &input_weights_q, torch::Tensor const &input_weights_kv,
torch::Tensor const &output_weights, torch::Tensor const &pad_mask,
float dropout_prob) {
AT_ASSERTM(inputs_q.dim() == 3, "expected 3D tensor");
AT_ASSERTM(inputs_kv.dim() == 3, "expected 3D tensor");
AT_ASSERTM(lyr_nrm_gamma_weights.dim() == 1, "expected 1D tensor");
AT_ASSERTM(lyr_nrm_beta_weights.dim() == 1, "expected 1D tensor");
AT_ASSERTM(input_weights_q.dim() == 2, "expected 2D tensor");
AT_ASSERTM(input_weights_kv.dim() == 2, "expected 2D tensor");
AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(inputs_q.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(inputs_kv.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(lyr_nrm_gamma_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(lyr_nrm_beta_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights_q.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights_kv.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
if (use_mask) {
AT_ASSERTM(pad_mask.dim() == 2, "expected 2D tensor");
AT_ASSERTM(pad_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
}
return fwd_cuda(use_time_mask, is_training, heads, inputs_q, inputs_kv,
lyr_nrm_gamma_weights, lyr_nrm_beta_weights, input_weights_q,
input_weights_kv, output_weights,
use_mask ? static_cast<const uint8_t *>(pad_mask.data_ptr())
: nullptr,
dropout_prob);
}
std::vector<torch::Tensor>
bwd(int heads, torch::Tensor const &output_grads,
torch::Tensor const &matmul2_results, torch::Tensor const &dropout_results,
torch::Tensor const &softmax_results,
torch::Tensor const &input_lin_q_results,
torch::Tensor const &input_lin_kv_results,
torch::Tensor const &lyr_nrm_results, torch::Tensor const &lyr_nrm_mean,
torch::Tensor const &lyr_nrm_invvar, torch::Tensor const &inputs_q,
torch::Tensor const &inputs_kv, torch::Tensor const &lyr_nrm_gamma_weights,
torch::Tensor const &lyr_nrm_beta_weights,
torch::Tensor const &input_weights_q, torch::Tensor const &input_weights_kv,
torch::Tensor const &output_weights, torch::Tensor const &dropout_mask,
torch::Tensor const &dropout_add_mask, float dropout_prob) {
AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
AT_ASSERTM(matmul2_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(dropout_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_lin_q_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_lin_kv_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(lyr_nrm_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(lyr_nrm_mean.dim() == 1, "expected 1D tensor");
AT_ASSERTM(lyr_nrm_invvar.dim() == 1, "expected 1D tensor");
AT_ASSERTM(inputs_q.dim() == 3, "expected 3D tensor");
AT_ASSERTM(inputs_kv.dim() == 3, "expected 3D tensor");
AT_ASSERTM(lyr_nrm_gamma_weights.dim() == 1, "expected 1D tensor");
AT_ASSERTM(lyr_nrm_beta_weights.dim() == 1, "expected 1D tensor");
AT_ASSERTM(input_weights_q.dim() == 2, "expected 2D tensor");
AT_ASSERTM(input_weights_kv.dim() == 2, "expected 2D tensor");
AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(dropout_mask.dim() == 3, "expected 3D tensor");
AT_ASSERTM(dropout_add_mask.dim() == 3, "expected 3D tensor");
AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(matmul2_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(dropout_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(softmax_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_lin_q_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_lin_kv_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(lyr_nrm_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(lyr_nrm_mean.type().scalarType() == at::ScalarType::Float,
"Only FLOAT is supported");
AT_ASSERTM(lyr_nrm_invvar.type().scalarType() == at::ScalarType::Float,
"Only FLOAT is supported");
AT_ASSERTM(inputs_q.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(inputs_kv.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(lyr_nrm_gamma_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(lyr_nrm_beta_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights_q.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights_kv.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(dropout_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
AT_ASSERTM(dropout_add_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
return bwd_cuda(heads, output_grads, matmul2_results, dropout_results,
softmax_results, input_lin_q_results, input_lin_kv_results,
lyr_nrm_results, lyr_nrm_mean, lyr_nrm_invvar, inputs_q,
inputs_kv, lyr_nrm_gamma_weights, lyr_nrm_beta_weights,
input_weights_q, input_weights_kv, output_weights,
dropout_mask, dropout_add_mask, dropout_prob);
}
} // end namespace cublas_gemmex
} // end namespace encdec_norm_add
} // end namespace multihead_attn
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &multihead_attn::encdec_norm_add::cublas_gemmex::fwd,
"Encdec Multihead Attention Plus Layer Norm and Residual Add Forward.");
m.def(
"backward", &multihead_attn::encdec_norm_add::cublas_gemmex::bwd,
"Encdec Multihead Attention Plus Layer Norm and Residual Add Backward.");
}
@@ -11,10 +11,10 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
-#include "dropout.h"
-#include "layer_norm.h"
-#include "softmax.h"
-#include "strided_batched_gemm.h"
+#include "dropout.cuh"
+#include "layer_norm.cuh"
+#include "softmax.cuh"
+#include "strided_batched_gemm.cuh"
namespace multihead_attn {
namespace encdec_norm_add {
......
#include "ATen/ATen.h"
#include "ATen/cuda/DeviceUtils.cuh"
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include <ATen/ATen.h>
#include <ATen/cuda/DeviceUtils.cuh>
namespace {
template <typename U>
__device__ void cuWelfordOnlineSum(const U curr, U &mu, U &sigma2, U &count) {
count = count + U(1);
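The fragment above is the start of Welford's online mean/variance update that layer norm uses to accumulate statistics. For clarity, the complete update step looks like this; a plain host-side sketch that mirrors the count/mu/sigma2 naming, not the kernel code itself:

// After processing `count` samples, `mu` is their mean and `sigma2` holds the
// sum of squared deviations, so the variance is sigma2 / count.
template <typename U>
void welford_online_sum(const U curr, U &mu, U &sigma2, U &count) {
  count = count + U(1);
  U delta = curr - mu;
  mu = mu + delta / count;          // fold the new sample into the running mean
  U delta2 = curr - mu;             // deviation from the *updated* mean
  sigma2 = sigma2 + delta * delta2;
}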
@@ -203,11 +204,10 @@ __device__ void cuWelfordMuSigma2(const at::Half *__restrict__ vals,
}
}
-template <typename U> U rsqrt(U v) { return U(1) / sqrt(v); }
-template <> float rsqrt(float v) { return rsqrtf(v); }
-template <> double rsqrt(double v) { return rsqrt(v); }
+template <typename U> __device__ U rsqrt(U v) { return U(1) / sqrt(v); }
+template <> __device__ float rsqrt(float v) { return rsqrtf(v); }
+template <> __device__ double rsqrt(double v) { return rsqrt(v); }
namespace {
// This is the un-specialized struct. Note that we prevent instantiation of
// this struct by putting an undefined symbol in the function body so it won't
// compile.
@@ -224,7 +224,6 @@ namespace {
// };
// https://github.com/NVIDIA/apex/issues/246
template <typename T> struct SharedMemory;
template <> struct SharedMemory<float> {
__device__ float *getPointer() {
extern __shared__ float s_float[];
@@ -238,7 +237,6 @@ template <> struct SharedMemory<double> {
return s_double;
}
};
} // namespace
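The SharedMemory<T> specializations above work around CUDA's restriction that `extern __shared__` arrays cannot be redeclared with different element types across template instantiations (the apex issue referenced in the comment). A hedged illustration of how a templated kernel typically obtains its typed dynamic shared-memory pointer through this helper; the kernel itself is hypothetical, not from the diff:

// Illustrative kernel: each instantiation views the same dynamically sized
// shared-memory allocation through its own element type.
template <typename U>
__global__ void copy_through_shared(const U *in, U *out, int n) {
  SharedMemory<U> shared;
  U *buf = shared.getPointer();        // typed pointer to dynamic shared memory
  const int tid = threadIdx.x;
  if (tid < n) buf[tid] = in[tid];
  __syncthreads();
  if (tid < n) out[tid] = buf[tid];
}
// A launch supplies the byte size of the dynamic allocation explicitly, e.g.
// copy_through_shared<float><<<1, n, n * sizeof(float)>>>(d_in, d_out, n);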
template <typename T, typename U>
__global__ void
@@ -457,6 +455,7 @@ cuComputeGradGammaBeta(const U *part_grad_gamma, const U *part_grad_beta,
}
}
template <typename T, typename U>
__global__ void
cuComputeGradInput(const T *__restrict__ dout, const T *__restrict__ dout_resid,
@@ -634,3 +633,4 @@ void HostLayerNormGradient(const T *dout, const T *dout_resid, const U *mean,
dout, dout_resid, static_cast<T *>(input.data_ptr()), n1, n2, mean,
invvar, U(epsilon), gamma, grad_input);
}
} // namespace
#include <torch/extension.h>
#include <vector>
namespace multihead_attn {
namespace fused_softmax {
namespace mask_softmax_dropout {
std::vector<torch::Tensor> fwd_cuda(bool is_training, int heads,
torch::Tensor const &input,
const uint8_t *pad_mask,
float dropout_prob);
torch::Tensor bwd_cuda(int heads, torch::Tensor const &output_grads,
torch::Tensor const &softmax_results,
torch::Tensor const &dropout_mask,
const uint8_t *padding_mask, float dropout_prob);
// C++ interface
#define CHECK_CUDA(x) \
AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor> fwd(bool use_mask, bool is_training, int heads,
torch::Tensor const &input,
torch::Tensor const &pad_mask,
float dropout_prob) {
AT_ASSERTM(input.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
if (use_mask) {
AT_ASSERTM(pad_mask.dim() == 2, "expected 2D tensor");
AT_ASSERTM(pad_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
}
return fwd_cuda(is_training, heads, input,
use_mask ? static_cast<const uint8_t *>(pad_mask.data_ptr())
: nullptr,
dropout_prob);
}
torch::Tensor bwd(bool use_mask, int heads, torch::Tensor const &output_grads,
torch::Tensor const &softmax_results,
torch::Tensor const &dropout_mask,
torch::Tensor const &padding_mask, float dropout_prob) {
AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(dropout_mask.dim() == 3, "expected 3D tensor");
AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(softmax_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
// AT_ASSERTM(dropout_mask.type().scalarType() == at::ScalarType::Byte,
// "Only BYTE is supported");
return bwd_cuda(heads, output_grads, softmax_results, dropout_mask,
use_mask
? static_cast<const uint8_t *>(padding_mask.data_ptr())
: nullptr,
dropout_prob);
}
} // end namespace mask_softmax_dropout
} // end namespace fused_softmax
} // end namespace multihead_attn
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &multihead_attn::fused_softmax::mask_softmax_dropout::fwd,
"Self Multihead Attention masked softmax dropout -- Forward.");
m.def("backward", &multihead_attn::fused_softmax::mask_softmax_dropout::bwd,
"Self Multihead Attention masked softmax dropout -- Backward.");
}
@@ -11,8 +11,8 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
-#include "dropout.h"
-#include "softmax.h"
+#include "dropout.cuh"
+#include "softmax.cuh"
namespace multihead_attn {
namespace fused_softmax {
......
This diff is collapsed.
#pragma once
// Philox CUDA.
namespace {
class Philox {
public:
__device__ inline Philox(unsigned long long seed,
@@ -85,8 +87,10 @@ private:
static const unsigned long kPhiloxSB = 0xCD9E8D57;
};
// Inverse of 2^32.
-#define M_RAN_INVM32 2.3283064e-10f
+constexpr float M_RAN_INVM32 = 2.3283064e-10f;
__device__ __inline__ float4 uniform4(uint4 x) {
return make_float4(x.x * M_RAN_INVM32, x.y * M_RAN_INVM32, x.z * M_RAN_INVM32,
x.w * M_RAN_INVM32);
}
} // namespace
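uniform4 maps each 32-bit Philox output into [0, 1) by scaling with M_RAN_INVM32 (approximately 2^-32). A small host-side check of that arithmetic, independent of the device code above:

// 0x00000000 -> 0.0, 0x80000000 -> ~0.5; 0xFFFFFFFF is just below 1.0
// mathematically, though single-precision rounding prints it as 1.000000.
#include <cstdint>
#include <cstdio>

int main() {
  const float inv32 = 2.3283064e-10f;  // same constant as M_RAN_INVM32
  const std::uint32_t samples[3] = {0x00000000u, 0x80000000u, 0xFFFFFFFFu};
  for (std::uint32_t s : samples)
    std::printf("%08x -> %f\n", s, static_cast<double>(s * inv32));
  return 0;
}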
#include <torch/extension.h>
#include <vector>
namespace multihead_attn {
namespace self {
namespace cublas_gemmex {
std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training,
int heads, torch::Tensor const &inputs,
torch::Tensor const &input_weights,
torch::Tensor const &output_weights,
const uint8_t *pad_mask,
float dropout_prob);
std::vector<torch::Tensor> bwd_cuda(
int heads, torch::Tensor const &output_grads,
torch::Tensor const &matmul2_results, torch::Tensor const &dropout_results,
torch::Tensor const &softmax_results,
torch::Tensor const &input_lin_results, torch::Tensor const &inputs,
torch::Tensor const &input_weights, torch::Tensor const &output_weights,
torch::Tensor const &dropout_mask, float dropout_prob);
// C++ interface
#define CHECK_CUDA(x) \
AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor>
fwd(bool use_mask, bool use_time_mask, bool is_training, int heads,
torch::Tensor const &inputs, torch::Tensor const &input_weights,
torch::Tensor const &output_weights, torch::Tensor const &pad_mask,
float dropout_prob) {
AT_ASSERTM(inputs.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(inputs.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
if (use_mask) {
AT_ASSERTM(pad_mask.dim() == 2, "expected 2D tensor");
AT_ASSERTM(pad_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
}
return fwd_cuda(
use_time_mask, is_training, heads, inputs, input_weights, output_weights,
use_mask ? static_cast<const uint8_t *>(pad_mask.data_ptr()) : nullptr,
dropout_prob);
}
std::vector<torch::Tensor>
bwd(int heads, torch::Tensor const &output_grads,
torch::Tensor const &matmul2_results, torch::Tensor const &dropout_results,
torch::Tensor const &softmax_results,
torch::Tensor const &input_lin_results, torch::Tensor const &inputs,
torch::Tensor const &input_weights, torch::Tensor const &output_weights,
torch::Tensor const &dropout_mask, float dropout_prob) {
AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
AT_ASSERTM(matmul2_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(dropout_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_lin_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(inputs.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(dropout_mask.dim() == 3, "expected 3D tensor");
AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(matmul2_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(dropout_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(softmax_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_lin_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(inputs.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(dropout_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
return bwd_cuda(heads, output_grads, matmul2_results, dropout_results,
softmax_results, input_lin_results, inputs, input_weights,
output_weights, dropout_mask, dropout_prob);
}
} // end namespace cublas_gemmex
} // end namespace self
} // end namespace multihead_attn
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &multihead_attn::self::cublas_gemmex::fwd,
"Self Multihead Attention Forward.");
m.def("backward", &multihead_attn::self::cublas_gemmex::bwd,
"Self Multihead Attention Backward.");
}
#include <torch/extension.h>
#include <vector>
namespace multihead_attn {
namespace self_bias {
namespace cublas_gemmex {
std::vector<torch::Tensor>
fwd_cuda(bool use_time_mask, bool is_training, int heads,
torch::Tensor const &inputs, torch::Tensor const &input_weights,
torch::Tensor const &output_weights, torch::Tensor const &input_biases,
torch::Tensor const &output_biases, const uint8_t *pad_mask,
float dropout_prob);
std::vector<torch::Tensor> bwd_cuda(
int heads, torch::Tensor const &output_grads,
torch::Tensor const &matmul2_results, torch::Tensor const &dropout_results,
torch::Tensor const &softmax_results,
torch::Tensor const &input_lin_results, torch::Tensor const &inputs,
torch::Tensor const &input_weights, torch::Tensor const &output_weights,
// torch::Tensor const& input_biases,
// torch::Tensor const& output_biases,
torch::Tensor const &dropout_mask, float dropout_prob);
// C++ interface
#define CHECK_CUDA(x) \
AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor>
fwd(bool use_mask, bool use_time_mask, bool is_training, int heads,
torch::Tensor const &inputs, torch::Tensor const &input_weights,
torch::Tensor const &output_weights, torch::Tensor const &input_biases,
torch::Tensor const &output_biases, torch::Tensor const &pad_mask,
float dropout_prob) {
AT_ASSERTM(inputs.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(inputs.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
if (use_mask) {
AT_ASSERTM(pad_mask.dim() == 2, "expected 2D tensor");
AT_ASSERTM(pad_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
}
return fwd_cuda(use_time_mask, is_training, heads, inputs, input_weights,
output_weights, input_biases, output_biases,
use_mask ? static_cast<const uint8_t *>(pad_mask.data_ptr())
: nullptr,
dropout_prob);
}
std::vector<torch::Tensor>
bwd(int heads, torch::Tensor const &output_grads,
torch::Tensor const &matmul2_results, torch::Tensor const &dropout_results,
torch::Tensor const &softmax_results,
torch::Tensor const &input_lin_results, torch::Tensor const &inputs,
torch::Tensor const &input_weights, torch::Tensor const &output_weights,
torch::Tensor const &dropout_mask, float dropout_prob) {
AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
AT_ASSERTM(matmul2_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(dropout_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_lin_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(inputs.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(dropout_mask.dim() == 3, "expected 3D tensor");
AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(matmul2_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(dropout_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(softmax_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_lin_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(inputs.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(dropout_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
return bwd_cuda(heads, output_grads, matmul2_results, dropout_results,
softmax_results, input_lin_results, inputs, input_weights,
output_weights, dropout_mask, dropout_prob);
}
} // end namespace cublas_gemmex
} // namespace self_bias
} // end namespace multihead_attn
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &multihead_attn::self_bias::cublas_gemmex::fwd,
"Self Multihead Attention with Bias -- Forward.");
m.def("backward", &multihead_attn::self_bias::cublas_gemmex::bwd,
"Self Multihead Attention with Bias -- Backward.");
}
#include <cuda_fp16.h>
#include <torch/extension.h>
#include <vector>
namespace multihead_attn {
namespace self_bias_additive_mask {
namespace cublas_gemmex {
std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training,
int heads, torch::Tensor const &inputs,
torch::Tensor const &input_weights,
torch::Tensor const &output_weights,
torch::Tensor const &input_biases,
torch::Tensor const &output_biases,
const half *pad_mask, float dropout_prob);
std::vector<torch::Tensor> bwd_cuda(
int heads, torch::Tensor const &output_grads,
torch::Tensor const &matmul2_results, torch::Tensor const &dropout_results,
// torch::Tensor const& softmax_results,
torch::Tensor const &bmm1_results, torch::Tensor const &pad_mask,
torch::Tensor const &input_lin_results, torch::Tensor const &inputs,
torch::Tensor const &input_weights, torch::Tensor const &output_weights,
// torch::Tensor const& input_biases,
// torch::Tensor const& output_biases,
torch::Tensor const &dropout_mask, float dropout_prob);
// C++ interface
#define CHECK_CUDA(x) \
AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor>
fwd(bool use_mask, bool use_time_mask, bool is_training, int heads,
torch::Tensor const &inputs, torch::Tensor const &input_weights,
torch::Tensor const &output_weights, torch::Tensor const &input_biases,
torch::Tensor const &output_biases, torch::Tensor const &pad_mask,
float dropout_prob) {
AT_ASSERTM(inputs.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(inputs.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(use_mask, "no mask is not supported");
if (use_mask) {
AT_ASSERTM(pad_mask.dim() == 2, "expected 2D tensor");
AT_ASSERTM(pad_mask.type().scalarType() == at::ScalarType::Half,
"Only Half is supported");
}
return fwd_cuda(use_time_mask, is_training, heads, inputs, input_weights,
output_weights, input_biases, output_biases,
use_mask ? static_cast<const half *>(pad_mask.data_ptr())
: nullptr,
dropout_prob);
}
std::vector<torch::Tensor>
bwd(int heads, torch::Tensor const &output_grads,
torch::Tensor const &matmul2_results, torch::Tensor const &dropout_results,
torch::Tensor const &bmm1_results, torch::Tensor const &pad_mask,
torch::Tensor const &input_lin_results, torch::Tensor const &inputs,
torch::Tensor const &input_weights, torch::Tensor const &output_weights,
torch::Tensor const &dropout_mask, float dropout_prob) {
AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
AT_ASSERTM(matmul2_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(dropout_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_lin_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(inputs.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(dropout_mask.dim() == 3, "expected 3D tensor");
AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(matmul2_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(dropout_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_lin_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(inputs.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(dropout_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
return bwd_cuda(heads, output_grads, matmul2_results, dropout_results,
bmm1_results, pad_mask, input_lin_results, inputs,
input_weights, output_weights, dropout_mask, dropout_prob);
}
} // end namespace cublas_gemmex
} // namespace self_bias_additive_mask
} // end namespace multihead_attn
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &multihead_attn::self_bias_additive_mask::cublas_gemmex::fwd,
"Self Multihead Attention with Bias -- Forward.");
m.def("backward",
&multihead_attn::self_bias_additive_mask::cublas_gemmex::bwd,
"Self Multihead Attention with Bias -- Backward.");
}
@@ -11,10 +11,9 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
-#include "dropout.h"
-#include "layer_norm.h"
-#include "softmax.h"
-#include "strided_batched_gemm.h"
+#include "dropout.cuh"
+#include "softmax.cuh"
+#include "strided_batched_gemm.cuh"
namespace multihead_attn {
namespace self_bias_additive_mask {
......
@@ -11,10 +11,9 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
-#include "dropout.h"
-#include "layer_norm.h"
-#include "softmax.h"
-#include "strided_batched_gemm.h"
+#include "dropout.cuh"
+#include "softmax.cuh"
+#include "strided_batched_gemm.cuh"
namespace multihead_attn {
namespace self_bias {
......
@@ -11,10 +11,9 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
-#include "dropout.h"
-#include "layer_norm.h"
-#include "softmax.h"
-#include "strided_batched_gemm.h"
+#include "dropout.cuh"
+#include "softmax.cuh"
+#include "strided_batched_gemm.cuh"
namespace multihead_attn {
namespace self {
......
#include <torch/extension.h>
#include <vector>
namespace multihead_attn {
namespace self_norm_add {
namespace cublas_gemmex {
std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training,
int heads, torch::Tensor const &inputs,
torch::Tensor const &lyr_nrm_gamma_weights,
torch::Tensor const &lyr_nrm_beta_weights,
torch::Tensor const &input_weights,
torch::Tensor const &output_weights,
const uint8_t *pad_mask,
float dropout_prob);
std::vector<torch::Tensor> bwd_cuda(
int heads, torch::Tensor const &output_grads,
torch::Tensor const &matmul2_results, torch::Tensor const &dropout_results,
torch::Tensor const &softmax_results,
torch::Tensor const &input_lin_results,
torch::Tensor const &lyr_nrm_results, torch::Tensor const &lyr_nrm_mean,
torch::Tensor const &lyr_nrm_invvar, torch::Tensor const &inputs,
torch::Tensor const &lyr_nrm_gamma_weights,
torch::Tensor const &lyr_nrm_beta_weights,
torch::Tensor const &input_weights, torch::Tensor const &output_weights,
torch::Tensor const &dropout_mask, torch::Tensor const &dropout_add_mask,
float dropout_prob);
// C++ interface
#define CHECK_CUDA(x) \
AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor>
fwd(bool use_mask, bool use_time_mask, bool is_training, int heads,
torch::Tensor const &inputs, torch::Tensor const &lyr_nrm_gamma_weights,
torch::Tensor const &lyr_nrm_beta_weights,
torch::Tensor const &input_weights, torch::Tensor const &output_weights,
torch::Tensor const &pad_mask, float dropout_prob) {
AT_ASSERTM(inputs.dim() == 3, "expected 3D tensor");
AT_ASSERTM(lyr_nrm_gamma_weights.dim() == 1, "expected 1D tensor");
AT_ASSERTM(lyr_nrm_beta_weights.dim() == 1, "expected 1D tensor");
AT_ASSERTM(input_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(inputs.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(lyr_nrm_gamma_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(lyr_nrm_beta_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
if (use_mask) {
AT_ASSERTM(pad_mask.dim() == 2, "expected 2D tensor");
AT_ASSERTM(pad_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
}
return fwd_cuda(
use_time_mask, is_training, heads, inputs, lyr_nrm_gamma_weights,
lyr_nrm_beta_weights, input_weights, output_weights,
use_mask ? static_cast<const uint8_t *>(pad_mask.data_ptr()) : nullptr,
dropout_prob);
}
std::vector<torch::Tensor>
bwd(int heads, torch::Tensor const &output_grads,
torch::Tensor const &matmul2_results, torch::Tensor const &dropout_results,
torch::Tensor const &softmax_results,
torch::Tensor const &input_lin_results,
torch::Tensor const &lyr_nrm_results, torch::Tensor const &lyr_nrm_mean,
torch::Tensor const &lyr_nrm_invvar, torch::Tensor const &inputs,
torch::Tensor const &lyr_nrm_gamma_weights,
torch::Tensor const &lyr_nrm_beta_weights,
torch::Tensor const &input_weights, torch::Tensor const &output_weights,
torch::Tensor const &dropout_mask, torch::Tensor const &dropout_add_mask,
float dropout_prob) {
AT_ASSERTM(output_grads.dim() == 3, "expected 3D tensor");
AT_ASSERTM(matmul2_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(dropout_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(softmax_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(input_lin_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(lyr_nrm_results.dim() == 3, "expected 3D tensor");
AT_ASSERTM(lyr_nrm_mean.dim() == 1, "expected 1D tensor");
AT_ASSERTM(lyr_nrm_invvar.dim() == 1, "expected 1D tensor");
AT_ASSERTM(inputs.dim() == 3, "expected 3D tensor");
AT_ASSERTM(lyr_nrm_gamma_weights.dim() == 1, "expected 1D tensor");
AT_ASSERTM(lyr_nrm_beta_weights.dim() == 1, "expected 1D tensor");
AT_ASSERTM(input_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(output_weights.dim() == 2, "expected 2D tensor");
AT_ASSERTM(dropout_mask.dim() == 3, "expected 3D tensor");
AT_ASSERTM(dropout_add_mask.dim() == 3, "expected 3D tensor");
AT_ASSERTM(output_grads.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(matmul2_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(dropout_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(softmax_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_lin_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(lyr_nrm_results.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(lyr_nrm_mean.type().scalarType() == at::ScalarType::Float,
"Only FLOAT is supported");
AT_ASSERTM(lyr_nrm_invvar.type().scalarType() == at::ScalarType::Float,
"Only FLOAT is supported");
AT_ASSERTM(inputs.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(lyr_nrm_gamma_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(lyr_nrm_beta_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(input_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(output_weights.type().scalarType() == at::ScalarType::Half,
"Only HALF is supported");
AT_ASSERTM(dropout_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
AT_ASSERTM(dropout_add_mask.type().scalarType() == at::ScalarType::Byte,
"Only BYTE is supported");
return bwd_cuda(heads, output_grads, matmul2_results, dropout_results,
softmax_results, input_lin_results, lyr_nrm_results,
lyr_nrm_mean, lyr_nrm_invvar, inputs, lyr_nrm_gamma_weights,
lyr_nrm_beta_weights, input_weights, output_weights,
dropout_mask, dropout_add_mask, dropout_prob);
}
} // end namespace cublas_gemmex
} // end namespace self_norm_add
} // end namespace multihead_attn
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &multihead_attn::self_norm_add::cublas_gemmex::fwd,
"Self Multihead Attention Plus Layer Norm and Residual Add Forward.");
m.def("backward", &multihead_attn::self_norm_add::cublas_gemmex::bwd,
"Self Multihead Attention Plus Layer Norm and Residual Add Backward.");
}