OpenDAS / apex / Commits / 0bfb8300

Commit 0bfb8300, authored May 08, 2020 by Thor Johnsen
Merge commit — parents: 2619f1cb, cf50dc7c
Changes: 29

Showing 20 changed files with 2429 additions and 104 deletions (+2429 -104)
apex/amp/_initialize.py                                     +1    -1
apex/amp/handle.py                                          +1    -1
apex/contrib/csrc/multihead_attn/dropout.h                  +16   -0
apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu      +0    -1
apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp            +21   -0
apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu      +294  -0
apex/contrib/csrc/xentropy/xentropy_kernel.cu               +178  -67
apex/contrib/multihead_attn/self_multihead_attn_func.py     +1    -1
apex/contrib/optimizers/__init__.py                         +1    -0
apex/contrib/optimizers/fp16_optimizer.py                   +2    -1
apex/contrib/optimizers/fused_lamb.py                       +208  -0
apex/mlp/__init__.py                                        +1    -0
apex/mlp/mlp.py                                             +79   -0
apex/parallel/LARC.py                                       +12   -1
apex/pyprof/__init__.py                                     +1    -1
apex/pyprof/prof/__init__.py                                +1    -0
apex/pyprof/prof/utility.py                                 +5    -3
csrc/mlp.cpp                                                +164  -0
csrc/mlp_cuda.cu                                            +1377 -0
csrc/multi_tensor_axpby_kernel.cu                           +66   -27
apex/amp/_initialize.py  (+1 -1)

@@ -146,7 +146,7 @@ def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs
     from .amp import init as amp_init

     optimizers_was_list = False
-    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in sys.modules and isinstance(optimizers, LARC)):
+    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
         optimizers = [optimizers]
     elif optimizers is None:
         optimizers = []
...
apex/amp/handle.py  (+1 -1)

@@ -87,7 +87,7 @@ def scale_loss(loss,
         yield loss
         return

-    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in sys.modules and isinstance(optimizers, LARC)):
+    if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
         optimizers = [optimizers]

    loss_scaler = _amp_state.loss_scalers[loss_id]
...
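Both hunks swap `'LARC' in sys.modules` for `'LARC' in globals()`. A minimal sketch (my own illustration, not apex code) of why the two checks differ: `sys.modules` answers "has a module named LARC been imported anywhere in the process", while `globals()` answers "does *this* module have the name LARC bound", which is what the following `isinstance(optimizers, LARC)` actually needs.

import sys

try:
    # Optional import, mirroring how amp only defines LARC when it is available.
    from apex.parallel.LARC import LARC
except ImportError:
    pass

def is_larc_instance(opt):
    # Hypothetical helper: the short-circuit keeps isinstance() from touching an
    # undefined name, exactly like the guard in the diff above.
    return 'LARC' in globals() and isinstance(opt, LARC)

print('LARC' in sys.modules, 'LARC' in globals())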
apex/contrib/csrc/multihead_attn/dropout.h  (+16 -0)

 #include <ATen/ATen.h>
+#ifdef OLD_GENERATOR
 #include <ATen/CUDAGenerator.h>
+#else
+#include <ATen/CUDAGeneratorImpl.h>
+#endif
 #include <ATen/cuda/CUDAContext.h>
 #include <curand_kernel.h>
...
@@ -206,8 +212,13 @@ void apex_fused_dropout_cuda(scalar_t const *inputs,
   std::pair<uint64_t,uint64_t> rng_engine_inputs;
   {
     // See Note [Acquire lock when using random generators]
+#ifdef OLD_GENERATOR
     std::lock_guard<std::mutex> lock(gen->mutex_);
     rng_engine_inputs = gen->philox_engine_inputs(counter_offset);
+#else
+    std::lock_guard<std::mutex> lock(gen.mutex());
+    rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(counter_offset);
+#endif
   }
   apex_fused_dropout_kernel<scalar_t, accscalar_t, IndexType><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(inputs, outputs, mask, totalElements, p, rng_engine_inputs);
...
@@ -239,8 +250,13 @@ void apex_dropout_add_cuda(scalar_t const *inputs,
   std::pair<uint64_t,uint64_t> rng_engine_inputs;
   {
     // See Note [Acquire lock when using random generators]
+#ifdef OLD_GENERATOR
     std::lock_guard<std::mutex> lock(gen->mutex_);
     rng_engine_inputs = gen->philox_engine_inputs(counter_offset);
+#else
+    std::lock_guard<std::mutex> lock(gen.mutex());
+    rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)->philox_engine_inputs(counter_offset);
+#endif
   }
   apex_dropout_add_kernel<scalar_t, accscalar_t, IndexType><<<grid, dim_block, 0, at::cuda::getCurrentCUDAStream()>>>(inputs, add_inputs, outputs, mask, totalElements, p, rng_engine_inputs);
...
apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu  (+0 -1)

@@ -1033,4 +1033,3 @@ void fused_maybe_adam_undo_cuda(
   }
   THCudaCheck(cudaGetLastError());
 }
apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp  (new file, +21 -0)

#include <torch/extension.h>

void multi_tensor_lamb_cuda(
    int chunk_size,
    at::Tensor noop_flag,
    std::vector<std::vector<at::Tensor>> tensor_lists,
    const float lr,
    const float beta1,
    const float beta2,
    const float epsilon,
    const int step,
    const int bias_correction,
    const float weight_decay,
    const int grad_averaging,
    const int mode,
    const float global_grad_norm,
    const float max_grad_norm);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("lamb", &multi_tensor_lamb_cuda, "Computes and apply update for LAMB optimizer");
}
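A minimal sketch of how this binding surfaces in Python, assuming apex was built with the fused-LAMB sources so the `fused_lamb_cuda` extension exists (the same module name the new optimizer imports below); the rest of the snippet is my own illustration.

import importlib

try:
    fused_lamb_cuda = importlib.import_module("fused_lamb_cuda")
except ImportError:
    fused_lamb_cuda = None  # extension not built in this environment

if fused_lamb_cuda is not None:
    # The .cpp above exposes multi_tensor_lamb_cuda under the name "lamb".
    print(fused_lamb_cuda.lamb.__doc__)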
apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu  (new file, +294 -0)

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>

#include <assert.h>

#include "type_shim.h"
#include "multi_tensor_apply.cuh"

#define BLOCK_SIZE 512
#define ILP 4

typedef enum{
  MOMENT_MODE_0   =0, // L2 regularization mode
  MOMENT_MODE_1   =1  // Decoupled weight decay mode
} adamMode_t;

std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  at::optional<bool> per_tensor_python);

using MATH_T = float;

template<typename T>
struct LAMBStage1Functor
{
  __device__ __forceinline__ void operator()(
    int chunk_size,
    volatile int* noop_gmem,
    TensorListMetadata<4>& tl,
    const float beta1, const float beta2, const float beta3,
    const float beta1_correction, const float beta2_correction,
    const float epsilon,
    adamMode_t mode,
    const float decay,
    const float global_grad_norm,
    const float max_global_grad_norm)
  {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    float clipped_global_grad_norm = global_grad_norm > max_global_grad_norm ? global_grad_norm / max_global_grad_norm : 1.0f;

    T* g = (T*)tl.addresses[0][tensor_loc];  g += chunk_idx*chunk_size;
    T* p = (T*)tl.addresses[1][tensor_loc];  p += chunk_idx*chunk_size;
    T* m = (T*)tl.addresses[2][tensor_loc];  m += chunk_idx*chunk_size;
    T* v = (T*)tl.addresses[3][tensor_loc];  v += chunk_idx*chunk_size;

    n -= chunk_idx*chunk_size;

    // see note in multi_tensor_scale_kernel.cu
    for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
    {
      MATH_T r_g[ILP];
      MATH_T r_p[ILP];
      MATH_T r_m[ILP];
      MATH_T r_v[ILP];
#pragma unroll
      for(int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii*blockDim.x;
        if(i < n && i < chunk_size) {
          r_g[ii] = g[i];
          // special ?optimization? for lamb stage 1
          if (decay == 0) {
            r_p[ii] = MATH_T(0);
          }
          else {
            r_p[ii] = p[i];
          }
          r_m[ii] = m[i];
          r_v[ii] = v[i];
        } else {
          r_g[ii] = MATH_T(0);
          r_p[ii] = MATH_T(0);
          r_m[ii] = MATH_T(0);
          r_v[ii] = MATH_T(0);
        }
      }
#pragma unroll
      for(int ii = 0; ii < ILP; ii++) {
        if (mode == MOMENT_MODE_0) {
          MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
          // L2 on scaled grad
          scaled_grad = scaled_grad + decay*r_p[ii];
          r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
          r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
          r_p[ii] = next_m_unbiased / denom;
        }
        else {
          MATH_T scaled_grad = r_g[ii] / clipped_global_grad_norm;
          r_m[ii] = r_m[ii] * beta1 + beta3 * scaled_grad;
          r_v[ii] = r_v[ii] * beta2 + (1-beta2) * scaled_grad * scaled_grad;
          MATH_T next_m_unbiased = r_m[ii] / beta1_correction;
          MATH_T next_v_unbiased = r_v[ii] / beta2_correction;
          MATH_T denom = sqrtf(next_v_unbiased) + epsilon;
          r_p[ii] = (next_m_unbiased/denom) + (decay*r_p[ii]);
        }
      }
#pragma unroll
      for(int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii*blockDim.x;
        if(i < n && i < chunk_size) {
          g[i] = r_p[ii];
          m[i] = r_m[ii];
          v[i] = r_v[ii];
        }
      }
    }
  }
};

// Step 2 reads in 'update' value and per-tensor param_norm and update_norm.
// It computes new parameter value.
template<typename T>
struct LAMBStage2Functor
{
  __device__ __forceinline__ void operator()(
    int chunk_size,
    volatile int* noop_gmem,
    TensorListMetadata<2>& tl,
    const float* per_tensor_param_norm,
    const float* per_tensor_update_norm,
    const float learning_rate,
    const float decay)
  {
    // I'd like this kernel to propagate infs/nans.
    // if(*noop_gmem == 1)
    //   return;

    int tensor_loc = tl.block_to_tensor[blockIdx.x];
    int tensor_num = tl.start_tensor_this_launch + tensor_loc;
    int chunk_idx = tl.block_to_chunk[blockIdx.x];
    int n = tl.sizes[tensor_loc];

    MATH_T ratio = learning_rate;
    // apply adaptive learning rate to parameters with non-zero weight decay
    if (decay != 0.0)
    {
      float param_norm = per_tensor_param_norm[tensor_num];
      float update_norm = per_tensor_update_norm[tensor_num];
      ratio = (update_norm != 0.0f && param_norm != 0.0f) ? learning_rate * (param_norm / update_norm) : learning_rate;
    }

    T* update = (T*)tl.addresses[0][tensor_loc];  update += chunk_idx*chunk_size;
    T* p = (T*)tl.addresses[1][tensor_loc];  p += chunk_idx*chunk_size;

    n -= chunk_idx*chunk_size;

    for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
    {
      MATH_T r_p[ILP];
      MATH_T r_update[ILP];
#pragma unroll
      for(int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii*blockDim.x;
        if(i < n && i < chunk_size) {
          r_p[ii] = p[i];
          r_update[ii] = update[i];
        }
      }
#pragma unroll
      for(int ii = 0; ii < ILP; ii++) {
        r_p[ii] = r_p[ii] - (ratio * r_update[ii]);
      }
#pragma unroll
      for(int ii = 0; ii < ILP; ii++) {
        int i = i_start + threadIdx.x + ii*blockDim.x;
        if(i < n && i < chunk_size) {
          p[i] = r_p[ii];
        }
      }
    }
  }
};

void multi_tensor_lamb_cuda(
  int chunk_size,
  at::Tensor noop_flag,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  const float lr,
  const float beta1,
  const float beta2,
  const float epsilon,
  const int step,
  const int bias_correction,
  const float weight_decay,
  const int grad_averaging,
  const int mode,
  const float global_grad_norm,
  const float max_grad_norm)
{
  using namespace at;

  // Master weight and 32bit momentum(potentially changing) is not handled by this
  // So we assume every tensor are all in the same type

  // Handle bias correction mode
  float bias_correction1 = 1.0f, bias_correction2 = 1.0f;
  if (bias_correction == 1) {
    bias_correction1 = 1 - std::pow(beta1, step);
    bias_correction2 = 1 - std::pow(beta2, step);
  }

  // Handle grad averaging mode
  float beta3 = 1.0f;
  if (grad_averaging == 1) beta3 = 1 - beta1;

  std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin()+1);
  std::vector<std::vector<at::Tensor>> param_list(tensor_lists.begin()+1, tensor_lists.begin()+2);

  // Compute per tensor param norm
  auto param_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, param_list, true);

  // We now in-place modify grad to store update before compute its norm
  // Generally this is not a issue since people modify grad in step() method all the time
  // We can also grab list of empty tensor to avoid this, but I'd like to save space/cpu code
  DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_1",
      multi_tensor_apply<4>(
        BLOCK_SIZE,
        chunk_size,
        noop_flag,
        tensor_lists,
        LAMBStage1Functor<scalar_t_0>(),
        beta1,
        beta2,
        beta3, // 1-beta1 or 1 depends on averaging mode
        bias_correction1,
        bias_correction2,
        epsilon,
        (adamMode_t) mode,
        weight_decay,
        global_grad_norm,
        max_grad_norm); )

  // Compute update norms
  auto update_norm_tuple = multi_tensor_l2norm_cuda(chunk_size, noop_flag, grad_list, true);

  std::vector<std::vector<at::Tensor>> grad_param_list(tensor_lists.begin(), tensor_lists.begin()+2);

  DISPATCH_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), 0, "lamb_stage_2",
      multi_tensor_apply<2>(
        BLOCK_SIZE,
        chunk_size,
        noop_flag,
        grad_param_list,
        LAMBStage2Functor<scalar_t_0>(),
        std::get<1>(param_norm_tuple).DATA_PTR<float>(),
        std::get<1>(update_norm_tuple).DATA_PTR<float>(),
        lr,
        weight_decay); )

  AT_CUDA_CHECK(cudaGetLastError());
}
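For reference, a plain-PyTorch sketch (my own, not apex code) of the math the two fused stages implement for a single tensor in the decoupled-weight-decay mode with bias correction and grad averaging enabled: stage 1 forms the Adam-style update in place of the gradient, stage 2 rescales the learning rate by the trust ratio ||p|| / ||update|| and applies the step.

import torch

def lamb_step_reference(p, g, m, v, lr, beta1, beta2, eps, step,
                        weight_decay, global_grad_norm, max_grad_norm):
    # Global gradient clipping factor, as in LAMBStage1Functor.
    clip = global_grad_norm / max_grad_norm if global_grad_norm > max_grad_norm else 1.0
    scaled_grad = g / clip
    # Stage 1: moments and per-element update (the kernel writes this into g).
    m.mul_(beta1).add_(scaled_grad, alpha=1.0 - beta1)
    v.mul_(beta2).addcmul_(scaled_grad, scaled_grad, value=1.0 - beta2)
    m_hat = m / (1.0 - beta1 ** step)
    v_hat = v / (1.0 - beta2 ** step)
    update = m_hat / (v_hat.sqrt() + eps) + weight_decay * p
    # Stage 2: per-tensor trust ratio rescales the learning rate.
    ratio = lr
    if weight_decay != 0.0:
        p_norm, u_norm = p.norm().item(), update.norm().item()
        if p_norm != 0.0 and u_norm != 0.0:
            ratio = lr * (p_norm / u_norm)
    p.sub_(ratio * update)

p = torch.randn(1000)
g = torch.randn(1000)
m, v = torch.zeros_like(p), torch.zeros_like(p)
lamb_step_reference(p, g, m, v, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-6,
                    step=1, weight_decay=0.01,
                    global_grad_norm=g.norm().item(), max_grad_norm=1.0)
print(p.abs().mean().item())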
apex/contrib/csrc/xentropy/xentropy_kernel.cu  (+178 -67)

@@ -70,7 +70,6 @@
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
...
@@ -84,6 +83,8 @@
 #include "type_shim.h"
 #include "compat.h"

+#define ALIGN_BYTES 16
+
 using Tensor = at::Tensor;
 using TensorList = at::TensorList;
 using ScalarType = at::ScalarType;
...
@@ -123,7 +124,7 @@ const int max_threads = 1024;
 inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
   uint64_t block_size = 1;
   uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));
-  while (block_size < max_block_size) block_size *= 2;
+  while (block_size < (max_block_size/2)) block_size *= 2;
   // Launch at least a single warp - the kernel assumes that.
   block_size = std::max(block_size, static_cast<uint64_t>(32));
   return dim3(block_size);
...
@@ -287,29 +288,40 @@ blockReduce(AccumT* smem,
 template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT>
 __device__ __forceinline__ AccumT
-ilpReduce(T* data,
+ilpReduce(int shift,
+          T* data,
           int size,
           const Reduction<T, AccumT>& r,
           AccumT defaultVal)
 {
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;
   AccumT threadVal = defaultVal;
   int offset = threadIdx.x;

+  // shift and do 1
+  if(shift > 0){
+    data -= shift;
+    size += shift;
+    if(threadIdx.x >= shift){
+      threadVal = r(threadVal, data[offset]);
+    }
+    size -= blockDim.x;
+    data += blockDim.x;
+  }
   int last = size % (ILP * blockDim.x);

-  // Body (unroll by ILP times)
-  for (; offset < size - last; offset += blockDim.x * ILP) {
-    T tmp[ILP];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      tmp[j] = data[offset + j * blockDim.x];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      threadVal = r(threadVal, tmp[j]);
+  T v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+
+  for (; offset * ILP < (size - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(data)[offset];
+    for (int j = 0; j < ILP; ++j) {
+      threadVal = r(threadVal, v[j]);
+    }
   }

+  offset = size - last + threadIdx.x;
   // Epilogue
   for (; offset < size; offset += blockDim.x)
     threadVal = r(threadVal, data[offset]);
...
@@ -319,7 +331,8 @@ ilpReduce(T* data,
 template <template<typename, typename> class Reduction1, template<typename, typename> class Reduction2, int ILP, typename T, typename AccumT>
 __device__ __forceinline__ void
-ilpReduce(T* data,
+ilpReduce(int shift,
+          T* data,
           int size,
           AccumT* reducVal1,
           const Reduction1<T, AccumT>& r1,
...
@@ -328,27 +341,38 @@ ilpReduce(T* data,
           const Reduction2<T, AccumT>& r2,
           AccumT defaultVal2)
 {
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LoadT;
   AccumT threadVal1 = defaultVal1;
   AccumT threadVal2 = defaultVal2;
   int offset = threadIdx.x;

+  // shift and do 1
+  if(shift > 0){
+    data -= shift;
+    size += shift;
+    if(threadIdx.x >= shift){
+      threadVal1 = r1(threadVal1, data[offset]);
+      threadVal2 = r2(threadVal2, data[offset]);
+    }
+    size -= blockDim.x;
+    data += blockDim.x;
+  }
   int last = size % (ILP * blockDim.x);

-  // Body (unroll by ILP times)
-  for (; offset < size - last; offset += blockDim.x * ILP) {
-    T tmp[ILP];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j)
-      tmp[j] = data[offset + j * blockDim.x];
-#pragma unroll
-    for (int j = 0; j < ILP; ++j) {
-      threadVal1 = r1(threadVal1, tmp[j]);
-      threadVal2 = r2(threadVal2, tmp[j]);
+  T v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+
+  for (; offset * ILP < (size - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(data)[offset];
+    for (int j = 0; j < ILP; ++j) {
+      threadVal1 = r1(threadVal1, v[j]);
+      threadVal2 = r2(threadVal2, v[j]);
     }
   }

+  offset = size - last + threadIdx.x;
   // Epilogue
   for (; offset < size; offset += blockDim.x) {
     threadVal1 = r1(threadVal1, data[offset]);
...
@@ -375,17 +399,19 @@ cunn_SoftMaxXEntropyForward(
   // each block handles a sample in the mini-batch
   input += blockIdx.x * classes;
   //output += blockIdx.x * classes;
+  const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);
   int64_t label = labels[blockIdx.x];

   // find the max and sum
   accscalar_t threadMax, threadSum, max_k, sum_k;
   ilpReduce<MaxFloat, AddFloat, ILP, scalar_t, accscalar_t>(
-      input, classes,
+      shift, input, classes,
       &threadMax, MaxFloat<scalar_t, accscalar_t>(),
       -at::numeric_limits<accscalar_t>::max(),
       &threadSum, AddFloat<scalar_t, accscalar_t>(),
       static_cast<accscalar_t>(0));
   blockReduce<Max, Add, accscalar_t>(
       sdata,
       &max_k, threadMax, Max<accscalar_t>(),
...
@@ -393,9 +419,7 @@ cunn_SoftMaxXEntropyForward(
       &sum_k, threadSum, Add<accscalar_t>(),
       static_cast<accscalar_t>(0));

   // reduce all values
   accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
-      input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
+      shift, input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
   accscalar_t sumAll = blockReduce<Add, accscalar_t>(
       sdata, threadExp, Add<accscalar_t>(), static_cast<accscalar_t>(0));
...
@@ -411,10 +435,9 @@ cunn_SoftMaxXEntropyForward(
   }
 }

-template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
-__global__ void
-cunn_SoftMaxXEntropyBackward(
+template<int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
+__device__ __forceinline__ void
+apply(
     scalar_t *gradInput,
     scalar_t *logits,
     outscalar_t *max_log_sum_exp,
     outscalar_t *gradOutput,
...
@@ -422,9 +445,6 @@ cunn_SoftMaxXEntropyBackward(
     const float smoothing,
     int classes)
 {
-  gradInput += blockIdx.x * classes;
-  logits += blockIdx.x * classes;
-
   accscalar_t smooth_positives = 1.0 - smoothing;
   accscalar_t smooth_negatives = smoothing / classes;
   accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
...
@@ -433,6 +453,7 @@ cunn_SoftMaxXEntropyBackward(
   int offset = threadIdx.x;
   int last = classes % (ILP * blockDim.x);

   for (; offset < classes - last; offset += blockDim.x * ILP) {
     accscalar_t tmpLogits[ILP];
...
@@ -457,9 +478,99 @@ cunn_SoftMaxXEntropyBackward(
   }
 }

+template<int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t>
+__device__ __forceinline__ void
+aligned_apply(int shift,
+              scalar_t *gradInput,
+              scalar_t *logits,
+              outscalar_t *max_log_sum_exp,
+              outscalar_t *gradOutput,
+              int64_t *labels,
+              const float smoothing,
+              int classes)
+{
+  accscalar_t smooth_positives = 1.0 - smoothing;
+  accscalar_t smooth_negatives = smoothing / classes;
+  accscalar_t tmpGradOutput = gradOutput[blockIdx.x];
+  int64_t label = labels[blockIdx.x];
+  accscalar_t coeff = max_log_sum_exp[blockIdx.x];
+
+  int offset = threadIdx.x;
+
+  // shift and do 1
+  if(shift > 0){
+    logits -= shift;
+    gradInput -= shift;
+    classes += shift;
+    if(threadIdx.x >= shift){
+      gradInput[offset] = tmpGradOutput * (std::exp(static_cast<accscalar_t>(logits[offset]) - coeff) -
+        static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) * smooth_positives - smooth_negatives);
+    }
+    classes -= blockDim.x;
+    gradInput += blockDim.x;
+    logits += blockDim.x;
+    shift -= blockDim.x;
+  }
+
+  int last = classes % (ILP * blockDim.x);
+
+  typedef typename std::aligned_storage<ILP*sizeof(scalar_t), ILP*alignof(scalar_t)>::type LoadT;
+  // input
+  scalar_t v[ILP];
+  LoadT* value = reinterpret_cast<LoadT*>(&v);
+  // output
+  scalar_t r[ILP];
+  LoadT* result = reinterpret_cast<LoadT*>(&r);
+
+  for (; offset * ILP < (classes - last); offset += blockDim.x) {
+    *value = reinterpret_cast<LoadT*>(logits)[offset];
+#pragma unroll
+    for (int j = 0; j < ILP; ++j) {
+      r[j] = tmpGradOutput * (std::exp(static_cast<accscalar_t>(v[j]) - coeff) -
+        static_cast<accscalar_t>(((ILP * offset + j - shift) == label) ? 1 : 0) * smooth_positives - smooth_negatives);
+    }
+    reinterpret_cast<LoadT*>(gradInput)[offset] = *result;
+  }
+
+  offset = classes - last + threadIdx.x;
+  for (; offset < classes; offset += blockDim.x)
+    gradInput[offset] = tmpGradOutput * (std::exp(static_cast<accscalar_t>(logits[offset]) - coeff) -
+      static_cast<accscalar_t>(((offset - shift) == label) ? 1 : 0) * smooth_positives - smooth_negatives);
+}
+
+template<int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
+__global__ void
+cunn_SoftMaxXEntropyBackward(
+    scalar_t *gradInput,
+    scalar_t *logits,
+    outscalar_t *max_log_sum_exp,
+    outscalar_t *gradOutput,
+    int64_t *labels,
+    const float smoothing,
+    int classes)
+{
+  gradInput += blockIdx.x * classes;
+  logits += blockIdx.x * classes;
+
+  // Do vectorized load/store when input/output have same alignment
+  const int shift = ((uint64_t)logits) % ALIGN_BYTES / sizeof(scalar_t);
+  const int shift_ = ((uint64_t)gradInput) % ALIGN_BYTES / sizeof(scalar_t);
+  if (shift == shift_){
+    aligned_apply<ILP, scalar_t, accscalar_t, outscalar_t>(shift, gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
+  }
+  else {
+    apply<ILP, scalar_t, accscalar_t, outscalar_t>(gradInput, logits, max_log_sum_exp, gradOutput, labels, smoothing, classes);
+  }
+}
+
 template<template<typename, typename, typename> class Epilogue>
 std::vector<Tensor> host_softmax_xentropy(
...
@@ -495,13 +606,13 @@ std::vector<Tensor> host_softmax_xentropy(
   // XXX: it assumes that inner_size == 1
   TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");

-  const int ILP = 2;
   dim3 grid(outer_size);
-  dim3 block = SoftMax_getBlockSize(ILP, dim_size);

   using namespace at;
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "host_softmax_xentropy",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
+    const int ILP = sizeof(float4)/sizeof(scalar_t_0);
+    dim3 block = SoftMax_getBlockSize(ILP, dim_size);
     if (!half_to_float) {
       cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
         <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
...
@@ -564,12 +675,12 @@ Tensor host_softmax_xentropy_backward(
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();

   TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");

-  const int ILP = 2;
   dim3 grid(outer_size);
-  dim3 block = SoftMax_getBlockSize(ILP, dim_size);

   DISPATCH_FLOAT_AND_HALF(gI.scalar_type(), 0, "host_softmax_xentropy_backward",
     using accscalar_t = acc_type<scalar_t_0, true>;
+    const int ILP = sizeof(float4)/sizeof(scalar_t_0);
+    dim3 block = SoftMax_getBlockSize(ILP, dim_size);
     if (!half_to_float) {
       cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
         <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
...
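The `shift` introduced throughout this file is the number of elements by which a row's pointer sits past the previous 16-byte boundary; the kernels handle that misaligned prefix element-wise and then read the rest with float4-width loads. A small sketch of the same arithmetic on the host side (my own illustration, not apex code):

import torch

ALIGN_BYTES = 16  # same constant the kernel defines

def leading_misalignment(t: torch.Tensor) -> int:
    # Elements between the previous 16-byte boundary and the tensor's data
    # pointer, i.e. the "shift" the kernels compute per row.
    return (t.data_ptr() % ALIGN_BYTES) // t.element_size()

x = torch.arange(64, dtype=torch.float32)
print(leading_misalignment(x), leading_misalignment(x[1:]))  # typically 0 and 1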
apex/contrib/multihead_attn/self_multihead_attn_func.py  (+1 -1)

@@ -183,7 +183,7 @@ class SelfAttnFunc(torch.autograd.Function):
         values_grads = torch.bmm(dropout_results.transpose(1, 2), output_lin_grads, out=values_grads.transpose(0, 1))

         # Mask and Scaling for Dropout (not a publically documented op)
-        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, dropout_prob_t[0])
+        dropout_grads = torch._masked_scale(matmul2_dgrad1, dropout_mask, 1.0/(1.0-dropout_prob_t[0]))

         # Softmax Grad (not a publically documented op)
         softmax_grads = torch._softmax_backward_data(dropout_grads, softmax_results, -1, softmax_results)
...
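The corrected scale is the usual inverted-dropout factor: kept values are multiplied by 1/(1-p) so the expected activation matches eval mode, and the backward pass must apply the same factor. A quick illustration of that convention in plain PyTorch (my own sketch, unrelated to the fused op):

import torch

p = 0.1
x = torch.randn(4, 8)
mask = (torch.rand_like(x) > p).to(x.dtype)
y = x * mask * (1.0 / (1.0 - p))   # same 1/(1-p) factor passed to _masked_scale above
print(x.mean().item(), y.mean().item())  # close in expectation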
apex/contrib/optimizers/__init__.py  (+1 -0)

 from .fp16_optimizer import FP16_Optimizer
 from .fused_adam import FusedAdam
+from .fused_lamb import FusedLAMB
apex/contrib/optimizers/fp16_optimizer.py  (+2 -1)

@@ -239,4 +239,5 @@ class FP16_Optimizer(object):
         # constructed in the same way as the one whose state_dict we are loading, the same master params
         # are guaranteed to exist, so we can just copy_() from the saved master params.
         for current, saved in zip(self.fp32_groups, state_dict['fp32_groups']):
-            current.data.copy_(saved.data)
+            for _current, _saved in zip(current, saved):
+                _current.data.copy_(_saved.data)
apex/contrib/optimizers/fused_lamb.py  (new file, +208 -0)

import torch
import importlib
import math
from apex.multi_tensor_apply import multi_tensor_applier

class FusedLAMB(torch.optim.Optimizer):

    """Implements LAMB algorithm.

    Currently GPU-only. Requires Apex to be installed via
    ``pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_lamb" ./``.

    This version of fused LAMB implements 2 fusions.

      * Fusion of the LAMB update's elementwise operations
      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.

    :class:`apex.contrib.optimizers.FusedLAMB`'s usage is identical to any ordinary Pytorch optimizer::

        opt = apex.contrib.optimizers.FusedLAMB(model.parameters(), lr = ....)
        ...
        opt.step()

    :class:`apex.optimizers.FusedLAMB` may be used with or without Amp. If you wish to use :class:`FusedLAMB` with Amp,
    you may choose any ``opt_level``::

        opt = apex.optimizers.FusedLAMB(model.parameters(), lr = ....)
        model, opt = amp.initialize(model, opt, opt_level="O0" or "O1 or "O2")
        ...
        opt.step()

    In general, ``opt_level="O1"`` is recommended.

    LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its norm. (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_
            NOT SUPPORTED now! (default: False)
        adam_w_mode (boolean, optional): Apply L2 regularization or weight decay
            True for decoupled weight decay(also known as AdamW) (default: True)
        grad_averaging (bool, optional): whether apply (1-beta2) to grad when
            calculating running averages of gradient. (default: True)
        set_grad_none (bool, optional): whether set grad to None when zero_grad()
            method is called. (default: True)
        max_grad_norm (float, optional): value used to clip global grad norm
            (default: 1.0)

    .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes:
        https://arxiv.org/abs/1904.00962
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, bias_correction=True,
                 betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01,
                 amsgrad=False, adam_w_mode=True,
                 grad_averaging=True, set_grad_none=True,
                 max_grad_norm=1.0):
        if amsgrad:
            raise RuntimeError('FusedLAMB does not support the AMSGrad variant.')
        defaults = dict(lr=lr, bias_correction=bias_correction,
                        betas=betas, eps=eps, weight_decay=weight_decay,
                        grad_averaging=grad_averaging,
                        max_grad_norm=max_grad_norm)
        super(FusedLAMB, self).__init__(params, defaults)
        if multi_tensor_applier.available:
            import amp_C
            self.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
            self._dummy_overflow_buf = torch.cuda.IntTensor([0])
            fused_lamb_cuda = importlib.import_module("fused_lamb_cuda")
            self.multi_tensor_lamb = fused_lamb_cuda.lamb
        else:
            raise RuntimeError('apex.contrib.optimizers.FusedLAMB requires cuda extensions')

        self.adam_w_mode = 1 if adam_w_mode else 0
        self.set_grad_none = set_grad_none

    def zero_grad(self):
        if self.set_grad_none:
            for group in self.param_groups:
                for p in group['params']:
                    p.grad = None
        else:
            super(FusedLAMB, self).zero_grad()

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        # create separate grad lists for fp32 and fp16 params
        g_all_32, g_all_16 = [], []
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                if p.dtype == torch.float32:
                    g_all_32.append(p.grad.data)
                elif p.dtype == torch.float16:
                    g_all_16.append(p.grad.data)
                else:
                    raise RuntimeError('FusedLAMB only support fp16 and fp32.')

        g_norm_32, g_norm_16 = 0.0, 0.0
        # compute grad norm for two lists
        if len(g_all_32) > 0:
            g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm,
                                             self._dummy_overflow_buf,
                                             [g_all_32], False)[0].item()
        if len(g_all_16) > 0:
            g_norm_16 = multi_tensor_applier(self.multi_tensor_l2norm,
                                             self._dummy_overflow_buf,
                                             [g_all_16], False)[0].item()

        # blend two grad norms to get global grad norm
        global_grad_norm = math.sqrt(g_norm_32 * g_norm_32 + g_norm_16 * g_norm_16)
        max_grad_norm = self.defaults['max_grad_norm']

        for group in self.param_groups:
            bias_correction = 1 if group['bias_correction'] else 0
            beta1, beta2 = group['betas']
            grad_averaging = 1 if group['grad_averaging'] else 0

            # assume same step across group now to simplify things
            # per parameter step can be easily support by making it tensor, or pass list into kernel
            if 'step' in group:
                group['step'] += 1
            else:
                group['step'] = 1

            # create lists for multi-tensor apply
            g_16, p_16, m_16, v_16 = [], [], [], []
            g_32, p_32, m_32, v_32 = [], [], [], []

            for p in group['params']:
                if p.grad is None:
                    continue
                if p.grad.data.is_sparse:
                    raise RuntimeError('FusedLAMB does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                if p.dtype == torch.float16:
                    g_16.append(p.grad.data)
                    p_16.append(p.data)
                    m_16.append(state['exp_avg'])
                    v_16.append(state['exp_avg_sq'])
                elif p.dtype == torch.float32:
                    g_32.append(p.grad.data)
                    p_32.append(p.data)
                    m_32.append(state['exp_avg'])
                    v_32.append(state['exp_avg_sq'])
                else:
                    raise RuntimeError('FusedLAMB only support fp16 and fp32.')

            if(len(g_16) > 0):
                multi_tensor_applier(self.multi_tensor_lamb,
                                     self._dummy_overflow_buf,
                                     [g_16, p_16, m_16, v_16],
                                     group['lr'],
                                     beta1,
                                     beta2,
                                     group['eps'],
                                     group['step'],
                                     bias_correction,
                                     group['weight_decay'],
                                     grad_averaging,
                                     self.adam_w_mode,
                                     global_grad_norm,
                                     max_grad_norm)
            if(len(g_32) > 0):
                multi_tensor_applier(self.multi_tensor_lamb,
                                     self._dummy_overflow_buf,
                                     [g_32, p_32, m_32, v_32],
                                     group['lr'],
                                     beta1,
                                     beta2,
                                     group['eps'],
                                     group['step'],
                                     bias_correction,
                                     group['weight_decay'],
                                     grad_averaging,
                                     self.adam_w_mode,
                                     global_grad_norm,
                                     max_grad_norm)

        return loss
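A hedged end-to-end sketch of using the new optimizer; it assumes apex was built with the deprecated fused-LAMB extension named in the docstring above and that a CUDA device is available. The model and hyperparameters are my own placeholders.

import torch
import torch.nn as nn
from apex.contrib.optimizers import FusedLAMB

model = nn.Linear(1024, 1024).cuda()
opt = FusedLAMB(model.parameters(), lr=1e-3, weight_decay=0.01, max_grad_norm=1.0)

x = torch.randn(32, 1024, device='cuda')
loss = model(x).pow(2).mean()
loss.backward()
opt.step()        # one fused two-stage LAMB update across all parameters
opt.zero_grad()   # sets .grad to None by default (set_grad_none=True)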
apex/mlp/__init__.py  (new file, +1 -0)

from .mlp import *
apex/mlp/mlp.py  (new file, +79 -0)

from copy import copy
import math
import torch
from torch import nn
import mlp_cuda
from .. import amp

class MlpFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, bias, activation, *args):
        output = mlp_cuda.forward(bias, activation, args)
        ctx.save_for_backward(*args)
        ctx.outputs = output
        ctx.bias = bias
        ctx.activation = activation
        return output[0]

    @staticmethod
    def backward(ctx, grad_o):
        grads = mlp_cuda.backward(ctx.bias, ctx.activation, grad_o, ctx.outputs, ctx.saved_tensors)
        del ctx.outputs
        return (None, None, *grads)

mlp_function = amp.half_function(MlpFunction.apply)

class MLP(torch.nn.Module):
    """Launch MLP in C++

    Args:
        mlp_sizes (list of int): MLP sizes. Example: [1024,1024,1024] will create 2 MLP layers with shape 1024x1024
        bias (bool): Default True
        relu (bool): Default True
    """
    def __init__(self, mlp_sizes, bias=True, activation='relu'):
        super(MLP, self).__init__()
        self.num_layers = len(mlp_sizes) - 1
        self.mlp_sizes = copy(mlp_sizes)
        self.bias = 1 if bias else 0

        if activation == 'none':
            self.activation = 0
        elif activation == 'relu':
            self.activation = 1
        elif activation == 'sigmoid':
            self.activation = 2
        else:
            raise TypeError("activation must be relu or none.")

        self.weights = []
        self.biases = []
        for i in range(self.num_layers):
            w = torch.nn.Parameter(torch.empty(mlp_sizes[i + 1], mlp_sizes[i]))
            self.weights.append(w)
            name = 'weight_{}'.format(i)
            setattr(self, name, w)
            if self.bias:
                b = torch.nn.Parameter(torch.empty(mlp_sizes[i + 1]))
                self.biases.append(b)
                name = 'bias_{}'.format(i)
                setattr(self, name, b)

        self.reset_parameters()

    def reset_parameters(self):
        for weight in self.weights:
            dimsum = weight.size(0) + weight.size(1)
            std = math.sqrt(2. / float(dimsum))
            nn.init.normal_(weight, 0., std)
        if self.bias:
            for bias in self.biases:
                std = math.sqrt(1. / float(bias.size(0)))
                nn.init.normal_(bias, 0., std)

    def forward(self, input):
        return mlp_function(self.bias, self.activation, input, *self.weights, *self.biases)

    def extra_repr(self):
        s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, activation={self.activation}"
        return s
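A hypothetical usage sketch of the new module; it assumes apex was built with the MLP CUDA sources so the `mlp_cuda` extension imported above is available, plus a CUDA device. The sizes are arbitrary.

import torch
from apex.mlp import MLP

# [480, 1024, 1024, 512] -> three fused Linear(+ReLU) layers in one C++ call.
model = MLP([480, 1024, 1024, 512], bias=True, activation='relu').cuda().half()
x = torch.randn(128, 480, device='cuda', dtype=torch.float16, requires_grad=True)
y = model(x)
y.sum().backward()
print(y.shape)  # torch.Size([128, 512])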
apex/parallel/LARC.py  (+12 -1)

@@ -37,7 +37,6 @@ class LARC(object):
     """

     def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1e-8):
-        self.param_groups = optimizer.param_groups
         self.optim = optimizer
         self.trust_coefficient = trust_coefficient
         self.eps = eps
...
@@ -49,9 +48,21 @@ class LARC(object):
     def __setstate__(self, state):
         self.optim.__setstate__(state)

+    @property
+    def state(self):
+        return self.optim.state
+
     def __repr__(self):
         return self.optim.__repr__()

+    @property
+    def param_groups(self):
+        return self.optim.param_groups
+
+    @param_groups.setter
+    def param_groups(self, value):
+        self.optim.param_groups = value
+
     def state_dict(self):
         return self.optim.state_dict()
...
apex/pyprof/__init__.py  (+1 -1)

 import warnings

-from . import nvtx
+from . import nvtx, prof
apex/pyprof/prof/__init__.py  (+1 -0)

+from . import data, prof
apex/pyprof/prof/utility.py  (+5 -3)

@@ -9,7 +9,7 @@ class Utility(object):
     @staticmethod
     def typeToBytes(t):
-        if (t in ["uint8", "int8", "byte", "char"]):
+        if (t in ["uint8", "int8", "byte", "char", "bool"]):
             return 1
         elif (t in ["float16", "half", "int16", "short"]):
             return 2
...
@@ -21,7 +21,7 @@ class Utility(object):
     @staticmethod
     def typeToString(t):
-        if (t in ["uint8", "byte", "char"]):
+        if (t in ["uint8", "byte", "char",]):
             return "uint8"
         elif (t in ["int8",]):
             return "int8"
...
@@ -37,6 +37,8 @@ class Utility(object):
             return "int64"
         elif (t in ["float64", "double",]):
             return "fp64"
+        elif (t in ["bool",]):
+            return "bool"
         assert False

     @staticmethod
...
csrc/mlp.cpp  (new file, +164 -0)

#include <torch/extension.h>
#include <torch/torch.h>
#include <vector>

#include <stdio.h>

size_t get_mlp_reserved_space(int batch_size, int num_layers, const int* output_features);

template <typename T>
size_t get_mlp_bp_workspace_in_bytes(int batch_size, int num_layers, const int* output_features);

template <typename T>
int mlp_fp(
    T* X, int input_features, int batch_size,
    T** WPtr, int num_layers, int* output_features, T** BPtr,
    T* Y, T* reserved_space, int use_bias, int activation);

template <typename T>
int mlp_bp(
    T* X, T* Y, int input_features, int batch_size,
    T** WPtr, int num_layers, int* output_features,
    T* dY, T* reserved_space, T* work_space,
    T* dX, T** dwPtr, T** dbPtr,
    bool requires_grad, int use_bias, int activation);

std::vector<at::Tensor> mlp_forward(int use_bias, int activation, std::vector<at::Tensor> inputs) {
  auto num_layers = inputs.size() - 1;
  if (use_bias) {
    // inputs contains (input, weights, biases)
    num_layers /= 2;
  }
  auto batch_size = inputs[0].size(0);
  auto input_features = inputs[0].size(1);

  std::vector<int> output_features;
  for (int i = 0; i < num_layers; i++) {
    output_features.push_back(inputs[i + 1].size(0));
  }

  auto reserved_size = get_mlp_reserved_space(batch_size, num_layers, output_features.data());

  // create output/workspace tensor
  // TODO(deyuf): just get buffer?
  auto out = at::empty({batch_size, output_features.back()}, inputs[0].type());
  auto reserved_space = at::empty({reserved_size}, inputs[0].type());

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_forward", [&] {
    std::vector<scalar_t*> w_ptr;
    std::vector<scalar_t*> b_ptr;
    for (int i = 0; i < num_layers; i++) {
      w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
      if (use_bias) {
        b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
      }
    }
    auto result = mlp_fp<scalar_t>(
        inputs[0].data_ptr<scalar_t>(),
        input_features,
        batch_size,
        w_ptr.data(),
        num_layers,
        output_features.data(),
        b_ptr.data(),
        out.data_ptr<scalar_t>(),
        reserved_space.data_ptr<scalar_t>(),
        use_bias,
        activation);
  });

  return {out, reserved_space};
}

std::vector<at::Tensor> mlp_backward(
    int use_bias,
    int activation,
    at::Tensor grad_o,
    std::vector<at::Tensor> fprop_outputs,
    std::vector<at::Tensor> inputs) {
  auto num_layers = inputs.size() - 1;
  if (use_bias) {
    // inputs contains (input, weights, biases)
    num_layers /= 2;
  }

  auto batch_size = inputs[0].size(0);
  auto input_features = inputs[0].size(1);

  // TODO: not creating empty tensor for it?
  bool requires_grad = inputs[0].requires_grad();

  std::vector<int> output_features;
  for (int i = 0; i < num_layers; i++) {
    output_features.push_back(inputs[i + 1].size(0));
  }
  // create outputs, length of inputs
  // TODO: not create bias if not needed
  std::vector<at::Tensor> outputs;
  for (int i = 0; i < inputs.size(); i++) {
    outputs.push_back(at::empty(inputs[i].sizes(), inputs[i].type()));  // clone for testing now
  }

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_backward", [&] {
    std::vector<scalar_t*> w_ptr;
    for (int i = 0; i < num_layers; i++) {
      w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
    }
    std::vector<scalar_t*> outputs_ptr;
    for (int i = 0; i < inputs.size(); i++) {
      outputs_ptr.push_back(outputs[i].data_ptr<scalar_t>());
    }

    auto work_size = get_mlp_bp_workspace_in_bytes<scalar_t>(batch_size, num_layers, output_features.data());

    // auto work_space = at::empty({work_size*4}, at::kByte);
    auto work_space = at::empty({work_size / sizeof(scalar_t)}, inputs[0].type());

    auto result = mlp_bp<scalar_t>(
        inputs[0].data_ptr<scalar_t>(),
        fprop_outputs[0].data_ptr<scalar_t>(),
        input_features,
        batch_size,
        w_ptr.data(),
        num_layers,
        output_features.data(),
        grad_o.contiguous().data_ptr<scalar_t>(),
        fprop_outputs[1].data_ptr<scalar_t>(),
        work_space.data_ptr<scalar_t>(),
        outputs_ptr[0],
        outputs_ptr.data() + 1,
        outputs_ptr.data() + 1 + num_layers,
        requires_grad,
        use_bias,
        activation);
  });

  return outputs;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &mlp_forward, "MLP forward");
  m.def("backward", &mlp_backward, "MLP backward");
}
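A small sketch (my own illustration, not part of the extension) of the flat argument layout `mlp_forward` and `mlp_backward` expect when `use_bias` is set: `[input, w_0..w_{L-1}, b_0..b_{L-1}]`, which is why `num_layers = (len(inputs) - 1) // 2` in the C++ above and why `MLP.forward` passes `input, *self.weights, *self.biases`.

def split_mlp_inputs(inputs, use_bias=True):
    # Hypothetical helper mirroring the indexing in mlp.cpp.
    num_layers = len(inputs) - 1
    if use_bias:
        num_layers //= 2
    x = inputs[0]
    weights = inputs[1:1 + num_layers]
    biases = inputs[1 + num_layers:] if use_bias else []
    return x, weights, biases

x, ws, bs = split_mlp_inputs(["x", "w0", "w1", "w2", "b0", "b1", "b2"])
print(len(ws), len(bs))  # 3 3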
csrc/mlp_cuda.cu
0 → 100644
View file @
0bfb8300
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <torch/torch.h>
/* Includes, cuda */
#include <cublas_v2.h>
#include <cuda_runtime.h>
// constants for fused bias+relu kernel
#define BIAS_RELU_FW_NTHREADS 128 // forward number of thread per block
#define BIAS_RELU_BW_NTHREADS_X 32 // backward number of thread in feature dim
#define BIAS_RELU_BW_NTHREADS_Y 16 // backward number of thread in batch dim
#define BIAS_RELU_RED_PER_THREAD 16 // backward minimal reduction length per thread
// move to a header later on
#define ILP 4
template
<
typename
T
>
__host__
__device__
__forceinline__
bool
is_aligned
(
T
*
p
){
return
((
uint64_t
)
p
)
%
(
ILP
*
sizeof
(
T
))
==
0
;
}
template
<
typename
T
>
__device__
__forceinline__
void
load_store
(
T
*
dst
,
T
*
src
,
int
dst_offset
,
int
src_offset
){
typedef
typename
std
::
aligned_storage
<
ILP
*
sizeof
(
T
),
ILP
*
alignof
(
T
)
>::
type
LT
;
((
LT
*
)
dst
)[
dst_offset
]
=
((
LT
*
)
src
)[
src_offset
];
}
template
<
typename
T
>
__device__
__forceinline__
void
load_store
(
T
*
dst
,
volatile
T
*
src
,
int
dst_offset
,
int
src_offset
){
typedef
typename
std
::
aligned_storage
<
ILP
*
sizeof
(
T
),
ILP
*
alignof
(
T
)
>::
type
LT
;
((
LT
*
)
dst
)[
dst_offset
]
=
((
LT
*
)
src
)[
src_offset
];
}
template
<
typename
T
>
__device__
__forceinline__
void
load_store
(
volatile
T
*
dst
,
T
*
src
,
int
dst_offset
,
int
src_offset
){
typedef
typename
std
::
aligned_storage
<
ILP
*
sizeof
(
T
),
ILP
*
alignof
(
T
)
>::
type
LT
;
((
LT
*
)
dst
)[
dst_offset
]
=
((
LT
*
)
src
)[
src_offset
];
}
// Keep ReLU in float only. When using half, cast to float before calling.
__device__
__inline__
float
relu
(
float
a
)
{
float
retf
=
max
(
a
,
0.
f
);
return
(
retf
);
}
// Keep Sigmoid in float only. When using half, cast to float before calling.
__device__
__inline__
float
sigmoid
(
float
a
)
{
float
retf
=
1.
f
/
(
1.
f
+
expf
(
-
a
));
return
(
retf
);
}
// FP64 Wrapper around cublas GEMMEx
cublasStatus_t
mlp_gemm
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
float
*
alpha
,
const
double
*
A
,
int
lda
,
const
double
*
B
,
int
ldb
,
const
float
*
beta
,
double
*
C
,
int
ldc
)
{
return
cublasGemmEx
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
CUDA_R_64F
,
lda
,
B
,
CUDA_R_64F
,
ldb
,
beta
,
C
,
CUDA_R_64F
,
ldc
,
CUDA_R_64F
,
CUBLAS_GEMM_DEFAULT
);
}
// FP32 Wrapper around cublas GEMMEx
cublasStatus_t
mlp_gemm
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
float
*
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
const
float
*
beta
,
float
*
C
,
int
ldc
)
{
return
cublasGemmEx
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
CUDA_R_32F
,
lda
,
B
,
CUDA_R_32F
,
ldb
,
beta
,
C
,
CUDA_R_32F
,
ldc
,
CUDA_R_32F
,
CUBLAS_GEMM_DEFAULT
);
}
// FP16 Tensor core wrapper around cublas GEMMEx
cublasStatus_t
mlp_gemm
(
cublasHandle_t
handle
,
cublasOperation_t
transa
,
cublasOperation_t
transb
,
int
m
,
int
n
,
int
k
,
float
*
alpha
,
const
at
::
Half
*
A
,
int
lda
,
const
at
::
Half
*
B
,
int
ldb
,
float
*
beta
,
at
::
Half
*
C
,
int
ldc
)
{
return
cublasGemmEx
(
handle
,
transa
,
transb
,
m
,
n
,
k
,
alpha
,
A
,
CUDA_R_16F
,
lda
,
B
,
CUDA_R_16F
,
ldb
,
beta
,
C
,
CUDA_R_16F
,
ldc
,
CUDA_R_32F
,
CUBLAS_GEMM_DEFAULT_TENSOR_OP
);
}
// Bias ADD. Assume input X is [features x batch size], column major.
// Bias is one 'features' long vector, with implicit broadcast.
template
<
typename
T
>
__global__
void
biasAdd_fprop
(
T
*
X
,
T
*
b
,
uint
batch_size
,
uint
features
)
{
T
r_x
[
ILP
];
T
r_b
[
ILP
];
if
(
is_aligned
(
X
)
&&
is_aligned
(
b
)
&&
features
%
ILP
==
0
)
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
*
ILP
<
features
*
batch_size
;
tid
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
row
=
tid
%
(
features
/
ILP
);
load_store
(
r_x
,
X
,
0
,
tid
);
load_store
(
r_b
,
b
,
0
,
row
);
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
float
bias_sum
=
static_cast
<
float
>
(
r_x
[
ii
])
+
static_cast
<
float
>
(
r_b
[
ii
]);
r_x
[
ii
]
=
bias_sum
;
}
load_store
(
X
,
r_x
,
tid
,
0
);
}
}
else
{
int
tid
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
for
(;
tid
<
features
*
batch_size
;
tid
+=
ILP
*
blockDim
.
x
*
gridDim
.
x
)
{
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
int
row
=
tid
%
features
;
r_x
[
ii
]
=
X
[
idx
];
r_b
[
ii
]
=
b
[
row
];
}
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
float
bias_sum
=
static_cast
<
float
>
(
r_x
[
ii
])
+
static_cast
<
float
>
(
r_b
[
ii
]);
r_x
[
ii
]
=
bias_sum
;
}
#pragma unroll
for
(
int
ii
=
0
;
ii
<
ILP
;
ii
++
)
{
int
idx
=
tid
+
ii
*
blockDim
.
x
*
gridDim
.
x
;
if
(
idx
<
features
*
batch_size
)
{
X
[
idx
]
=
r_x
[
ii
];
}
}
}
}
}
// Bias ADD + ReLU. Assume input X is [features x batch size], column major.
// Activation supports fused ReLU. Safe to call in-place.
template <typename T>
__global__ void biasAddRelu_fprop(T* X, T* b, uint batch_size, uint features) {
  T r_x[ILP];
  T r_b[ILP];
  if (is_aligned(X) && is_aligned(b) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      int row = tid % (features / ILP);
      load_store(r_x, X, 0, tid);
      load_store(r_b, b, 0, row);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
        r_x[ii] = relu(bias_sum);
      }
      load_store(X, r_x, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          int row = tid % features;
          r_x[ii] = X[idx];
          r_b[ii] = b[row];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
        r_x[ii] = relu(bias_sum);
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          X[idx] = r_x[ii];
        }
      }
    }
  }
}
// ReLU. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Relu_fprop(T* X, uint batch_size, uint features) {
  T r_x[ILP];
  if (is_aligned(X) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      load_store(r_x, X, 0, tid);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_x[ii] = relu(static_cast<float>(r_x[ii]));
      }
      load_store(X, r_x, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          r_x[ii] = X[idx];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_x[ii] = relu(static_cast<float>(r_x[ii]));
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          X[idx] = r_x[ii];
        }
      }
    }
  }
}
// Sigmoid. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Sigmoid_fprop(T* X, uint batch_size, uint features) {
  T r_x[ILP];
  if (is_aligned(X) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      load_store(r_x, X, 0, tid);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_x[ii] = sigmoid(static_cast<float>(r_x[ii]));
      }
      load_store(X, r_x, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          r_x[ii] = X[idx];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_x[ii] = sigmoid(static_cast<float>(r_x[ii]));
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          X[idx] = r_x[ii];
        }
      }
    }
  }
}
// ReLU. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Relu_bprop(T* dY, T* Y, uint batch_size, uint features, T* dX) {
  T r_dy[ILP];
  T r_y[ILP];
  if (is_aligned(dY) && is_aligned(Y) && is_aligned(dX) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      load_store(r_dy, dY, 0, tid);
      load_store(r_y, Y, 0, tid);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if ((float)r_y[ii] <= 0.f)
          r_dy[ii] = 0;
      }
      load_store(dX, r_dy, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          r_dy[ii] = dY[idx];
          r_y[ii] = Y[idx];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if ((float)r_y[ii] <= 0.f)
          r_dy[ii] = 0;
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          dX[idx] = r_dy[ii];
        }
      }
    }
  }
}
// Sigmoid. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Sigmoid_bprop(T* dY, T* Y, uint batch_size, uint features, T* dX) {
  T r_dy[ILP];
  T r_y[ILP];
  if (is_aligned(dY) && is_aligned(Y) && is_aligned(dX) && features % ILP == 0) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid * ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
      load_store(r_dy, dY, 0, tid);
      load_store(r_y, Y, 0, tid);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float grad_out = r_dy[ii];
        float out = r_y[ii];
        float grad_i = out * (1.f - out) * grad_out;
        r_dy[ii] = grad_i;
      }
      load_store(dX, r_dy, tid, 0);
    }
  } else {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          r_dy[ii] = dY[idx];
          r_y[ii] = Y[idx];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        float grad_out = r_dy[ii];
        float out = r_y[ii];
        float grad_i = out * (1.f - out) * grad_out;
        r_dy[ii] = grad_i;
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        int idx = tid + ii * blockDim.x * gridDim.x;
        if (idx < features * batch_size) {
          dX[idx] = r_dy[ii];
        }
      }
    }
  }
}
// Compute grid size for pointwise backward kernel.
// block_x/y is the total number of elements handled per block, not the number of threads
void get_biasAddRelu_bprop_grid_size(
    int yfeat,
    int batch_size,
    int block_x,
    int block_y,
    int* grid_x,
    int* grid_y) {
  *grid_x = (yfeat + block_x - 1) / block_x;
  // Get number of SMs for efficient reduction.
  int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
  // can switch to occupancy calculation. use 4 below now for sm_70
  int max_blocks_y = num_SMs * 4 / (*grid_x);
  // block_y should be from minimal work per thread
  int nRedSplits = (batch_size + block_y - 1) / block_y;
  // increase number of elements per thread reduction to not launch more than enough;
  // the kernel adjusts its work, so here we just launch the max number of blocks
  *grid_y = std::min(nRedSplits, max_blocks_y);
  return;
}
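// Worked example with hypothetical sizes (illustrative only): yfeat = 1024,
// batch_size = 65536, block_x = 32, block_y = 128 on a device with 80 SMs gives
// grid_x = 1024/32 = 32, max_blocks_y = 80*4/32 = 10 and nRedSplits = 65536/128 = 512,
// so grid_y = min(512, 10) = 10, i.e. 10 partial sums per feature feed the
// deterministic two-pass reduction below.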
// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template <typename T, int UNROLL_FACTOR>
__global__ void biasAdd_bprop(
    T* dY,
    int features,
    int batch_size,
    volatile float* intermediate,
    int* semaphores,
    T* db) {
  // The feature that this thread is responsible for
  int f = blockIdx.x * blockDim.x + threadIdx.x;

  // Compute the span this thread is responsible for
  // For this block
  int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
  int b_nStart = blockIdx.y * b_chunkSize;
  int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;

  // For this thread
  int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
  int nStart = threadIdx.y * chunkSize + b_nStart;
  int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;
  volatile float* out = intermediate + blockIdx.y * features;

  // Flag to trigger last reduction.
  __shared__ bool isLastBlock;
  // we know block size for now
  __shared__ float smem[BIAS_RELU_BW_NTHREADS_X * BIAS_RELU_BW_NTHREADS_Y];

  // Accumulate db in FP32 always
  float db_local = 0;
  if (f < features) {
    int nidx = 0;
    // Handle non-multiple of UNROLL_FACTOR residue
    for (; nidx < nSpan % UNROLL_FACTOR; nidx++) {
      int row, col, flat_idx;
      row = f;
      col = nStart + nidx;
      flat_idx = col * features + row;
      db_local += (float)dY[flat_idx];
    }

    // Handle meat of work
    for (; (nidx + UNROLL_FACTOR - 1) < nSpan; nidx += UNROLL_FACTOR) {
      int row, col, flat_idx;
      row = f;
      col = nStart + nidx;
      flat_idx = col * features + row;
#pragma unroll 4
      for (int u = 0; u < UNROLL_FACTOR; u++) {
        db_local += (float)dY[flat_idx];
        flat_idx += features;
      }
    }

    // naive block reduction on y-dim
    int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
    smem[linear_idx] = db_local;
  }
  __syncthreads();
  if (f < features) {
    if (threadIdx.y == 0) {
      for (int yidx = 1; yidx < blockDim.y; yidx++) {
        db_local += smem[yidx * blockDim.x + threadIdx.x];
      }
      // block result is in db_local now for all threadIdx.y == 0
      // Write out partial result
      out[f] = db_local;
    }
  }
  __threadfence();
  __syncthreads();

  // Increment semaphore and check if this is the last CTA in the grid_y dimension.
  // Only thread (0,0) calls this
  if (threadIdx.x == 0 && threadIdx.y == 0 && f < features) {
    unsigned int sum_idx;
    sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
    isLastBlock = (sum_idx == (gridDim.y - 1));
  }
  __syncthreads();

  db_local = 0;
  // No block reduction for now, only thread (*,0) do grid reduction
  if (isLastBlock && f < features) {
    if (threadIdx.y == 0) {
      for (int n = 0; n < gridDim.y; n++) {
        int row, col;
        row = f;
        col = n;
        db_local += (float)(intermediate[col * features + row]);
      }
      db[f] = (T)db_local;
    }
  }
}
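// Determinism note: each CTA along grid y writes its FP32 partial bias gradient to
// `intermediate`, and the semaphore counter identifies the last CTA for each blockIdx.x,
// which then accumulates the gridDim.y partials serially in a fixed order. This avoids
// atomicAdd on the output, so the bias gradient is reproducible for a fixed launch
// configuration.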
// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template <typename T, int UNROLL_FACTOR>
__global__ void biasAddRelu_bprop(
    T* Y,
    T* dY,
    int features,
    int batch_size,
    T* dX,
    volatile float* intermediate,
    int* semaphores,
    T* db) {
  // The feature that this thread is responsible for
  int f = blockIdx.x * blockDim.x + threadIdx.x;

  // Compute the span this thread is responsible for
  // For this block
  int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
  int b_nStart = blockIdx.y * b_chunkSize;
  int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;

  // For this thread
  int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
  int nStart = threadIdx.y * chunkSize + b_nStart;
  int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;
  volatile float* out = intermediate + blockIdx.y * features;

  // Flag to trigger last reduction.
  __shared__ bool isLastBlock;
  // we know block size for now
  __shared__ float smem[BIAS_RELU_BW_NTHREADS_X * BIAS_RELU_BW_NTHREADS_Y];

  // Accumulate db in FP32 always
  float db_local = 0;
  if (f < features) {
    int nidx = 0;
    // Handle non-multiple of UNROLL_FACTOR residue
    for (; nidx < nSpan % UNROLL_FACTOR; nidx++) {
      int row, col, flat_idx;
      row = f;
      col = nStart + nidx;
      flat_idx = col * features + row;

      T y_val = Y[flat_idx];
      T dy_val = dY[flat_idx];
      T dx_val;
      if ((float)y_val > 0.f)
        dx_val = dy_val;
      else
        dx_val = 0;
      dX[flat_idx] = dx_val;
      db_local += (float)dx_val;
    }

    // Handle meat of work
    for (; (nidx + UNROLL_FACTOR - 1) < nSpan; nidx += UNROLL_FACTOR) {
      int row, col, flat_idx;
      row = f;
      col = nStart + nidx;
      flat_idx = col * features + row;
#pragma unroll 4
      for (int u = 0; u < UNROLL_FACTOR; u++) {
        T y_val = Y[flat_idx];
        T dy_val = dY[flat_idx];
        T dx_val;
        if ((float)y_val > 0.f)
          dx_val = dy_val;
        else
          dx_val = 0;
        dX[flat_idx] = dx_val;
        db_local += (float)dx_val;
        flat_idx += features;
      }
    }

    // naive block reduction on y-dim
    int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
    smem[linear_idx] = db_local;
  }
  __syncthreads();
  if (f < features) {
    if (threadIdx.y == 0) {
      for (int yidx = 1; yidx < blockDim.y; yidx++) {
        db_local += smem[yidx * blockDim.x + threadIdx.x];
      }
      // block result is in db_local now for all threadIdx.y == 0
      // Write out partial result
      out[f] = db_local;
    }
  }
  __threadfence();
  __syncthreads();

  // Increment semaphore and check if this is the last CTA in the grid_y dimension.
  // Only thread (0,0) calls this
  if (threadIdx.x == 0 && threadIdx.y == 0 && f < features) {
    unsigned int sum_idx;
    sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
    isLastBlock = (sum_idx == (gridDim.y - 1));
  }
  __syncthreads();

  db_local = 0;
  // No block reduction for now, only thread (*,0) do grid reduction
  if (isLastBlock && f < features) {
    if (threadIdx.y == 0) {
      for (int n = 0; n < gridDim.y; n++) {
        int row, col;
        row = f;
        col = n;
        db_local += (float)(intermediate[col * features + row]);
      }
      db[f] = (T)db_local;
    }
  }
}
// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template <typename T, int UNROLL_FACTOR>
__global__ void biasAddRelu_bprop_aligned(
    T* Y,
    T* dY,
    int features,
    int batch_size,
    T* dX,
    volatile float* intermediate,
    int* semaphores,
    T* db) {
  // The feature that this thread is responsible for
  int f = blockIdx.x * blockDim.x + threadIdx.x;

  // Compute the span this thread is responsible for
  // For this block
  int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
  int b_nStart = blockIdx.y * b_chunkSize;
  int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;

  // For this thread
  int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
  int nStart = threadIdx.y * chunkSize + b_nStart;
  int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;
  volatile float* out = intermediate + blockIdx.y * features;

  // Flag to trigger last reduction.
  __shared__ bool isLastBlock;

  // Accumulate db in FP32 always
  float db_local[ILP];
  T r_y[ILP];
  T r_dy[ILP];
#pragma unroll
  for (int ii = 0; ii < ILP; ii++) {
    db_local[ii] = 0.f;
  }

  // f always <= features in this case
  //if (f < features) {
  int nidx = 0;

  // Handle non-multiple of UNROLL_FACTOR residue
  for (; nidx < nSpan % UNROLL_FACTOR; nidx++) {
    int row, col, flat_idx;
    row = f;
    col = nStart + nidx;
    flat_idx = col * features / ILP + row;

    load_store(r_y, Y, 0, flat_idx);
    load_store(r_dy, dY, 0, flat_idx);
#pragma unroll
    for (int ii = 0; ii < ILP; ii++) {
      if ((float)r_y[ii] <= 0.f)
        r_dy[ii] = 0;
      db_local[ii] += (float)r_dy[ii];
    }
    load_store(dX, r_dy, flat_idx, 0);
  }

  // Handle meat of work
  for (; (nidx + UNROLL_FACTOR - 1) < nSpan; nidx += UNROLL_FACTOR) {
    int row, col, flat_idx;
    row = f;
    col = nStart + nidx;
    flat_idx = col * features / ILP + row; // total threads in x == features/ILP
#pragma unroll
    for (int u = 0; u < UNROLL_FACTOR; u++) {
      load_store(r_y, Y, 0, flat_idx);
      load_store(r_dy, dY, 0, flat_idx);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        if ((float)r_y[ii] <= 0.f)
          r_dy[ii] = 0;
        db_local[ii] += (float)r_dy[ii];
      }
      load_store(dX, r_dy, flat_idx, 0);
      flat_idx += features / ILP;
    }
  }

  // we know block size for now
  __shared__ float smem[BIAS_RELU_BW_NTHREADS_X * BIAS_RELU_BW_NTHREADS_Y * ILP];
  // naive block reduction on y-dim
  int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
  float* smem_out = smem + ILP * linear_idx;
#pragma unroll
  for (int ii = 0; ii < ILP; ii++) {
    smem_out[ii] = db_local[ii]; // reuse local dy buffer
  }
  __syncthreads();
  if (threadIdx.y == 0) {
    for (int yidx = 1; yidx < blockDim.y; yidx++) {
      float* smem_in = smem + ILP * (yidx * blockDim.x + threadIdx.x);
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        db_local[ii] += smem_in[ii]; // reuse local dy buffer
      }
    }

    // block result is in db_local now for all threadIdx.y == 0
    // TODO: maybe not useful early exit here
    if (gridDim.y == 1) {
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_dy[ii] = db_local[ii]; // reuse local dy buffer
      }
      load_store(db, r_dy, f, 0);
      return;
    }

    // Write out partial result
    load_store(out, db_local, f, 0);
  }
  __threadfence();
  __syncthreads();

  // Increment semaphore and check if this is the last CTA in the grid_y dimension.
  // Only thread (0,0) calls this
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    unsigned int sum_idx;
    sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
    isLastBlock = (sum_idx == (gridDim.y - 1));
  }
  __syncthreads();

#pragma unroll
  for (int ii = 0; ii < ILP; ii++) {
    db_local[ii] = 0.f;
  }
  float r_db[ILP];

  // No block reduction for now, only thread (*,0) do grid reduction
  if (isLastBlock) {
    if (threadIdx.y == 0) {
      for (int n = 0; n < gridDim.y; n++) {
        int row, col;
        row = f;
        col = n;
        load_store(r_db, intermediate, 0, col * features / ILP + row);
#pragma unroll
        for (int ii = 0; ii < ILP; ii++) {
          db_local[ii] += r_db[ii];
        }
      }
#pragma unroll
      for (int ii = 0; ii < ILP; ii++) {
        r_dy[ii] = db_local[ii]; // reuse local dy buffer
      }
      load_store(db, r_dy, f, 0);
    }
  }
}
// Lists where the num_layers-1 intermediate Y buffers start in reserved space on fprop, starting
// offset 0. The last Y value is, of course, stored in the user provided output buffer.
void get_y_offsets(
    int batch_size,
    int num_layers,
    const int* output_features,
    int* y_start_offsets) {
  y_start_offsets[0] = 0;
  for (int i = 1; i < num_layers; i++) {
    y_start_offsets[i] = y_start_offsets[i - 1] + batch_size * output_features[i - 1];
  }
}
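// Worked example with hypothetical sizes (illustrative only): for batch_size = 128 and
// output_features = {1024, 512, 10}, y_start_offsets becomes {0, 131072, 196608}
// (0; 0 + 128*1024; 131072 + 128*512), i.e. the element offset of each layer's Y buffer
// inside the reserved space.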
// Returns the reserved space (in elements) needed for the MLP
size_t get_mlp_reserved_space(int batch_size, int num_layers, const int* output_features) {
  size_t res_space = 0;
  // Need to store output of every intermediate MLP - size equal to output_features[i] * batch_size
  // for all 'i' in [0, num_layers-1)
  for (int l = 0; l < num_layers; l++) {
    res_space += output_features[l] * batch_size;
  }
  return res_space;
}

// Returns the size of all fprop activations combined
size_t get_all_activations_size(int batch_size, int num_layers, const int* output_features) {
  size_t acts_size = 0;
  for (int l = 0; l < num_layers; l++) {
    acts_size += output_features[l] * batch_size;
  }
  return acts_size;
}
#if 0
// Returns the work space (in elements) needed for the MLP bprop.
size_t get_mlp_bp_workspace (int batch_size, int num_layers, const int* output_features) {
/*
Workspace is partitioned as
DY_GEMMs : DX_GEMMs
*/
size_t work_space = 0;
// Store each intermediate dY explicitly. Need 2 dYs per MLP layer (one for o/p
// of biasReLU_bp and one for o/p of dgrad GEMM).
work_space += 2*get_all_activations_size(batch_size, num_layers, output_features);
return work_space;
}
#endif
// Scratch space needed for reductions in number of elements
size_t get_reduction_scratch_space(int batch_size, int num_layers, const int* output_features) {
  size_t max_scratch_space = 0;
  // Loop over all layers to see which one needs the max scratch space
  for (int l = 0; l < num_layers; l++) {
    // need to find max(aligned, not_aligned)
    int tmp, res0, res1;

    int block_x = BIAS_RELU_BW_NTHREADS_X;
    int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
    get_biasAddRelu_bprop_grid_size(output_features[l], batch_size, block_x, block_y, &tmp, &res0);

    block_x = ILP * BIAS_RELU_BW_NTHREADS_X;
    get_biasAddRelu_bprop_grid_size(output_features[l], batch_size, block_x, block_y, &tmp, &res1);

    max_scratch_space = std::max(max_scratch_space, (size_t)(output_features[l] * res0));
    max_scratch_space = std::max(max_scratch_space, (size_t)(output_features[l] * res1));
  }

  return max_scratch_space;
}
// Buffer for semaphores
size_t get_semaphores_size(int num_layers, const int* output_features) {
  // Upper bound on semaphores is one per feature for the layer
  // with the most features.
  int max_features = 0;
  for (int l = 0; l < num_layers; l++) {
    max_features = std::max(max_features, output_features[l]);
  }
  return (size_t)max_features;
}
// Returns the work space (in bytes) needed for the MLP bprop.
template <typename T>
size_t get_mlp_bp_workspace_in_bytes(int batch_size, int num_layers, const int* output_features) {
  size_t work_space = 0;

  // Store each intermediate dY explicitly. Need 2 dYs per MLP layer (one for o/p
  // of biasReLU_bp and one for o/p of dgrad GEMM).
  work_space += 2 * get_all_activations_size(batch_size, num_layers, output_features) * sizeof(T);
  work_space += get_reduction_scratch_space(batch_size, num_layers, output_features) * sizeof(float);
  work_space += get_semaphores_size(num_layers, output_features) * sizeof(int);

  return work_space;
}
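// Worked example with hypothetical sizes (illustrative only): for T = at::Half,
// batch_size = 128 and output_features = {1024, 512, 10}, the activation term is
// 2 * 128*(1024+512+10) * sizeof(at::Half) = 2 * 197888 * 2 = 791552 bytes; the reduction
// scratch term depends on the grid_y chosen for the current device, and the semaphore
// term adds max_features * sizeof(int) = 1024 * 4 = 4096 bytes.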
// Returns pointers to each segment of the workspace
template <typename T>
void partition_mlp_bp_workspace(
    int batch_size,
    int num_layers,
    const int* output_features,
    void* work_space,
    T** dy_gemms,
    T** dx_gemms,
    float** db_scratch,
    int** semaphores) {
  /*
     Workspace is partitioned as
     DY_GEMMs : DX_GEMMs : DB_SCRATCH : SEMAPHORES
  */
  // Start address where dy_gemm tensors are stored
  *dy_gemms = reinterpret_cast<T*>(work_space);
  // Start address where dx_gemm tensors are stored
  *dx_gemms = *dy_gemms + get_all_activations_size(batch_size, num_layers, output_features);
  // Start address where db intermediate tensors are stored
  *db_scratch = reinterpret_cast<float*>(
      *dx_gemms + get_all_activations_size(batch_size, num_layers, output_features));
  // Start address of semaphores
  *semaphores = reinterpret_cast<int*>(
      *db_scratch + get_reduction_scratch_space(batch_size, num_layers, output_features));

  return;
}
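// Layout sketch for the partition above (element counts, not bytes):
//   [ DY_GEMM buffers | DX_GEMM buffers | DB_SCRATCH (float) | SEMAPHORES (int) ]
//       A elements        A elements        R elements           S elements
// where A = get_all_activations_size(...), R = get_reduction_scratch_space(...) and
// S = get_semaphores_size(...); the byte total matches get_mlp_bp_workspace_in_bytes<T>.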
// Does a simple MLP fprop (GEMM+bias+ReLU).
// Can handle num_layers number of layers, each with its own shape. Output of layer i is assumed
// to be input of layer i+1. output_features, WPtr and BPtr are arrays of length num_layers, and
// must be in the same order i.e. WPtr[i] and BPtr[i] are respectively the weight and bias of layer
// 'i'.
template <typename T>
int mlp_fp(
    T* X,
    int input_features,
    int batch_size,
    T** WPtr,
    int num_layers,
    int* output_features,
    T** BPtr,
    T* Y,
    T* reserved_space,
    int use_bias,
    int activation) {
  T *weight, *input, *output, *bias;
  T *reserved_space_x, *reserved_space_y;
  reserved_space_x = NULL;
  reserved_space_y = reserved_space;

  // Get cublas handle from Pytorch
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from cublas handle to reuse for biasReLU kernel.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);

  for (int layer = 0; layer < num_layers; layer++) {
    weight = WPtr[layer];
    input = (layer == 0) ? X : reserved_space_x;
    output = (layer == num_layers - 1) ? Y : reserved_space_y;
    if (use_bias) {
      bias = BPtr[layer];
    }
    int ifeat = (layer == 0) ? input_features : output_features[layer - 1];
    int ofeat = output_features[layer];

    float one = 1.f;
    float zero = 0.f;

    cublasStatus_t cublas_status;
    // Call GEMM: fprop is Y = W'X
    cublas_status = mlp_gemm(
        handle, CUBLAS_OP_T, CUBLAS_OP_N,
        ofeat, batch_size, ifeat,
        &one, weight, ifeat, input, ifeat,
        &zero, output, ofeat);

    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
      printf("GEMM fprop failed with %d\n", cublas_status);
      return 1;
    }

    const uint& input_size = ofeat;
    int num_blocks = 0;
    int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
    // Call biasReLU
    if (use_bias == 1) {
      if (activation == 0) { // no activation
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, biasAdd_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        biasAdd_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            output, bias, batch_size, input_size);
      } else if (activation == 1) { // relu
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, biasAddRelu_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        biasAddRelu_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            output, bias, batch_size, input_size);
      } else if (activation == 2) { // sigmoid
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, biasAdd_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        biasAdd_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            output, bias, batch_size, input_size);
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, Sigmoid_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Sigmoid_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            output, batch_size, input_size);
      }
    } else {
      // don't need to do anything in case of no activation and no bias
      if (activation == 1) { // relu
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, Relu_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Relu_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            output, batch_size, input_size);
      } else if (activation == 2) { // sigmoid
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, Sigmoid_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Sigmoid_fprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            output, batch_size, input_size);
      }
    }

    // Set current output as next layer input
    reserved_space_x = reserved_space_y;
    // Set next layer output
    reserved_space_y += ofeat * batch_size;
  }

  return 0;
}
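// Illustrative call sketch (hypothetical names and sizes, host side; W and B are host
// arrays of device pointers, all other pointers are device buffers of matching type):
//   int out_feat[2] = {512, 10};                       // two layers
//   size_t res_elems = get_mlp_reserved_space(batch, 2, out_feat);
//   mlp_fp<at::Half>(X, in_feat, batch, W, 2, out_feat, B, Y, reserved,
//                    /*use_bias=*/1, /*activation=*/1 /* 0=none, 1=relu, 2=sigmoid */);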
// Does a simple MLP bprop (GEMM+bias+ReLU).
// Needs reserved space to come back exactly as it was populated in fprop.
// Does dgrad and wgrad sequentially.
template <typename T>
int mlp_bp(
    T* X,
    T* Y,
    int input_features,
    int batch_size,
    T** WPtr,
    int num_layers,
    int* output_features,
    T* dY,
    T* reserved_space,
    T* work_space,
    T* dX,
    T** dwPtr,
    T** dbPtr,
    bool requires_grad,
    int use_bias,
    int activation) {
  T* weight;
  T *dweight, *dx, *dy, *dbias;
  T *x, *y;

  // Where the dx of the biasReLU (== dy of gemm) is stored. Can be thrown away
  // after bp call.
  T* dy_gemm_base;
  // Where the dx after GEMM is stored.
  T* dx_gemm_base;
  // Where partial reduction results are stored.
  float* db_scratch;
  // Semaphores for reduction.
  int* semaphores;

  partition_mlp_bp_workspace<T>(
      batch_size,
      num_layers,
      output_features,
      work_space,
      &dy_gemm_base,
      &dx_gemm_base,
      &db_scratch,
      &semaphores);

  size_t semaphore_size = get_semaphores_size(num_layers, output_features) * sizeof(int);

  // Get cublas handle from Pytorch
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  // Get the stream from cublas handle to reuse for biasReLU kernel.
  cudaStream_t stream;
  cublasGetStream(handle, &stream);

  int* y_offsets = (int*)malloc(num_layers * sizeof(int));
  get_y_offsets(batch_size, num_layers, output_features, y_offsets);

  for (int layer = num_layers - 1; layer >= 0; layer--) {
    weight = WPtr[layer];
    dweight = dwPtr[layer];

    // x is read from reserved space
    x = (layer == 0) ? X : reserved_space + y_offsets[layer - 1];
    // dx is written in workspace for all but layer==0
    dx = (layer == 0) ? dX : dx_gemm_base + y_offsets[layer - 1];

    // y is read from reserved space
    y = (layer == num_layers - 1) ? Y : reserved_space + y_offsets[layer];
    // dx from layer+1
    dy = (layer == num_layers - 1) ? dY : dx_gemm_base + y_offsets[layer];
    // dy_gemm is written to and read immediately
    T* dy_gemm = dy_gemm_base + y_offsets[layer];

    dbias = dbPtr[layer];
    int xfeat = (layer == 0) ? input_features : output_features[layer - 1];
    int yfeat = output_features[layer];

    float one = 1.f;
    float zero = 0.f;

    if (use_bias == 1) {
      if (activation == 0) { // no activation
        // bgrad
        dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
        int grid_x, grid_y;
        cudaMemsetAsync(semaphores, 0, semaphore_size, stream);

        int block_x = BIAS_RELU_BW_NTHREADS_X;
        int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
        get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
        dim3 grid(grid_x, grid_y);
        biasAdd_bprop<T, 4><<<grid, block, 0, stream>>>(
            dy, yfeat, batch_size, db_scratch, semaphores, dbias);
        // bypass dgrad through reset pointer
        dy_gemm = dy;
      } else if (activation == 1) { // relu
        dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
        int grid_x, grid_y;
        cudaMemsetAsync(semaphores, 0, semaphore_size, stream);

        if (yfeat % (ILP * BIAS_RELU_BW_NTHREADS_X) == 0 &&
            is_aligned(y) &&
            is_aligned(dy) &&
            is_aligned(dy_gemm) &&
            is_aligned(dbias)) {
          int block_x = ILP * BIAS_RELU_BW_NTHREADS_X;
          int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
          get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
          dim3 grid(grid_x, grid_y);
          biasAddRelu_bprop_aligned<T, 4><<<grid, block, 0, stream>>>(
              y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
        } else {
          int block_x = BIAS_RELU_BW_NTHREADS_X;
          int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
          get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
          dim3 grid(grid_x, grid_y);
          biasAddRelu_bprop<T, 4><<<grid, block, 0, stream>>>(
              y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
        }
      } else if (activation == 2) { // sigmoid
        // activation backward
        int num_blocks = 0;
        int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, Sigmoid_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Sigmoid_bprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            dy, y, batch_size, yfeat, dy_gemm);

        // bgrad, from dy_gemm
        dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
        int grid_x, grid_y;
        cudaMemsetAsync(semaphores, 0, semaphore_size, stream);

        int block_x = BIAS_RELU_BW_NTHREADS_X;
        int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
        get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
        dim3 grid(grid_x, grid_y);
        biasAdd_bprop<T, 4><<<grid, block, 0, stream>>>(
            dy_gemm, yfeat, batch_size, db_scratch, semaphores, dbias);
      }
    } else { // no bias below
      if (activation == 0) {
        // bypass dgrad through reset pointer
        dy_gemm = dy;
      } else if (activation == 1) { // relu
        int num_blocks = 0;
        int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, Relu_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Relu_bprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            dy, y, batch_size, yfeat, dy_gemm);
      } else if (activation == 2) { // sigmoid
        int num_blocks = 0;
        int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &num_blocks, Sigmoid_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
        Sigmoid_bprop<<<num_SMs * num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(
            dy, y, batch_size, yfeat, dy_gemm);
      }
    }

    cublasStatus_t cublas_status;
    // Call GEMM dgrad
    if (layer > 0 || requires_grad == 1) {
      cublas_status = mlp_gemm(
          handle, CUBLAS_OP_N, CUBLAS_OP_N,
          xfeat, batch_size, yfeat,
          &one, weight, xfeat, dy_gemm, yfeat,
          &zero, dx, xfeat);

      if (cublas_status != CUBLAS_STATUS_SUCCESS) {
        printf("GEMM dgrad failed with %d\n", cublas_status);
        return 1;
      }
    }

    // Call GEMM wgrad
    cublas_status = mlp_gemm(
        handle, CUBLAS_OP_N, CUBLAS_OP_T,
        xfeat, yfeat, batch_size,
        &one, x, xfeat, dy_gemm, yfeat,
        &zero, dweight, xfeat);

    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
      printf("GEMM wgrad failed with %d\n", cublas_status);
      return 1;
    }
  }

  return 0;
}
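// Per-layer backward data flow above, walking layers from last to first:
//   dy (grad of layer output) -> fused bias/activation backward kernel -> dy_gemm (+ dbias)
//   dy_gemm -> dgrad GEMM (weight * dy_gemm)  -> dx      (skipped for layer 0 when
//                                                          requires_grad is false)
//   dy_gemm -> wgrad GEMM (x * dy_gemm^T)     -> dweight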
// Instantiate for floating point types
template int mlp_fp<float>(float* X, int input_features, int batch_size, float** WPtr,
    int num_layers, int* output_features, float** BPtr, float* Y, float* reserved_space,
    int use_bias, int activation);

template int mlp_bp<float>(float* X, float* Y, int input_features, int batch_size,
    float** WPtr, int num_layers, int* output_features, float* dY, float* reserved_space,
    float* work_space, float* dX, float** dwPtr, float** dbPtr, bool requires_grad,
    int use_bias, int activation);

template int mlp_fp<at::Half>(at::Half* X, int input_features, int batch_size,
    at::Half** WPtr, int num_layers, int* output_features, at::Half** BPtr, at::Half* Y,
    at::Half* reserved_space, int use_bias, int activation);

template int mlp_bp<at::Half>(at::Half* X, at::Half* Y, int input_features, int batch_size,
    at::Half** WPtr, int num_layers, int* output_features, at::Half* dY,
    at::Half* reserved_space, at::Half* work_space, at::Half* dX, at::Half** dwPtr,
    at::Half** dbPtr, bool requires_grad, int use_bias, int activation);

template int mlp_fp<double>(double* X, int input_features, int batch_size, double** WPtr,
    int num_layers, int* output_features, double** BPtr, double* Y, double* reserved_space,
    int use_bias, int activation);

template int mlp_bp<double>(double* X, double* Y, int input_features, int batch_size,
    double** WPtr, int num_layers, int* output_features, double* dY, double* reserved_space,
    double* work_space, double* dX, double** dwPtr, double** dbPtr, bool requires_grad,
    int use_bias, int activation);

template size_t get_mlp_bp_workspace_in_bytes<float>(
    int batch_size, int num_layers, const int* output_features);
template size_t get_mlp_bp_workspace_in_bytes<at::Half>(
    int batch_size, int num_layers, const int* output_features);
template size_t get_mlp_bp_workspace_in_bytes<double>(
    int batch_size, int num_layers, const int* output_features);
csrc/multi_tensor_axpby_kernel.cu
View file @
0bfb8300
@@ -13,6 +13,17 @@
 #define BLOCK_SIZE 512
 #define ILP 4
 
+template<typename T>
+__device__ __forceinline__ bool is_aligned(T* p){
+  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
+}
+
+template<typename T>
+__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+
 template<typename x_t, typename y_t, typename out_t>
 struct AxpbyFunctor
 {
@@ -43,46 +54,74 @@ struct AxpbyFunctor
     n -= chunk_idx*chunk_size;
 
+    bool finite = true;
+    x_t r_x[ILP];
+    y_t r_y[ILP];
+    out_t r_out[ILP];
+
+    // to make things simple, we put aligned case in a different code path
+    if(n % ILP == 0 && chunk_size % ILP == 0 && is_aligned(x) && is_aligned(y) && is_aligned(out))
+    {
+      for(int i_start = threadIdx.x; i_start*ILP < n && i_start*ILP < chunk_size; i_start += blockDim.x)
+      {
+        // load
+        load_store(r_x, x, 0, i_start);
+        load_store(r_y, y, 0, i_start);
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
+          if(arg_to_check == -1)
+            finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
+          if(arg_to_check == 0)
+            finite = finite && isfinite(r_x[ii]);
+          if(arg_to_check == 1)
+            finite = finite && isfinite(r_y[ii]);
+        }
+        // store
+        load_store(out, r_out, i_start, 0);
+      }
+    }
+    else
+    {
       // Non-divergent exit condition for __syncthreads, not necessary here
-      float xs[ILP];
-      float ys[ILP];
       for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
       {
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
-          xs[ii] = 0;
-          ys[ii] = 0;
+          r_x[ii] = 0;
+          r_y[ii] = 0;
           int i = i_start + threadIdx.x + ii*blockDim.x;
           if(i < n && i < chunk_size)
           {
-            xs[ii] = static_cast<float>(x[i]);
-            ys[ii] = static_cast<float>(y[i]);
+            r_x[ii] = x[i];
+            r_y[ii] = y[i];
           }
         }
+
+        // see note in multi_tensor_scale_kernel.cu
 #pragma unroll
         for(int ii = 0; ii < ILP; ii++)
         {
-          int i = i_start + threadIdx.x + ii*blockDim.x;
-          if(i < n && i < chunk_size)
-          {
-            out[i] = static_cast<out_t>(a*xs[ii] + b*ys[ii]);
-            bool finite = true;
-            if(arg_to_check == -1)
-              finite = (isfinite(xs[ii]) && isfinite(ys[ii]));
-            if(arg_to_check == 0)
-              finite = isfinite(xs[ii]);
-            if(arg_to_check == 1)
-              finite = isfinite(ys[ii]);
-            if(!finite)
-              *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
-          }
+          r_out[ii] = a*static_cast<float>(r_x[ii]) + b*static_cast<float>(r_y[ii]);
+          if(arg_to_check == -1)
+            finite = finite && (isfinite(r_x[ii]) && isfinite(r_y[ii]));
+          if(arg_to_check == 0)
+            finite = finite && isfinite(r_x[ii]);
+          if(arg_to_check == 1)
+            finite = finite && isfinite(r_y[ii]);
+        }
+
+        // see note in multi_tensor_scale_kernel.cu
+#pragma unroll
+        for(int ii = 0; ii < ILP; ii++)
+        {
+          int i = i_start + threadIdx.x + ii*blockDim.x;
+          if(i < n && i < chunk_size)
+            out[i] = r_out[ii];
         }
       }
     }
+    if(!finite)
+      *noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
   }
 };