Unverified Commit 31aceeaa authored by Deyu Fu, committed by GitHub

Improvements to apex.mlp (#804)

* update fused bias relu backward kernel

* add support for skipping the first layer's dgrad when it is not required

* fix bug: wrong layer used in the requires-grad check

* add infrastructure for optional bias and activation; currently only supports no bias and no ReLU

* make bias and ReLU independently optional

* add sigmoid activation option
parent aad9300b
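A minimal usage sketch of the options this change exposes (layer sizes and the CUDA device are illustrative, and the import path assumes the module is available as apex.mlp.MLP):

import torch
from apex.mlp import MLP

# Fused MLP with per-layer bias and ReLU between layers.
mlp = MLP([480, 1024, 1024, 512, 256, 1], bias=True, activation='relu').cuda()

# Bias and activation are now independently optional.
mlp_plain = MLP([480, 1024, 1], bias=False, activation='none').cuda()

x = torch.randn(64, 480, device='cuda', requires_grad=True)
loss = mlp(x).mean()
loss.backward()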
......@@ -7,17 +7,19 @@ from .. import amp
class MlpFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, *args):
output = mlp_cuda.forward(args)
def forward(ctx, bias, activation, *args):
output = mlp_cuda.forward(bias, activation, args)
ctx.save_for_backward(*args)
ctx.outputs = output
ctx.bias = bias
ctx.activation = activation
return output[0]
@staticmethod
def backward(ctx, grad_o):
grads = mlp_cuda.backward(grad_o, ctx.outputs, ctx.saved_tensors)
grads = mlp_cuda.backward(ctx.bias, ctx.activation, grad_o, ctx.outputs, ctx.saved_tensors)
del ctx.outputs
return tuple(grads)
return (None, None, *grads)
mlp_function = amp.half_function(MlpFunction.apply)
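Because forward now takes two leading non-tensor arguments (bias, activation), backward has to return one gradient slot per forward input, with None for the non-tensor ones, which is what the (None, None, *grads) above does. A tiny, hypothetical Function illustrating the same standard autograd pattern:

import torch

# Hypothetical toy Function showing the same pattern as MlpFunction above:
# non-tensor config args come first, and backward returns None for each of them.
class ScaleFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, factor, x):      # 'factor' is a plain Python number
        ctx.factor = factor
        return x * factor

    @staticmethod
    def backward(ctx, grad_out):
        # One return value per forward input: None for 'factor', a tensor for 'x'.
        return None, grad_out * ctx.factor

y = ScaleFunction.apply(2.0, torch.ones(3, requires_grad=True))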
......@@ -29,16 +31,21 @@ class MLP(torch.nn.Module):
bias (bool): Default True
relu (bool): Default True
"""
def __init__(self, mlp_sizes, bias=True, relu=True):
if not (bias and relu):
raise TypeError("bias and relu must be both true.")
def __init__(self, mlp_sizes, bias=True, activation='relu'):
super(MLP, self).__init__()
self.num_layers = len(mlp_sizes) - 1
self.mlp_sizes = copy(mlp_sizes)
self.bias = bias
self.relu= relu
self.bias = 1 if bias else 0
if activation == 'none':
self.activation = 0
elif activation == 'relu':
self.activation = 1
elif activation == 'sigmoid':
self.activation = 2
else:
raise TypeError("activation must be 'none', 'relu', or 'sigmoid'.")
# ignoring bias = False now
self.weights = []
self.biases = []
for i in range(self.num_layers):
......@@ -46,10 +53,11 @@ class MLP(torch.nn.Module):
self.weights.append(w)
name = 'weight_{}'.format(i)
setattr(self, name, w)
b = torch.nn.Parameter(torch.empty(mlp_sizes[i+1]))
self.biases.append(b)
name = 'bias_{}'.format(i)
setattr(self, name, b)
if self.bias:
b = torch.nn.Parameter(torch.empty(mlp_sizes[i+1]))
self.biases.append(b)
name = 'bias_{}'.format(i)
setattr(self, name, b)
self.reset_parameters()
......@@ -58,13 +66,14 @@ class MLP(torch.nn.Module):
dimsum = weight.size(0) + weight.size(1)
std = math.sqrt(2. / float(dimsum))
nn.init.normal_(weight, 0., std)
for bias in self.biases:
std = math.sqrt(1. / float(bias.size(0)))
nn.init.normal_(bias, 0., std)
if self.bias:
for bias in self.biases:
std = math.sqrt(1. / float(bias.size(0)))
nn.init.normal_(bias, 0., std)
def forward(self, input):
return mlp_function(input, *self.weights, *self.biases)
return mlp_function(self.bias, self.activation, input, *self.weights, *self.biases)
def extra_repr(self):
s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, ReLU={self.relu}"
s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, activation={self.activation}"
return s
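For reference, the fused module is meant to match a plain stack of nn.Linear layers with the chosen activation appended after every layer, as the unit tests below construct it; a rough, unfused equivalence sketch (parameter copying omitted):

import torch.nn as nn

def reference_mlp(mlp_sizes, bias=True, activation='relu'):
    # Plain-PyTorch equivalent of the fused MLP: Linear layers with an
    # optional activation after every layer (including the last one).
    layers = []
    for i in range(len(mlp_sizes) - 1):
        layers.append(nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=bias))
        if activation == 'relu':
            layers.append(nn.ReLU(inplace=True))
        elif activation == 'sigmoid':
            layers.append(nn.Sigmoid())
    return nn.Sequential(*layers)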
......@@ -19,7 +19,9 @@ int mlp_fp(
int* output_features,
T** BPtr,
T* Y,
T* reserved_space);
T* reserved_space,
int use_bias,
int activation);
template <typename T>
int mlp_bp(
......@@ -35,11 +37,18 @@ int mlp_bp(
T* work_space,
T* dX,
T** dwPtr,
T** dbPtr);
T** dbPtr,
bool requires_grad,
int use_bias,
int activation);
std::vector<at::Tensor> mlp_forward(int use_bias, int activation, std::vector<at::Tensor> inputs) {
std::vector<at::Tensor> mlp_forward(std::vector<at::Tensor> inputs) {
// inputs contains (input, weights, biases)
auto num_layers = (inputs.size() - 1) / 2;
auto num_layers = inputs.size() - 1;
if (use_bias) {
// inputs contains (input, weights, biases)
num_layers /= 2;
}
auto batch_size = inputs[0].size(0);
auto input_features = inputs[0].size(1);
......@@ -60,7 +69,9 @@ std::vector<at::Tensor> mlp_forward(std::vector<at::Tensor> inputs) {
std::vector<scalar_t*> b_ptr;
for (int i = 0; i < num_layers; i++) {
w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
if (use_bias) {
b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
}
}
auto result = mlp_fp<scalar_t>(
inputs[0].data_ptr<scalar_t>(),
......@@ -71,37 +82,48 @@ std::vector<at::Tensor> mlp_forward(std::vector<at::Tensor> inputs) {
output_features.data(),
b_ptr.data(),
out.data_ptr<scalar_t>(),
reserved_space.data_ptr<scalar_t>());
reserved_space.data_ptr<scalar_t>(),
use_bias,
activation);
});
return {out, reserved_space};
}
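The flat inputs vector packs the input tensor first, then all weights, then, only when use_bias is set, all biases, which is why num_layers is derived differently in the two cases. Schematically (a small sketch, not part of the extension code):

def packed_num_layers(num_inputs, use_bias):
    # inputs = [x, w_0, ..., w_{L-1}]                       when use_bias is false
    # inputs = [x, w_0, ..., w_{L-1}, b_0, ..., b_{L-1}]    when use_bias is true
    n = num_inputs - 1
    return n // 2 if use_bias else n

assert packed_num_layers(1 + 3, use_bias=False) == 3
assert packed_num_layers(1 + 3 + 3, use_bias=True) == 3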
std::vector<at::Tensor> mlp_backward(
at::Tensor grad_o,
std::vector<at::Tensor> fprop_outputs,
std::vector<at::Tensor> inputs) {
// same code to get sizes and W pointers
auto num_layers = (inputs.size() - 1) / 2;
int use_bias,
int activation,
at::Tensor grad_o,
std::vector<at::Tensor> fprop_outputs,
std::vector<at::Tensor> inputs) {
auto num_layers = inputs.size() - 1;
if (use_bias) {
// inputs contains (input, weights, biases)
num_layers /= 2;
}
auto batch_size = inputs[0].size(0);
auto input_features = inputs[0].size(1);
// TODO: avoid creating an empty tensor for this?
bool requires_grad = inputs[0].requires_grad();
std::vector<int> output_features;
for (int i = 0; i < num_layers; i++) {
output_features.push_back(inputs[i + 1].size(0));
}
// create outputs, same length as inputs
// TODO: don't create bias gradient tensors when they are not needed
std::vector<at::Tensor> outputs;
for (int i = 0; i < inputs.size(); i++) {
outputs.push_back(at::empty(inputs[i].sizes(), inputs[i].type())); // clone for testing now
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_forward", [&] {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_backward", [&] {
std::vector<scalar_t*> w_ptr;
std::vector<scalar_t*> b_ptr;
for (int i = 0; i < num_layers; i++) {
w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
}
std::vector<scalar_t*> outputs_ptr;
for (int i = 0; i < inputs.size(); i++) {
......@@ -127,7 +149,10 @@ std::vector<at::Tensor> mlp_backward(
work_space.data_ptr<scalar_t>(),
outputs_ptr[0],
outputs_ptr.data() + 1,
outputs_ptr.data() + 1 + num_layers);
outputs_ptr.data() + 1 + num_layers,
requires_grad,
use_bias,
activation);
});
return outputs;
......
......@@ -10,8 +10,11 @@
#include <cublas_v2.h>
#include <cuda_runtime.h>
#define BIASADDRELU_FPROP_NUM_THREADS 128
#define BIASADDRELU_BPROP_NUM_THREADS 128
// constants for fused bias+relu kernel
#define BIAS_RELU_FW_NTHREADS 128 // forward: number of threads per block
#define BIAS_RELU_BW_NTHREADS_X 32 // backward: number of threads in the feature dim
#define BIAS_RELU_BW_NTHREADS_Y 16 // backward: number of threads in the batch dim
#define BIAS_RELU_RED_PER_THREAD 16 // backward: minimum reduction length per thread
// move to a header later on
#define ILP 4
......@@ -42,6 +45,12 @@ __device__ __inline__ float relu(float a) {
return (retf);
}
// Keep Sigmoid in float only. When using half, cast to float before calling.
__device__ __inline__ float sigmoid(float a) {
float retf = 1.f / (1.f + expf(-a));
return (retf);
}
// FP64 Wrapper around cublas GEMMEx
cublasStatus_t mlp_gemm(
cublasHandle_t handle,
......@@ -156,9 +165,55 @@ cublasStatus_t mlp_gemm(
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}
// Bias ADD + ReLU. Assume input X is [features x batch size], assume column major.
// Bias ADD. Assume input X is [features x batch size], column major.
// Bias is one 'features' long vector, with implicit broadcast.
// Safe to call in-place.
template <typename T>
__global__ void biasAdd_fprop(T *X, T *b, uint batch_size, uint features) {
T r_x[ILP];
T r_b[ILP];
if(is_aligned(X) && is_aligned(b) && features % ILP ==0) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (; tid*ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
int row = tid % (features / ILP);
load_store(r_x, X, 0 , tid);
load_store(r_b, b, 0 , row);
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
r_x[ii] = bias_sum;
}
load_store(X, r_x, tid , 0);
}
} else {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
int idx = tid + ii * blockDim.x * gridDim.x;
if(idx < features * batch_size) {
int row = tid % features;
r_x[ii] = X[idx];
r_b[ii] = b[row];
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
r_x[ii] = bias_sum;
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
int idx = tid + ii * blockDim.x * gridDim.x;
if(idx < features * batch_size) {
X[idx] = r_x[ii];
}
}
}
}
}
// Bias ADD + ReLU. Assume input X is [features x batch size], column major.
// The activation is a fused ReLU. Safe to call in-place.
template <typename T>
__global__ void biasAddRelu_fprop(T *X, T *b, uint batch_size, uint features) {
T r_x[ILP];
......@@ -204,32 +259,308 @@ __global__ void biasAddRelu_fprop(T *X, T *b, uint batch_size, uint features) {
}
}
// ReLU. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Relu_fprop(T *X, uint batch_size, uint features) {
T r_x[ILP];
if(is_aligned(X) && features % ILP ==0) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (; tid*ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
load_store(r_x, X, 0 , tid);
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
r_x[ii] = relu(static_cast<float>(r_x[ii]));
}
load_store(X, r_x, tid , 0);
}
} else {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
int idx = tid + ii * blockDim.x * gridDim.x;
if(idx < features * batch_size) {
r_x[ii] = X[idx];
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
r_x[ii] = relu(static_cast<float>(r_x[ii]));
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
int idx = tid + ii * blockDim.x * gridDim.x;
if(idx < features * batch_size) {
X[idx] = r_x[ii];
}
}
}
}
}
// Sigmoid. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Sigmoid_fprop(T *X, uint batch_size, uint features) {
T r_x[ILP];
if(is_aligned(X) && features % ILP ==0) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (; tid*ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
load_store(r_x, X, 0 , tid);
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
r_x[ii] = sigmoid(static_cast<float>(r_x[ii]));
}
load_store(X, r_x, tid , 0);
}
} else {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
int idx = tid + ii * blockDim.x * gridDim.x;
if(idx < features * batch_size) {
r_x[ii] = X[idx];
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
r_x[ii] = sigmoid(static_cast<float>(r_x[ii]));
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
int idx = tid + ii * blockDim.x * gridDim.x;
if(idx < features * batch_size) {
X[idx] = r_x[ii];
}
}
}
}
}
// ReLU backward. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Relu_bprop(T *dY, T *Y, uint batch_size, uint features, T *dX) {
T r_dy[ILP];
T r_y[ILP];
if(is_aligned(dY) &&
is_aligned(Y) &&
is_aligned(dX) &&
features % ILP ==0) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (; tid*ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
load_store(r_dy, dY, 0 , tid);
load_store(r_y, Y, 0 , tid);
#pragma unroll
for(int ii=0;ii<ILP;ii++){
if ((float)r_y[ii] <= 0.f)
r_dy[ii] = 0;
}
load_store(dX, r_dy, tid, 0);
}
} else {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
int idx = tid + ii * blockDim.x * gridDim.x;
if(idx < features * batch_size) {
r_dy[ii] = dY[idx];
r_y[ii] = Y[idx];
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
if ((float)r_y[ii] <= 0.f)
r_dy[ii] = 0;
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
int idx = tid + ii * blockDim.x * gridDim.x;
if(idx < features * batch_size) {
dX[idx] = r_dy[ii];
}
}
}
}
}
// Sigmoid backward. Assume input X is [features x batch size], column major.
// Safe to call in-place.
template <typename T>
__global__ void Sigmoid_bprop(T *dY, T *Y, uint batch_size, uint features, T *dX) {
T r_dy[ILP];
T r_y[ILP];
if(is_aligned(dY) &&
is_aligned(Y) &&
is_aligned(dX) &&
features % ILP ==0) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (; tid*ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
load_store(r_dy, dY, 0 , tid);
load_store(r_y, Y, 0 , tid);
#pragma unroll
for(int ii=0;ii<ILP;ii++){
float grad_out = r_dy[ii];
float out = r_y[ii];
float grad_i = out * ( 1.f - out) * grad_out;
r_dy[ii] = grad_i;
}
load_store(dX, r_dy, tid, 0);
}
} else {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
int idx = tid + ii * blockDim.x * gridDim.x;
if(idx < features * batch_size) {
r_dy[ii] = dY[idx];
r_y[ii] = Y[idx];
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
float grad_out = r_dy[ii];
float out = r_y[ii];
float grad_i = out * ( 1.f - out) * grad_out;
r_dy[ii] = grad_i;
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++) {
int idx = tid + ii * blockDim.x * gridDim.x;
if(idx < features * batch_size) {
dX[idx] = r_dy[ii];
}
}
}
}
}
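Sigmoid_bprop computes dX = Y * (1 - Y) * dY from the saved forward output Y. A quick numerical check of that identity against autograd (a PyTorch sketch, not part of the extension):

import torch

x = torch.randn(8, dtype=torch.double, requires_grad=True)
y = torch.sigmoid(x)
dy = torch.randn_like(y)

# Gradient via autograd.
dx_ref, = torch.autograd.grad(y, x, grad_outputs=dy)

# Same formula the Sigmoid_bprop kernel applies elementwise: out * (1 - out) * grad_out.
dx_formula = y.detach() * (1.0 - y.detach()) * dy

assert torch.allclose(dx_ref, dx_formula)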
// Compute grid size for pointwise backward kernel.
// Some intelligence is needed to determine the number of splits for the reduction.
// block_x/y is the total number of elements handled per block, not the number of threads
void get_biasAddRelu_bprop_grid_size(
int yfeat,
int threadsPerBlock,
int batch_size,
int block_x,
int block_y,
int* grid_x,
int* grid_y) {
*grid_x = (yfeat + block_x - 1) / block_x;
// Get number of SMs for efficient reduction.
int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
// First preference, whole reduction in 1 CTA
int nBlocks = (yfeat + threadsPerBlock - 1) / threadsPerBlock;
// Figure out how many splits to divide reduction into. At least 32 elements per CTA.
// we want grid_y as close to sqrt(batchsize)?
int nRedSplits = std::sqrt(batch_size);
// for batchsize <=64, just use 1 block
if(batch_size < 64) nRedSplits = 1;
// no need to go over occupancy
nRedSplits = min((8*num_SMs)/nBlocks, nRedSplits);
*grid_x = nBlocks;
*grid_y = nRedSplits;
// can switch to an occupancy calculation; use 4 below for now (tuned for sm_70)
int max_blocks_y = num_SMs * 4 / (*grid_x);
// block_y comes from the minimum amount of work per thread
int nRedSplits = (batch_size + block_y - 1) / block_y;
// prefer increasing the per-thread reduction length over launching more blocks than needed;
// the kernel adjusts its own work, so just cap the number of blocks here
*grid_y = std::min(nRedSplits, max_blocks_y);
return;
}
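A worked example of the grid sizing above, mirroring the arithmetic of get_biasAddRelu_bprop_grid_size with illustrative numbers (yfeat=1024, batch_size=65536, 80 SMs):

def bprop_grid_size(yfeat, batch_size, block_x, block_y, num_SMs):
    # Mirrors get_biasAddRelu_bprop_grid_size: block_x/block_y are elements
    # handled per block, not thread counts.
    grid_x = (yfeat + block_x - 1) // block_x
    max_blocks_y = num_SMs * 4 // grid_x            # rough occupancy cap (sm_70 heuristic)
    n_red_splits = (batch_size + block_y - 1) // block_y
    grid_y = min(n_red_splits, max_blocks_y)
    return grid_x, grid_y

# With BIAS_RELU_BW_NTHREADS_X=32, BIAS_RELU_BW_NTHREADS_Y=16, BIAS_RELU_RED_PER_THREAD=16:
# block_x = 32, block_y = 16 * 16 = 256
print(bprop_grid_size(1024, 65536, 32, 256, num_SMs=80))   # -> (32, 10)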
// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template <typename T, int UNROLL_FACTOR>
__global__ void biasAdd_bprop(
T* dY,
int features,
int batch_size,
volatile float* intermediate,
int* semaphores,
T* db) {
// The feature that this thread is responsible for
int f = blockIdx.x * blockDim.x + threadIdx.x;
// Compute the span this thread is responsible for
// For this block
int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
int b_nStart = blockIdx.y * b_chunkSize;
int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;
// For this thread
int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
int nStart = threadIdx.y * chunkSize + b_nStart;
int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;
volatile float* out = intermediate + blockIdx.y * features;
// Flag to trigger last reduction.
__shared__ bool isLastBlock;
// we know block size for now
__shared__ float smem[BIAS_RELU_BW_NTHREADS_X*BIAS_RELU_BW_NTHREADS_Y];
// Accumulate db in FP32 always
float db_local = 0;
if (f < features) {
int nidx = 0;
// Handle non-multiple of UNROLL_FACTOR residue
for (; nidx < nSpan % UNROLL_FACTOR; nidx++) {
int row, col, flat_idx;
row = f;
col = nStart + nidx;
flat_idx = col * features + row;
db_local += (float)dY[flat_idx];
}
// Handle meat of work
for (; (nidx + UNROLL_FACTOR - 1) < nSpan; nidx += UNROLL_FACTOR) {
int row, col, flat_idx;
row = f;
col = nStart + nidx;
flat_idx = col * features + row;
#pragma unroll 4
for (int u = 0; u < UNROLL_FACTOR; u++) {
db_local += (float)dY[flat_idx];
flat_idx += features;
}
}
// naive block reduction on y-dim
int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
smem[linear_idx] = db_local;
}
__syncthreads();
if (f < features) {
if(threadIdx.y == 0) {
for(int yidx = 1; yidx < blockDim.y; yidx++){
db_local += smem[yidx * blockDim.x + threadIdx.x];
}
// block result is in db_local now for all threadIdx.y == 0
// Write out partial result
out[f] = db_local;
}
}
__threadfence();
__syncthreads();
// Increment semaphore and check if this is the last CTA in the grid_y dimension.
// Only thread (0,0) calls this
if (threadIdx.x == 0 && threadIdx.y == 0 && f < features) {
unsigned int sum_idx;
sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
isLastBlock = (sum_idx == (gridDim.y - 1));
}
__syncthreads();
db_local = 0;
// No block reduction for now; only threads with threadIdx.y == 0 do the grid reduction
if (isLastBlock && f < features) {
if(threadIdx.y == 0) {
for (int n = 0; n < gridDim.y; n++) {
int row, col;
row = f;
col = n;
db_local += (float)(intermediate[col * features + row]);
}
db[f] = (T)db_local;
}
}
}
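Determinism comes from the fixed two-pass structure: each CTA along grid-y writes one partial bias-gradient row, and the single last CTA accumulates those partials in a fixed order. A NumPy sketch of the same reduction (shapes illustrative):

import numpy as np

batch_size, features, grid_y = 1024, 256, 8
dY = np.random.randn(batch_size, features).astype(np.float32)

# Pass 1: each of the grid_y CTAs reduces its own slice of the batch.
chunk = (batch_size + grid_y - 1) // grid_y
partials = np.stack([dY[i * chunk:(i + 1) * chunk].sum(axis=0, dtype=np.float32)
                     for i in range(grid_y)])

# Pass 2: the last CTA accumulates the partials serially, in a fixed order.
db = partials.sum(axis=0, dtype=np.float32)

# Matches a direct reduction over the batch dimension.
assert np.allclose(db, dY.sum(axis=0, dtype=np.float32), atol=1e-3)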
// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
template <typename T, int UNROLL_FACTOR>
......@@ -245,14 +576,22 @@ __global__ void biasAddRelu_bprop(
// The feature that this thread is responsible for
int f = blockIdx.x * blockDim.x + threadIdx.x;
// Compute the batch span this thread is responsible for
int chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
int nStart = blockIdx.y * chunkSize;
int nSpan = min(batch_size, nStart + chunkSize) - nStart;
// Compute the span this thread is responsible for
// For this block
int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
int b_nStart = blockIdx.y * b_chunkSize;
int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;
// For this thread
int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
int nStart = threadIdx.y * chunkSize + b_nStart;
int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;
volatile float* out = intermediate + blockIdx.y * features;
// Flag to trigger last reduction.
__shared__ bool isLastBlock;
// we know block size for now
__shared__ float smem[BIAS_RELU_BW_NTHREADS_X*BIAS_RELU_BW_NTHREADS_Y];
// Accumulate db in FP32 always
float db_local = 0;
......@@ -296,15 +635,28 @@ __global__ void biasAddRelu_bprop(
}
}
// Write out partial result
out[f] = db_local;
// naive block reduction on y-dim
int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
smem[linear_idx] = db_local;
}
__syncthreads();
if (f < features) {
if(threadIdx.y == 0) {
for(int yidx = 1; yidx < blockDim.y; yidx++){
db_local += smem[yidx * blockDim.x + threadIdx.x];
}
// block result is in db_local now for all threadIdx.y == 0
// Write out partial result
out[f] = db_local;
}
}
__threadfence();
__syncthreads();
// Increment semaphore and check if this is the last CTA in
// the grid_y dimension.
if (threadIdx.x == 0 && f < features) {
// Increment semaphore and check if this is the last CTA in the grid_y dimension.
// Only thread (0,0) calls this
if (threadIdx.x == 0 && threadIdx.y == 0 && f < features) {
unsigned int sum_idx;
sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
isLastBlock = (sum_idx == (gridDim.y - 1));
......@@ -312,14 +664,17 @@ __global__ void biasAddRelu_bprop(
__syncthreads();
db_local = 0;
// No block reduction for now; only threads with threadIdx.y == 0 do the grid reduction
if (isLastBlock && f < features) {
for (int n = 0; n < gridDim.y; n++) {
int row, col;
row = f;
col = n;
db_local += (float)(intermediate[col * features + row]);
if(threadIdx.y == 0) {
for (int n = 0; n < gridDim.y; n++) {
int row, col;
row = f;
col = n;
db_local += (float)(intermediate[col * features + row]);
}
db[f] = (T)db_local;
}
db[f] = (T)db_local;
}
}
......@@ -338,10 +693,16 @@ __global__ void biasAddRelu_bprop_aligned(
// The feature that this thread is responsible for
int f = blockIdx.x * blockDim.x + threadIdx.x;
// Compute the batch span this thread is responsible for
int chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
int nStart = blockIdx.y * chunkSize;
int nSpan = min(batch_size, nStart + chunkSize) - nStart;
// Compute the span this thread is responsible for
// For this block
int b_chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
int b_nStart = blockIdx.y * b_chunkSize;
int b_nSpan = min(batch_size, b_nStart + b_chunkSize) - b_nStart;
// For this thread
int chunkSize = (b_chunkSize + blockDim.y - 1) / blockDim.y;
int nStart = threadIdx.y * chunkSize + b_nStart;
int nSpan = min(b_nStart + b_nSpan, nStart + chunkSize) - nStart;
volatile float* out = intermediate + blockIdx.y * features;
// Flag to trigger last reduction.
......@@ -399,24 +760,45 @@ __global__ void biasAddRelu_bprop_aligned(
}
}
if(gridDim.y == 1) {
// we know block size for now
__shared__ float smem[BIAS_RELU_BW_NTHREADS_X*BIAS_RELU_BW_NTHREADS_Y*ILP];
// naive block reduction on y-dim
int linear_idx = threadIdx.y * blockDim.x + threadIdx.x;
float* smem_out = smem + ILP * linear_idx;
#pragma unroll
for(int ii=0;ii<ILP;ii++){
r_dy[ii] = db_local[ii]; // reuse local dy buffer
}
load_store(db, r_dy, f, 0);
return;
for(int ii=0;ii<ILP;ii++){
smem_out[ii] = db_local[ii]; // reuse local dy buffer
}
__syncthreads();
if(threadIdx.y == 0) {
for(int yidx = 1; yidx < blockDim.y; yidx++){
float* smem_in = smem + ILP * (yidx * blockDim.x + threadIdx.x);
#pragma unroll
for(int ii=0;ii<ILP;ii++){
db_local[ii] += smem_in[ii]; // reuse local dy buffer
}
}
// Write out partial result
load_store(out, db_local, f, 0);
// block result is in db_local now for all threadIdx.y == 0
// TODO: this early exit may not be useful
if(gridDim.y == 1) {
#pragma unroll
for(int ii=0;ii<ILP;ii++){
r_dy[ii] = db_local[ii]; // reuse local dy buffer
}
load_store(db, r_dy, f, 0);
return;
}
// Write out partial result
load_store(out, db_local, f, 0);
}
__threadfence();
__syncthreads();
// Increment semaphore and check if this is the last CTA in
// the grid_y dimension.
if (threadIdx.x == 0) {
// Increment semaphore and check if this is the last CTA in the grid_y dimension.
// Only thread (0,0) calls this
if (threadIdx.x == 0 && threadIdx.y == 0) {
unsigned int sum_idx;
sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
isLastBlock = (sum_idx == (gridDim.y - 1));
......@@ -428,22 +810,26 @@ __global__ void biasAddRelu_bprop_aligned(
db_local[ii] = 0.f;
}
float r_db[ILP];
// No block reduction for now; only threads with threadIdx.y == 0 do the grid reduction
if (isLastBlock) {
for (int n = 0; n < gridDim.y; n++) {
int row, col;
row = f;
col = n;
load_store(r_db, intermediate, 0, col * features / ILP + row);
if(threadIdx.y == 0){
for (int n = 0; n < gridDim.y; n++) {
int row, col;
row = f;
col = n;
load_store(r_db, intermediate, 0, col * features / ILP + row);
#pragma unroll
for(int ii=0;ii<ILP;ii++){
db_local[ii] += r_db[ii];
for(int ii=0;ii<ILP;ii++){
db_local[ii] += r_db[ii];
}
}
}
#pragma unroll
for(int ii=0;ii<ILP;ii++){
r_dy[ii] = db_local[ii]; // reuse local dy buffer
for(int ii=0;ii<ILP;ii++){
r_dy[ii] = db_local[ii]; // reuse local dy buffer
}
load_store(db, r_dy, f, 0);
}
load_store(db, r_dy, f, 0);
}
}
......@@ -502,10 +888,20 @@ size_t get_reduction_scratch_space(int batch_size, int num_layers, const int* ou
size_t max_scratch_space = 0;
// Loop over all layers to see which one needs the max scratch space
for (int l = 0; l < num_layers; l++) {
int tmp, num_splits;
// need to find max(aligned, not_aligned)
int tmp, res0, res1;
int block_x = BIAS_RELU_BW_NTHREADS_X;
int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
get_biasAddRelu_bprop_grid_size(
output_features[l], batch_size, block_x, block_y, &tmp, &res0);
block_x = ILP * BIAS_RELU_BW_NTHREADS_X;
get_biasAddRelu_bprop_grid_size(
output_features[l], BIASADDRELU_BPROP_NUM_THREADS, batch_size, &tmp, &num_splits);
max_scratch_space = std::max(max_scratch_space, (size_t)(output_features[l] * num_splits));
output_features[l], batch_size, block_x, block_y, &tmp, &res1);
max_scratch_space = std::max(max_scratch_space, (size_t)(output_features[l] * res0));
max_scratch_space = std::max(max_scratch_space, (size_t)(output_features[l] * res1));
}
return max_scratch_space;
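Because the aligned (ILP-vectorized) and non-aligned backward kernels use different block_x values and may therefore get different grid_y splits, the scratch buffer is sized for the larger of the two. A small sketch of that sizing logic under the constants defined above (num_SMs is illustrative):

def reduction_scratch_elems(batch_size, output_features, num_SMs=80, ILP=4,
                            nthreads_x=32, nthreads_y=16, red_per_thread=16):
    # Mirrors get_reduction_scratch_space: take max(aligned, non-aligned) per layer.
    def grid_y(yfeat, block_x, block_y):
        grid_x = (yfeat + block_x - 1) // block_x
        max_blocks_y = num_SMs * 4 // grid_x
        return min((batch_size + block_y - 1) // block_y, max_blocks_y)

    block_y = red_per_thread * nthreads_y
    scratch = 0
    for yfeat in output_features:
        gy0 = grid_y(yfeat, nthreads_x, block_y)         # non-aligned kernel
        gy1 = grid_y(yfeat, ILP * nthreads_x, block_y)   # aligned (vectorized) kernel
        scratch = max(scratch, yfeat * gy0, yfeat * gy1)
    return scratch

print(reduction_scratch_elems(65536, [1024, 512, 256]))   # -> 40960 with these numbers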
......@@ -581,7 +977,9 @@ int mlp_fp(
int* output_features,
T** BPtr,
T* Y,
T* reserved_space) {
T* reserved_space,
int use_bias,
int activation) {
T *weight, *input, *output, *bias;
T *reserved_space_x, *reserved_space_y;
reserved_space_x = NULL;
......@@ -597,7 +995,9 @@ int mlp_fp(
weight = WPtr[layer];
input = (layer == 0) ? X : reserved_space_x;
output = (layer == num_layers - 1) ? Y : reserved_space_y;
bias = BPtr[layer];
if (use_bias) {
bias = BPtr[layer];
}
int ifeat = (layer == 0) ? input_features : output_features[layer - 1];
int ofeat = output_features[layer];
......@@ -627,12 +1027,33 @@ int mlp_fp(
return 1;
}
// Call biasReLU
const uint &input_size = ofeat;
int num_blocks = 0;
int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, biasAddRelu_fprop<T>, BIASADDRELU_FPROP_NUM_THREADS, 0);
biasAddRelu_fprop<<<num_SMs*num_blocks, BIASADDRELU_FPROP_NUM_THREADS, 0, stream>>>(output, bias, batch_size, input_size);
// Apply bias and/or activation
if(use_bias == 1) {
if (activation == 0) { // no activation
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, biasAdd_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
biasAdd_fprop<<<num_SMs*num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, bias, batch_size, input_size);
} else if (activation == 1) { // relu
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, biasAddRelu_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
biasAddRelu_fprop<<<num_SMs*num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, bias, batch_size, input_size);
} else if (activation == 2) { // sigmoid
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, biasAdd_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
biasAdd_fprop<<<num_SMs*num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, bias, batch_size, input_size);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
Sigmoid_fprop<<<num_SMs*num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, batch_size, input_size);
}
} else {
// don't need to do anything in case of no activation and no bias
if (activation == 1) { // relu
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Relu_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
Relu_fprop<<<num_SMs*num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, batch_size, input_size);
} else if (activation == 2) { // sigmoid
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_fprop<T>, BIAS_RELU_FW_NTHREADS, 0);
Sigmoid_fprop<<<num_SMs*num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(output, batch_size, input_size);
}
}
// Set current output as next layer input
reserved_space_x = reserved_space_y;
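Per layer, the forward pass is a GEMM followed by an optional fused bias/activation epilogue selected by (use_bias, activation). In plain terms, each layer computes the following (a PyTorch sketch of the math, not the fused kernels themselves):

import torch

def layer_forward(x, w, b, use_bias, activation):
    # x: [batch, in_features], w: [out_features, in_features], b: [out_features]
    y = x @ w.t()                      # GEMM
    if use_bias:
        y = y + b                      # biasAdd_fprop (fused with ReLU when activation == 1)
    if activation == 1:                # relu
        y = torch.relu(y)
    elif activation == 2:              # sigmoid
        y = torch.sigmoid(y)
    return y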
......@@ -660,7 +1081,10 @@ int mlp_bp(
T* work_space,
T* dX,
T** dwPtr,
T** dbPtr) {
T** dbPtr,
bool requires_grad,
int use_bias,
int activation) {
T* weight;
T *dweight, *dx, *dy, *dbias;
T *x, *y;
......@@ -719,32 +1143,85 @@ int mlp_bp(
float one = 1.f;
float zero = 0.f;
// Call bias ReLU backprop - first implementation, 1 thread per bias element
int threadsPerBlock = BIASADDRELU_BPROP_NUM_THREADS;
int grid_x, grid_y;
get_biasAddRelu_bprop_grid_size(yfeat, threadsPerBlock, batch_size, &grid_x, &grid_y);
dim3 block(threadsPerBlock);
cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
if(yfeat % (ILP * threadsPerBlock) == 0 &&
is_aligned(y) &&
is_aligned(dy) &&
is_aligned(dy_gemm) &&
is_aligned(dbias)){
dim3 grid(grid_x/ILP, grid_y);
biasAddRelu_bprop_aligned<T, 4><<<grid, block, 0, stream>>>(
y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
} else {
dim3 grid(grid_x, grid_y);
biasAddRelu_bprop<T, 4><<<grid, block, 0, stream>>>(
y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
if (use_bias == 1) {
if (activation == 0) { // no activation
// bgrad
dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
int grid_x, grid_y;
cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
int block_x = BIAS_RELU_BW_NTHREADS_X;
int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
dim3 grid(grid_x, grid_y);
biasAdd_bprop<T, 4><<<grid, block, 0, stream>>>(
dy, yfeat, batch_size, db_scratch, semaphores, dbias);
// no activation backward needed: point dy_gemm at dy and use it directly in the GEMMs
dy_gemm = dy;
} else if (activation == 1) { // relu
dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
int grid_x, grid_y;
cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
if(yfeat % (ILP * BIAS_RELU_BW_NTHREADS_X) == 0 &&
is_aligned(y) &&
is_aligned(dy) &&
is_aligned(dy_gemm) &&
is_aligned(dbias)){
int block_x = ILP * BIAS_RELU_BW_NTHREADS_X;
int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
dim3 grid(grid_x, grid_y);
biasAddRelu_bprop_aligned<T, 4><<<grid, block, 0, stream>>>(
y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
} else {
int block_x = BIAS_RELU_BW_NTHREADS_X;
int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
dim3 grid(grid_x, grid_y);
biasAddRelu_bprop<T, 4><<<grid, block, 0, stream>>>(
y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
}
} else if (activation == 2) { // sigmoid
// activation backward
int num_blocks = 0;
int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
Sigmoid_bprop<<<num_SMs*num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(dy, y, batch_size, yfeat, dy_gemm);
// bgrad, from dy_gemm
dim3 block(BIAS_RELU_BW_NTHREADS_X, BIAS_RELU_BW_NTHREADS_Y);
int grid_x, grid_y;
cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
int block_x = BIAS_RELU_BW_NTHREADS_X;
int block_y = BIAS_RELU_RED_PER_THREAD * BIAS_RELU_BW_NTHREADS_Y;
get_biasAddRelu_bprop_grid_size(yfeat, batch_size, block_x, block_y, &grid_x, &grid_y);
dim3 grid(grid_x, grid_y);
biasAdd_bprop<T, 4><<<grid, block, 0, stream>>>(
dy_gemm, yfeat, batch_size, db_scratch, semaphores, dbias);
}
} else { // no bias below
if (activation == 0) {
// no activation backward needed: point dy_gemm at dy and use it directly in the GEMMs
dy_gemm = dy;
} else if (activation == 1) { // relu
int num_blocks = 0;
int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Relu_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
Relu_bprop<<<num_SMs*num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(dy, y, batch_size, yfeat, dy_gemm);
} else if (activation == 2) { // sigmoid
int num_blocks = 0;
int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, Sigmoid_bprop<T>, BIAS_RELU_FW_NTHREADS, 0);
Sigmoid_bprop<<<num_SMs*num_blocks, BIAS_RELU_FW_NTHREADS, 0, stream>>>(dy, y, batch_size, yfeat, dy_gemm);
}
}
cublasStatus_t cublas_status;
// Call GEMM dgrad
cublas_status = mlp_gemm(
if (layer > 0 || requires_grad == 1) {
cublas_status = mlp_gemm(
handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
......@@ -760,9 +1237,10 @@ int mlp_bp(
dx,
xfeat);
if (cublas_status != CUBLAS_STATUS_SUCCESS) {
printf("GEMM dgrad failed with %d\n", cublas_status);
return 1;
if (cublas_status != CUBLAS_STATUS_SUCCESS) {
printf("GEMM dgrad failed with %d\n", cublas_status);
return 1;
}
}
// Call GEMM wgrad
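The backward dispatch mirrors the forward: first undo the activation (or bypass it) to get the gradient w.r.t. the pre-activation output, reduce that over the batch for bgrad when bias is enabled, then run the dgrad GEMM (skipped for layer 0 when the input does not require grad) and the wgrad GEMM. A per-layer PyTorch sketch of the same math (names are illustrative):

import torch

def layer_backward(x, w, y, dy, use_bias, activation, need_dx):
    # y is the saved post-activation output of this layer, dy its incoming gradient.
    if activation == 1:                         # relu
        dy_pre = dy * (y > 0).to(dy.dtype)      # Relu_bprop / biasAddRelu_bprop
    elif activation == 2:                       # sigmoid
        dy_pre = dy * y * (1.0 - y)             # Sigmoid_bprop
    else:                                       # no activation: dy_gemm = dy
        dy_pre = dy
    db = dy_pre.sum(dim=0) if use_bias else None    # biasAdd_bprop reduction
    dx = dy_pre @ w if need_dx else None             # dgrad GEMM (skipped for layer 0 w/o grad)
    dw = dy_pre.t() @ x                              # wgrad GEMM
    return dx, dw, db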
......@@ -801,7 +1279,9 @@ template int mlp_fp<float>(
int* output_features,
float** BPtr,
float* Y,
float* reserved_space);
float* reserved_space,
int use_bias,
int activation);
template int mlp_bp<float>(
float* X,
......@@ -816,7 +1296,10 @@ template int mlp_bp<float>(
float* work_space,
float* dX,
float** dwPtr,
float** dbPtr);
float** dbPtr,
bool requires_grad,
int use_bias,
int activation);
template int mlp_fp<at::Half>(
at::Half* X,
......@@ -827,7 +1310,9 @@ template int mlp_fp<at::Half>(
int* output_features,
at::Half** BPtr,
at::Half* Y,
at::Half* reserved_space);
at::Half* reserved_space,
int use_bias,
int activation);
template int mlp_bp<at::Half>(
at::Half* X,
......@@ -842,7 +1327,10 @@ template int mlp_bp<at::Half>(
at::Half* work_space,
at::Half* dX,
at::Half** dwPtr,
at::Half** dbPtr);
at::Half** dbPtr,
bool requires_grad,
int use_bias,
int activation);
template int mlp_fp<double>(
double* X,
......@@ -853,7 +1341,9 @@ template int mlp_fp<double>(
int* output_features,
double** BPtr,
double* Y,
double* reserved_space);
double* reserved_space,
int use_bias,
int activation);
template int mlp_bp<double>(
double* X,
......@@ -868,7 +1358,10 @@ template int mlp_bp<double>(
double* work_space,
double* dX,
double** dwPtr,
double** dbPtr);
double** dbPtr,
bool requires_grad,
int use_bias,
int activation);
template size_t get_mlp_bp_workspace_in_bytes<float>(
int batch_size,
......
......@@ -51,6 +51,116 @@ class TestMLP(unittest.TestCase):
ref_mlp[0].bias.grad.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
def test_no_bias(self):
for use_activation in ['none', 'relu', 'sigmoid']:
mlp = MLP(mlp_sizes, bias=False, activation=use_activation).cuda()
mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=False)
mlp.weights[i].data.copy_(linear.weight)
mlp_layers.append(linear)
if use_activation == 'relu':
mlp_layers.append(nn.ReLU(inplace=True))
if use_activation == 'sigmoid':
mlp_layers.append(nn.Sigmoid())
ref_mlp = nn.Sequential(*mlp_layers).cuda()
test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
np.testing.assert_allclose(
mlp_out.detach().cpu().numpy(),
ref_out.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
# Use the mean as a scalar loss. Multiply by 10 so the gradients are not vanishingly small.
mlp_out.mean().mul(10.).backward()
ref_out.mean().mul(10.).backward()
np.testing.assert_allclose(
test_input.grad.detach().cpu().numpy(),
ref_input.grad.detach().cpu().numpy(),
atol=0, rtol=100)
np.testing.assert_allclose(
mlp.weights[0].grad.detach().cpu().numpy(),
ref_mlp[0].weight.grad.detach().cpu().numpy(),
atol=1e-7, rtol=100)
def test_with_bias(self):
for use_activation in ['none', 'relu', 'sigmoid']:
mlp = MLP(mlp_sizes, bias=True, activation=use_activation).cuda()
mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1], bias=True)
mlp.weights[i].data.copy_(linear.weight)
mlp.biases[i].data.copy_(linear.bias)
mlp_layers.append(linear)
if use_activation == 'relu':
mlp_layers.append(nn.ReLU(inplace=True))
if use_activation == 'sigmoid':
mlp_layers.append(nn.Sigmoid())
ref_mlp = nn.Sequential(*mlp_layers).cuda()
test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
ref_input = test_input.clone().detach().requires_grad_()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
np.testing.assert_allclose(
mlp_out.detach().cpu().numpy(),
ref_out.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
# Use the mean as a scalar loss. Multiply by 10 so the gradients are not vanishingly small.
mlp_out.mean().mul(10.).backward()
ref_out.mean().mul(10.).backward()
np.testing.assert_allclose(
test_input.grad.detach().cpu().numpy(),
ref_input.grad.detach().cpu().numpy(),
atol=0, rtol=1)
np.testing.assert_allclose(
mlp.weights[0].grad.detach().cpu().numpy(),
ref_mlp[0].weight.grad.detach().cpu().numpy(),
atol=1e-7, rtol=1)
np.testing.assert_allclose(
mlp.biases[0].grad.detach().cpu().numpy(),
ref_mlp[0].bias.grad.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
def test_no_grad(self):
mlp = MLP(mlp_sizes).cuda()
mlp_layers = []
for i in range(mlp.num_layers):
linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
mlp.weights[i].data.copy_(linear.weight)
mlp.biases[i].data.copy_(linear.bias)
mlp_layers.append(linear)
mlp_layers.append(nn.ReLU(inplace=True))
ref_mlp = nn.Sequential(*mlp_layers).cuda()
test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.)
ref_input = test_input.clone().detach()
mlp_out = mlp(test_input)
ref_out = ref_mlp(ref_input)
np.testing.assert_allclose(
mlp_out.detach().cpu().numpy(),
ref_out.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
# Use the mean as a scalar loss. Multiply by 10 so the gradients are not vanishingly small.
mlp_out.mean().mul(10.).backward()
ref_out.mean().mul(10.).backward()
np.testing.assert_allclose(
mlp.weights[0].grad.detach().cpu().numpy(),
ref_mlp[0].weight.grad.detach().cpu().numpy(),
atol=1e-7, rtol=1e-5)
def test_performance_half(self):
mlp = MLP(mlp_sizes).cuda().half()
......