Unverified commit ed5456d3 authored by Hang Zhang, committed by GitHub

add lib (#61)

parent d8abf505
@@ -6,7 +6,5 @@ build/
data/
docs/src/
docs/html/
encoding/lib/
encoding/_ext/
encoding.egg-info/
experiments/segmentation/
import os
import torch
from torch.utils.cpp_extension import load
cwd = os.path.dirname(os.path.realpath(__file__))
cpu_path = os.path.join(cwd, 'cpu')
gpu_path = os.path.join(cwd, 'gpu')
cpu = load('enclib_cpu', [
os.path.join(cpu_path, 'roi_align.cpp'),
os.path.join(cpu_path, 'roi_align_cpu.cpp'),
], build_directory=cpu_path, verbose=False)
if torch.cuda.is_available():
gpu = load('enclib_gpu', [
os.path.join(gpu_path, 'operator.cpp'),
os.path.join(gpu_path, 'encoding_kernel.cu'),
os.path.join(gpu_path, 'syncbn_kernel.cu'),
os.path.join(gpu_path, 'roi_align_kernel.cu'),
], build_directory=gpu_path, verbose=False)
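For reference, a minimal sketch of calling the JIT-built CPU module returned by load() above; the feature-map shape, ROI values, and spatial_scale are illustrative assumptions, not part of this commit.
import torch
feat = torch.randn(1, 256, 32, 32)                 # N x C x H x W feature map
rois = torch.tensor([[0., 8., 8., 120., 120.]])    # [batch_index, x1, y1, x2, y2]
out = cpu.roi_align_forward(feat, rois, 7, 7, 1.0 / 16, 2)
# out has shape (num_rois, C, pooled_h, pooled_w) == (1, 256, 7, 7)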
#include <torch/torch.h>
// CPU declarations
at::Tensor ROIAlignForwardCPU(
const at::Tensor& input,
const at::Tensor& bottom_rois,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio);
at::Tensor ROIAlignBackwardCPU(
const at::Tensor& bottom_rois,
const at::Tensor& grad_output, // gradient of the output of the layer
int64_t b_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("roi_align_forward", &ROIAlignForwardCPU, "ROI Align forward (CPU)");
m.def("roi_align_backward", &ROIAlignBackwardCPU, "ROI Align backward (CPU)");
}
#include <ATen/ATen.h>
//#include <omp.h>
template <typename T>
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
T w1;
T w2;
T w3;
T w4;
};
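// Precompute, for every sampling point of every output bin of one ROI, the four
// neighbouring feature-map offsets (pos1..pos4) and bilinear weights (w1..w4);
// the per-channel loop in ROIAlignForwardCompute below reuses them unchanged.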
template <typename T>
void pre_calc_for_bilinear_interpolate(
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int iy_upper,
const int ix_upper,
T roi_start_h,
T roi_start_w,
T bin_size_h,
T bin_size_w,
int roi_bin_grid_h,
int roi_bin_grid_w,
std::vector<PreCalc<T>>* pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T x = xx;
T y = yy;
// deal with cases where the sampling point falls outside the feature map
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc<T> pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc->at(pre_calc_index) = pc;
pre_calc_index += 1;
continue;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = static_cast<int>(y);
int x_low = static_cast<int>(x);
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc->at(pre_calc_index) = pc;
pre_calc_index += 1;
}
}
}
}
}
template <typename T>
void ROIAlignForwardCompute(
const int nthreads,
const T* bottom_data,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* bottom_rois,
int roi_cols,
T* top_data) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
// roi could have 4 or 5 columns
const T* offset_bottom_rois = bottom_rois + n * roi_cols;
int roi_batch_ind = 0;
if (roi_cols == 5) {
roi_batch_ind = offset_bottom_rois[0];
offset_bottom_rois++;
}
// Do not use rounding; this implementation detail is critical
T roi_start_w = offset_bottom_rois[0] * spatial_scale;
T roi_start_h = offset_bottom_rois[1] * spatial_scale;
T roi_end_w = offset_bottom_rois[2] * spatial_scale;
T roi_end_h = offset_bottom_rois[3] * spatial_scale;
// Force malformed ROIs to be 1x1
T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
// we want to precalculate indices and weights shared by all channels;
// this is the key point of the optimization
std::vector<PreCalc<T>> pre_calc(
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
height,
width,
pooled_height,
pooled_width,
roi_bin_grid_h,
roi_bin_grid_w,
roi_start_h,
roi_start_w,
bin_size_h,
bin_size_w,
roi_bin_grid_h,
roi_bin_grid_w,
&pre_calc);
int c;
// Parallelize over channels when built with OpenMP; the original
// num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) clause
// is dropped because that helper is not available in this extension.
#pragma omp parallel for private(c)
for (c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc<T> pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_bottom_data[pc.pos1] +
pc.w2 * offset_bottom_data[pc.pos2] +
pc.w3 * offset_bottom_data[pc.pos3] +
pc.w4 * offset_bottom_data[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
top_data[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
template <typename T>
void bilinear_interpolate_gradient(
const int height,
const int width,
T y,
T x,
T* w1,
T* w2,
T* w3,
T* w4,
int* x_low,
int* x_high,
int* y_low,
int* y_high,
const int /*index*/ /* index for debug only*/) {
// deal with cases where the sampling point falls outside the feature map
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
*w1 = *w2 = *w3 = *w4 = 0.;
*x_low = *x_high = *y_low = *y_high = -1;
return;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
*y_low = static_cast<int>(y);
*x_low = static_cast<int>(x);
if (*y_low >= height - 1) {
*y_high = *y_low = height - 1;
y = (T)*y_low;
} else {
*y_high = *y_low + 1;
}
if (*x_low >= width - 1) {
*x_high = *x_low = width - 1;
x = (T)*x_low;
} else {
*x_high = *x_low + 1;
}
T ly = y - *y_low;
T lx = x - *x_low;
T hy = 1. - ly, hx = 1. - lx;
*w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
return;
}
template <class T>
inline void add(const T& val, T* address) {
*address += val;
}
template <typename T>
void ROIAlignBackwardCompute(
const int nthreads,
const T* top_diff,
const int /*num_rois*/,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
T* bottom_diff,
const T* bottom_rois,
int rois_cols) {
for (int index = 0; index < nthreads; index++) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * rois_cols;
int roi_batch_ind = 0;
if (rois_cols == 5) {
roi_batch_ind = offset_bottom_rois[0];
offset_bottom_rois++;
}
// Do not use rounding; this implementation detail is critical
T roi_start_w = offset_bottom_rois[0] * spatial_scale;
T roi_start_h = offset_bottom_rois[1] * spatial_scale;
T roi_end_w = offset_bottom_rois[2] * spatial_scale;
T roi_end_h = offset_bottom_rois[3] * spatial_scale;
// Force malformed ROIs to be 1x1
T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height,
width,
y,
x,
&w1,
&w2,
&w3,
&w4,
&x_low,
&x_high,
&y_low,
&y_high,
index);
T g1 = top_diff_this_bin * w1 / count;
T g2 = top_diff_this_bin * w2 / count;
T g3 = top_diff_this_bin * w3 / count;
T g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
// atomic add is not needed for now since it is single threaded
add(static_cast<T>(g1), offset_bottom_diff + y_low * width + x_low);
add(static_cast<T>(g2), offset_bottom_diff + y_low * width + x_high);
add(static_cast<T>(g3), offset_bottom_diff + y_high * width + x_low);
add(static_cast<T>(g4), offset_bottom_diff + y_high * width + x_high);
} // if
} // ix
} // iy
} // for
} // ROIAlignBackward
at::Tensor ROIAlignForwardCPU(
const at::Tensor& input,
const at::Tensor& bottom_rois,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio) {
AT_ASSERT(input.is_contiguous());
AT_ASSERT(bottom_rois.is_contiguous());
AT_ASSERT(input.ndimension() == 4);
AT_ASSERT(bottom_rois.ndimension() == 2);
AT_ASSERT(bottom_rois.size(1) == 5);
// ROIs is the set of region proposals to process. It is a 2D at::Tensor where the first
// dim is the # of proposals, and the second dim is the proposal itself in the form
// [batch_index startW startH endW endH]
auto num_rois = bottom_rois.size(0);
auto roi_cols = bottom_rois.size(1);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
AT_ASSERT(roi_cols == 4 || roi_cols == 5);
// Output at::Tensor is (num_rois, C, pooled_height, pooled_width)
auto output = input.type().tensor({num_rois, channels, pooled_height, pooled_width});
AT_ASSERT(input.is_contiguous());
AT_ASSERT(bottom_rois.is_contiguous());
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlignForwardCPU", ([&] {
ROIAlignForwardCompute<scalar_t>(
output.numel(),
input.data<scalar_t>(),
static_cast<scalar_t>(spatial_scale),
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
bottom_rois.data<scalar_t>(),
roi_cols,
output.data<scalar_t>());
}));
return output;
}
at::Tensor ROIAlignBackwardCPU(
const at::Tensor& bottom_rois,
const at::Tensor& grad_output, // gradient of the output of the layer
int64_t b_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio)
{
AT_ASSERT(bottom_rois.is_contiguous());
AT_ASSERT(bottom_rois.ndimension() == 2);
AT_ASSERT(bottom_rois.size(1) == 5);
auto num_rois = bottom_rois.size(0);
auto roi_cols = bottom_rois.size(1);
AT_ASSERT(roi_cols == 4 || roi_cols == 5);
// Gradient w.r.t. the input features is (b_size, C, height, width)
auto grad_in = bottom_rois.type().tensor({b_size, channels, height, width}).zero_();
AT_ASSERT(bottom_rois.is_contiguous());
AT_DISPATCH_FLOATING_TYPES(bottom_rois.type(), "ROIAlignBackwardCPU", ([&] {
ROIAlignBackwardCompute<scalar_t>(
grad_output.numel(),
grad_output.data<scalar_t>(),
num_rois,
static_cast<scalar_t>(spatial_scale),
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_in.data<scalar_t>(),
bottom_rois.data<scalar_t>(),
roi_cols);
}));
return grad_in;
}
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CppExtension
setup(
name='enclib_cpu',
ext_modules=[
CppExtension('enclib_cpu', [
'roi_align.cpp',
'roi_align_cpu.cpp',
]),
],
cmdclass={
'build_ext': BuildExtension
})
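If the extension is instead built ahead of time with the setup.py above, the resulting module can be imported directly; a minimal sketch (the install step is an assumption):
# e.g. `python setup.py install` run inside encoding/lib/cpu
import torch        # import torch first so its C++ symbols are loaded
import enclib_cpu   # exposes roi_align_forward / roi_align_backward as bound above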
#include <cuda.h>
#include <cuda_runtime.h>
static const unsigned WARP_SIZE = 32;
// The maximum number of threads in a block
static const unsigned MAX_BLOCK_SIZE = 512U;
template<typename In, typename Out>
struct ScalarConvert {
static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; }
};
// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
static int getNumThreads(int nElem) {
int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
for (int i = 0; i != 5; ++i) {
if (nElem <= threadSizes[i]) {
return threadSizes[i];
}
}
return MAX_BLOCK_SIZE;
}
// Returns the index of the most significant 1 bit in `val`.
__device__ __forceinline__ int getMSB(int val) {
return 31 - __clz(val);
}
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_xor_sync(mask, value, laneMask, width);
#else
return __shfl_xor(value, laneMask, width);
#endif
}
// Sum across all threads within a warp
template <typename T>
static __device__ __forceinline__ T warpSum(T val) {
#if __CUDA_ARCH__ >= 300
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
}
#else
__shared__ T values[MAX_BLOCK_SIZE];
values[threadIdx.x] = val;
__threadfence_block();
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
for (int i = 1; i < WARP_SIZE; i++) {
val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
}
#endif
return val;
}
template <typename DType, typename Acctype>
struct Float2 {
Acctype v1, v2;
__device__ Float2() {}
__device__ Float2(DType v1, DType v2) : v1(ScalarConvert<DType, Acctype>::to(v1)), v2(ScalarConvert<DType, Acctype>::to(v2)) {}
__device__ Float2(DType v) : v1(ScalarConvert<DType, Acctype>::to(v)), v2(ScalarConvert<DType, Acctype>::to(v)) {}
__device__ Float2(int v) : v1(ScalarConvert<int, Acctype>::to(v)), v2(ScalarConvert<int, Acctype>::to(v)) {}
__device__ Float2& operator+=(const Float2& a) {
v1 += a.v1;
v2 += a.v2;
return *this;
}
};
template <typename DType, typename Acctype>
static __device__ __forceinline__ Float2<DType, Acctype> warpSum(Float2<DType, Acctype> value) {
value.v1 = warpSum(value.v1);
value.v2 = warpSum(value.v2);
return value;
}
#include <ATen/ATen.h>
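// A lightweight, size-only tensor view for kernels: no strides are stored, so
// select()/operator[] compute flat offsets and the wrapped at::Tensor is
// assumed to be contiguous.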
template<typename DType, int Dim>
struct DeviceTensor {
public:
inline __device__ __host__ DeviceTensor(DType *p, const int *size)
: dptr_(p) {
for (int i = 0; i < Dim; ++i) {
size_[i] = size ? size[i] : 0;
}
}
inline __device__ __host__ unsigned getSize(const int i) const {
assert(i < Dim);
return size_[i];
}
inline __device__ __host__ int numElements() const {
int n = 1;
for (int i = 0; i < Dim; ++i) {
n *= size_[i];
}
return n;
}
inline __device__ __host__ DeviceTensor<DType, Dim-1> select(const size_t x) const {
assert(Dim > 1);
int offset = x;
for (int i = 1; i < Dim; ++i) {
offset *= size_[i];
}
DeviceTensor<DType, Dim-1> tensor(dptr_ + offset, nullptr);
for (int i = 0; i < Dim - 1; ++i) {
tensor.size_[i] = this->size_[i+1];
}
return tensor;
}
inline __device__ __host__ DeviceTensor<DType, Dim-1> operator[](const size_t x) const {
assert(Dim > 1);
int offset = x;
for (int i = 1; i < Dim; ++i) {
offset *= size_[i];
}
DeviceTensor<DType, Dim-1> tensor(dptr_ + offset, nullptr);
for (int i = 0; i < Dim - 1; ++i) {
tensor.size_[i] = this->size_[i+1];
}
return tensor;
}
inline __device__ __host__ size_t InnerSize() const {
assert(Dim >= 3);
size_t sz = 1;
for (size_t i = 2; i < Dim; ++i) {
sz *= size_[i];
}
return sz;
}
inline __device__ __host__ size_t ChannelCount() const {
assert(Dim >= 3);
return size_[1];
}
inline __device__ __host__ DType* data_ptr() const {
return dptr_;
}
DType *dptr_;
int size_[Dim];
};
template<typename DType>
struct DeviceTensor<DType, 1> {
inline __device__ __host__ DeviceTensor(DType *p, const int *size)
: dptr_(p) {
size_[0] = size ? size[0] : 0;
}
inline __device__ __host__ unsigned getSize(const int i) const {
assert(i == 0);
return size_[0];
}
inline __device__ __host__ int numElements() const {
return size_[0];
}
inline __device__ __host__ DType &operator[](const size_t x) const {
return *(dptr_ + x);
}
inline __device__ __host__ DType* data_ptr() const {
return dptr_;
}
DType *dptr_;
int size_[1];
};
template<typename DType, int Dim>
static DeviceTensor<DType, Dim> devicetensor(const at::Tensor &blob) {
DType *data = blob.data<DType>();
DeviceTensor<DType, Dim> tensor(data, nullptr);
for (int i = 0; i < Dim; ++i) {
tensor.size_[i] = blob.size(i);
}
return tensor;
}
#include <ATen/ATen.h>
#include <vector>
#include "common.h"
#include "device_tensor.h"
namespace {
template<typename DType, typename Acctype>
struct AggOp {
__device__ AggOp(DeviceTensor<DType, 3> a,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c) : A(a), X(x), C(c) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(A[b][i][k] * (X[b][i][d] - C[k][d]));
}
DeviceTensor<DType, 3> A;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
};
template<typename DType, typename Acctype>
struct AggBackOp {
__device__ AggBackOp(DeviceTensor<DType, 3> g,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c) : G(g), X(x), C(c) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(G[b][k][d] * (X[b][i][d] - C[k][d]));
}
DeviceTensor<DType, 3> G;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
};
template<typename DType, typename Acctype>
struct SL2Op {
__device__ SL2Op(DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c) : X(x), C(c) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d)
{
DType r = X[b][i][d] - C[k][d];
return ScalarConvert<DType, Acctype>::to(r * r);
}
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
};
template<typename DType, typename Acctype>
struct SL2GradXOp {
__device__ SL2GradXOp(
DeviceTensor<DType, 3> gsl,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 1> s
) : GSL(gsl), X(x), C(c), S(s) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d)
{
return ScalarConvert<DType, Acctype>::to(
2 * S[k] * GSL[b][i][k] * (X[b][i][d]-C[k][d]));
}
DeviceTensor<DType, 3> GSL;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 1> S;
};
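// Block-wide reduction helpers used by the kernels below: each thread first
// accumulates a strided partial sum, warpSum combines lanes within a warp, and
// a 32-slot shared array merges the per-warp results; shared[0] is returned to
// every thread in the block.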
template<typename T, typename Op>
__device__ T reduceN(
Op op, int b, int k, int d, int N) {
T sum = 0;
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(b,x,k,d);
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
template<typename T, typename Op>
__device__ T reduceD(
Op op, int b, int i, int k, int D) {
T sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
template<typename T, typename Op>
__device__ T reduceK(
Op op, int b, int i, int d, int K) {
T sum = 0;
for (int x = threadIdx.x; x < K; x += blockDim.x) {
sum += op(b,i,x,d);
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
template<typename T, typename Op>
__device__ T reduceBN(
Op op,
int k, int d, int B, int N) {
T sum = 0;
for (int batch = 0; batch < B; ++batch) {
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(batch,x,k,d);
}
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
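// Aggregate_Forward_kernel computes E[b][k][d] = sum_i A[b][i][k] * (X[b][i][d] - C[k][d]),
// launched with one block per (d, k, b) triple.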
template<typename DType, typename Acctype>
__global__ void Aggregate_Forward_kernel (
DeviceTensor<DType, 3> E,
DeviceTensor<DType, 3> A,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C) {
/* declarations of the variables */
int b, k, d, N;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
AggOp<DType, Acctype> g(A,X,C);
E[b][k][d] = reduceN<Acctype>(g, b, k, d, N);
}
template<typename DType, typename Acctype>
__global__ void Aggregate_Backward_kernel (
DeviceTensor<DType, 3> GA,
DeviceTensor<DType, 3> GE,
DeviceTensor<DType, 3> A,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C) {
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
AggBackOp<DType, Acctype> g(GE,X,C);
GA[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}
template<typename DType, typename Acctype>
__global__ void ScaledL2_Forward_kernel (
DeviceTensor<DType, 3> SL,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 1> S) {
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x;
i = blockIdx.y;
D = X.getSize(2);
/* main operation */
SL2Op<DType, Acctype> g(X,C);
SL[b][i][k] = S[k] * reduceD<Acctype>(g,b,i,k,D);
}
template<typename DType, typename Acctype>
__global__ void ScaledL2_GradX_kernel (
DeviceTensor<DType, 3> GSL,
DeviceTensor<DType, 3> GX,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 1> S) {
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
i = blockIdx.y;
K = C.getSize(0);
/* main operation */
SL2GradXOp<DType, Acctype> g(GSL,X,C,S);
GX[b][i][d] = reduceK<Acctype>(g,b,i,d,K);
}
template<typename DType, typename Acctype>
__global__ void ScaledL2_GradC_kernel (
DeviceTensor<DType, 3> GSL,
DeviceTensor<DType, 2> GC,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 1> S) {
/* declarations of the variables */
int k, d, B, N;
/* Get the index and channels */
d = blockIdx.x;
k = blockIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* main operation */
SL2GradXOp<DType, Acctype> g(GSL,X,C,S);
GC[k][d] = - reduceBN<Acctype>(g, k, d, B, N);
}
}// namespace
at::Tensor Aggregate_Forward_CUDA(
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_) {
/* Device tensors */
auto E_ = A_.type().tensor({A_.size(0), C_.size(0), C_.size(1)}).zero_();
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(1), C_.size(0), X_.size(0));
dim3 threads(getNumThreads(X_.size(1)));
AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Forward_CUDA", ([&] {
DeviceTensor<scalar_t, 3> E = devicetensor<scalar_t, 3>(E_);
DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
/* kernel function */
Aggregate_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>>(E, A, X, C);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return E_;
}
std::vector<at::Tensor> Aggregate_Backward_CUDA(
const at::Tensor GE_,
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_) {
auto gradA_ = at::zeros_like(A_);
auto gradX_ = at::bmm(A_, GE_);
auto gradC_ = (-GE_ * A_.sum(1).unsqueeze(2)).sum(0);
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GA = devicetensor<scalar_t, 3>(gradA_);
DeviceTensor<scalar_t, 3> GE = devicetensor<scalar_t, 3>(GE_);
DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
Aggregate_Backward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (GA, GE, A, X, C);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {gradA_, gradX_, gradC_};
}
at::Tensor ScaledL2_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_) {
auto SL_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
AT_DISPATCH_FLOATING_TYPES(X_.type(), "ScaledL2_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> SL = devicetensor<scalar_t, 3>(SL_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 1> S = devicetensor<scalar_t, 1>(S_);
/* kernel function */
ScaledL2_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (SL, X, C, S);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return SL_;
}
std::vector<at::Tensor> ScaledL2_Backward_CUDA(
const at::Tensor GSL_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_,
const at::Tensor SL_) {
auto GX_ = at::zeros_like(X_);
auto GC_ = at::zeros_like(C_);
/* kernel function */
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
dim3 threads1(getNumThreads(C_.size(0)));
dim3 blocks2(C_.size(1), C_.size(0));
dim3 threads2(getNumThreads(X_.size(1)));
//std::vector<int> size{ 1, 1, K};
//auto GS_ = GSL_ * (SL_ / at::_unsafe_view(S_, size))
auto GS_ = (GSL_ * (SL_ / S_.view({1, 1, C_.size(0)}))).sum(0).sum(0);
AT_DISPATCH_FLOATING_TYPES(X_.type(), "ScaledL2_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GSL = devicetensor<scalar_t, 3>(GSL_);
DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
DeviceTensor<scalar_t, 2> GC = devicetensor<scalar_t, 2>(GC_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 1> S = devicetensor<scalar_t, 1>(S_);
ScaledL2_GradX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GSL, GX, X, C, S);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
ScaledL2_GradC_kernel<scalar_t, scalar_t>
<<<blocks2, threads2, 0, stream>>> (GSL, GC, X, C, S);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}));
return {GX_, GC_, GS_};
}
#include "operator.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("roi_align_forward", &ROIAlignForwardCUDA, "ROI Align forward (CUDA)");
m.def("roi_align_backward", &ROIAlignBackwardCUDA, "ROI Align backward (CUDA)");
m.def("aggregate_forward", &Aggregate_Forward_CUDA, "Aggregate forward (CUDA)");
m.def("aggregate_backward", &Aggregate_Backward_CUDA, "Aggregate backward (CUDA)");
m.def("scaled_l2_forward", &ScaledL2_Forward_CUDA, "ScaledL2 forward (CUDA)");
m.def("scaled_l2_backward", &ScaledL2_Backward_CUDA, "ScaledL2 backward (CUDA)");
m.def("batchnorm_forward", &BatchNorm_Forward_CUDA, "BatchNorm forward (CUDA)");
m.def("batchnorm_backward", &BatchNorm_Backward_CUDA, "BatchNorm backward (CUDA)");
m.def("sumsquare_forward", &Sum_Square_Forward_CUDA, "SumSqu forward (CUDA)");
m.def("sumsquare_backward", &Sum_Square_Backward_CUDA, "SumSqu backward (CUDA)");
}
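A minimal sketch of driving the encoding ops bound above from Python; the tensor shapes, the softmax step, and the variable names are illustrative assumptions, and `gpu` is the JIT-loaded module from encoding/lib/__init__.py above.
import torch
B, N, K, D = 2, 64, 32, 128
X = torch.randn(B, N, D, device='cuda')   # features
C = torch.randn(K, D, device='cuda')      # codewords
S = torch.randn(K, device='cuda')         # per-codeword scales
SL = gpu.scaled_l2_forward(X, C, S)       # (B, N, K): S[k] * ||X[b,i] - C[k]||^2
A = torch.softmax(SL, dim=2)              # soft assignments
E = gpu.aggregate_forward(A, X, C)        # (B, K, D): sum_i A[b,i,k] * (X[b,i] - C[k])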
#include <torch/torch.h>
#include <vector>
at::Tensor ROIAlignForwardCUDA(
const at::Tensor input,
const at::Tensor rois,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sample_ratio);
at::Tensor ROIAlignBackwardCUDA(
const at::Tensor rois,
const at::Tensor grad_output,
int64_t b_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio);
at::Tensor Aggregate_Forward_CUDA(
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_);
std::vector<at::Tensor> Aggregate_Backward_CUDA(
const at::Tensor GE_,
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_);
at::Tensor ScaledL2_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_);
std::vector<at::Tensor> ScaledL2_Backward_CUDA(
const at::Tensor GSL_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_,
const at::Tensor SL_);
at::Tensor BatchNorm_Forward_CUDA(
const at::Tensor input_,
const at::Tensor mean_,
const at::Tensor std_,
const at::Tensor gamma_,
const at::Tensor beta_);
std::vector<at::Tensor> BatchNorm_Backward_CUDA(
const at::Tensor gradoutput_,
const at::Tensor input_,
const at::Tensor mean_,
const at::Tensor std_,
const at::Tensor gamma_,
const at::Tensor beta_,
bool train);
std::vector<at::Tensor> Sum_Square_Forward_CUDA(
const at::Tensor input_);
at::Tensor Sum_Square_Backward_CUDA(
const at::Tensor input_,
const at::Tensor gradSum_,
const at::Tensor gradSquare_);
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
namespace {
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
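// Grid-stride loop: each thread processes indices i, i + blockDim.x * gridDim.x, ...
// so any nthreads can be covered by a bounded launch configuration.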
// The number of cuda threads to use. 512 is used for backward compatibility
constexpr int ROI_CUDA_NUM_THREADS = 512;
// The maximum number of blocks to use in the default kernel call.
constexpr int ROI_MAXIMUM_NUM_BLOCKS = 4096;
/**
* @brief Compute the number of blocks needed to run N threads.
*/
inline int ROI_GET_BLOCKS(const int N) {
return std::max(
std::min(
(N + ROI_CUDA_NUM_THREADS - 1) / ROI_CUDA_NUM_THREADS,
ROI_MAXIMUM_NUM_BLOCKS),
// Use at least 1 block, since CUDA does not allow empty block
1);
}
template <typename T>
__device__ T bilinear_interpolate(
const T* bottom_data,
const int height,
const int width,
T y,
T x,
const int index /* index for debug only*/) {
// deal with cases where the sampling point falls outside the feature map
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
return 0;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = static_cast<int>(y);
int x_low = static_cast<int>(x);
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = bottom_data[y_low * width + x_low];
T v2 = bottom_data[y_low * width + x_high];
T v3 = bottom_data[y_high * width + x_low];
T v4 = bottom_data[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
__global__ void RoIAlignForwardKernel(
const int nthreads,
const T* bottom_data,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* bottom_rois,
T* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
// Do not use rounding; this implementation detail is critical
T roi_start_w = offset_bottom_rois[1] * spatial_scale;
T roi_start_h = offset_bottom_rois[2] * spatial_scale;
T roi_end_w = offset_bottom_rois[3] * spatial_scale;
T roi_end_h = offset_bottom_rois[4] * spatial_scale;
// Force malformed ROIs to be 1x1
T roi_width = max(roi_end_w - roi_start_w, (T)1.);
T roi_height = max(roi_end_h - roi_start_h, (T)1.);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T val = bilinear_interpolate(
offset_bottom_data, height, width, y, x, index);
output_val += val;
}
}
output_val /= count;
top_data[index] = output_val;
}
}
template <typename T>
__device__ void bilinear_interpolate_gradient(
const int height,
const int width,
T y,
T x,
T* w1,
T* w2,
T* w3,
T* w4,
int* x_low,
int* x_high,
int* y_low,
int* y_high,
const int /*index*/ /* index for debug only*/) {
// deal with cases where the sampling point falls outside the feature map
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
*w1 = *w2 = *w3 = *w4 = 0.;
*x_low = *x_high = *y_low = *y_high = -1;
return;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
*y_low = static_cast<int>(y);
*x_low = static_cast<int>(x);
if (*y_low >= height - 1) {
*y_high = *y_low = height - 1;
y = (T)*y_low;
} else {
*y_high = *y_low + 1;
}
if (*x_low >= width - 1) {
*x_high = *x_low = width - 1;
x = (T)*x_low;
} else {
*x_high = *x_low + 1;
}
T ly = y - *y_low;
T lx = x - *x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
*w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
return;
}
template <typename T>
inline __device__ T gpu_atomic_add(const T val, T* address);
template <>
inline __device__ float gpu_atomic_add(const float val, float* address) {
return atomicAdd(address, val);
}
template <>
inline __device__ double gpu_atomic_add(const double val, double* address) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull;
unsigned long long int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return val;
}
template <typename T>
__global__ void RoIAlignBackwardKernel(
const int nthreads,
const T* top_diff,
const int num_rois,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
T* bottom_diff,
const T* bottom_rois) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
// Do not use rounding; this implementation detail is critical
T roi_start_w = offset_bottom_rois[1] * spatial_scale;
T roi_start_h = offset_bottom_rois[2] * spatial_scale;
T roi_end_w = offset_bottom_rois[3] * spatial_scale;
T roi_end_h = offset_bottom_rois[4] * spatial_scale;
// Force malformed ROIs to be 1x1
T roi_width = max(roi_end_w - roi_start_w, (T)1.);
T roi_height = max(roi_end_h - roi_start_h, (T)1.);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height,
width,
y,
x,
&w1,
&w2,
&w3,
&w4,
&x_low,
&x_high,
&y_low,
&y_high,
index);
T g1 = top_diff_this_bin * w1 / count;
T g2 = top_diff_this_bin * w2 / count;
T g3 = top_diff_this_bin * w3 / count;
T g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
/*
atomicAdd(
offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
atomicAdd(
offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
atomicAdd(
offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
atomicAdd(
offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
*/
gpu_atomic_add(
static_cast<T>(g1), offset_bottom_diff + y_low * width + x_low);
gpu_atomic_add(
static_cast<T>(g2), offset_bottom_diff + y_low * width + x_high);
gpu_atomic_add(
static_cast<T>(g3), offset_bottom_diff + y_high * width + x_low);
gpu_atomic_add(
static_cast<T>(g4), offset_bottom_diff + y_high * width + x_high);
} // if
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // RoIAlignBackward
} // namespace
at::Tensor ROIAlignForwardCUDA(
const at::Tensor input,
const at::Tensor rois,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio) {
AT_ASSERT(input.is_contiguous());
AT_ASSERT(rois.is_contiguous());
AT_ASSERT(input.ndimension() == 4);
AT_ASSERT(rois.ndimension() == 2);
AT_ASSERT(rois.size(1) == 5);
auto proposals = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
// Output Tensor is (num_rois, C, pooled_height, pooled_width)
auto output = input.type().tensor({proposals, channels, pooled_height, pooled_width});
auto count = output.numel();
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlignForwardCUDA", ([&] {
RoIAlignForwardKernel<scalar_t>
<<<ROI_GET_BLOCKS(count),
ROI_CUDA_NUM_THREADS,
0,
at::globalContext().getCurrentCUDAStream()>>>(
count,
input.data<scalar_t>(),
static_cast<scalar_t>(spatial_scale),
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
rois.data<scalar_t>(),
output.data<scalar_t>());
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return output;
}
at::Tensor ROIAlignBackwardCUDA(
const at::Tensor rois,
const at::Tensor grad_output,
int64_t b_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio) {
AT_ASSERT(rois.is_contiguous());
AT_ASSERT(rois.ndimension() == 2);
AT_ASSERT(rois.size(1) == 5);
auto roi_cols = rois.size(1);
AT_ASSERT(roi_cols == 4 || roi_cols == 5);
// Gradient w.r.t. the input features is (b_size, C, height, width)
auto grad_in = rois.type().tensor({b_size, channels, height, width}).zero_();
auto num_rois = rois.size(0);
auto count = grad_output.numel();
AT_DISPATCH_FLOATING_TYPES(rois.type(), "ROIAlignBackwardCUDA", ([&] {
RoIAlignBackwardKernel<scalar_t>
<<<ROI_GET_BLOCKS(count),
ROI_CUDA_NUM_THREADS,
0,
at::globalContext().getCurrentCUDAStream()>>>(
count,
grad_output.data<scalar_t>(),
num_rois,
static_cast<scalar_t>(spatial_scale),
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_in.data<scalar_t>(),
rois.data<scalar_t>());
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return grad_in;
}
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='enclib_gpu',
ext_modules=[
CUDAExtension('enclib_gpu', [
'operator.cpp',
'encoding_kernel.cu',
'syncbn_kernel.cu',
'roi_align_kernel.cu',
]),
],
cmdclass={
'build_ext': BuildExtension
})
#include <ATen/ATen.h>
#include <vector>
#include "common.h"
#include "device_tensor.h"
namespace {
template <typename DType, typename Acctype, typename DeviceTensor3>
struct GradOp {
__device__ GradOp(Acctype m, const DeviceTensor3 i, const DeviceTensor3 g)
: mean(m), input(i), gradOutput(g) {}
__device__ __forceinline__ Float2<DType, Acctype> operator()(int batch, int plane, int n) {
DType g = gradOutput[batch][plane][n];
DType c = ScalarConvert<Acctype, DType>::to(input[batch][plane][n] - mean);
return Float2<DType, Acctype>(g, g * c);
}
const Acctype mean;
const DeviceTensor3 input;
const DeviceTensor3 gradOutput;
};
template <typename DType, typename Acctype>
struct SumOp {
__device__ SumOp(DeviceTensor<DType, 3> i) : input(i){}
__device__ __forceinline__ Float2<DType, Acctype> operator()(int batch, int plane, int n) {
DType g = input[batch][plane][n];
return Float2<DType, Acctype>(g, g * g);
}
DType mean;
DeviceTensor<DType, 3> input;
};
// Sum across (batch, x/y/z) applying Op() pointwise
template<typename T, typename Op, typename DeviceTensor3>
__device__ T reduce(Op op, DeviceTensor3 tensor, int plane) {
T sum = (T)0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T)0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
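// BatchNorm_Forward_kernel: y = gamma * (x - mean) / std + beta, one block per
// channel.  The backward kernel below writes only the direct input gradient
// (grad_out * gamma / std); the contributions through mean and std are returned
// separately as gradMean / gradStd.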
template <typename DType>
__global__ void BatchNorm_Forward_kernel (
DeviceTensor<DType, 3> output,
DeviceTensor<DType, 3> input,
DeviceTensor<DType, 1> mean,
DeviceTensor<DType, 1> std,
DeviceTensor<DType, 1> gamma,
DeviceTensor<DType, 1> beta) {
int c = blockIdx.x;
/* main operation */
for (int b = 0; b < input.getSize(0); ++b) {
for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
DType inp = input[b][c][x];
output[b][c][x] = gamma[c] * (inp - mean[c]) /
std[c] + beta[c];
}
}
}
template <typename DType>
__global__ void BatchNorm_Backward_kernel (
DeviceTensor<DType, 3> gradoutput,
DeviceTensor<DType, 3> input,
DeviceTensor<DType, 3> gradinput,
DeviceTensor<DType, 1> gradgamma,
DeviceTensor<DType, 1> gradbeta,
DeviceTensor<DType, 1> mean,
DeviceTensor<DType, 1> std,
DeviceTensor<DType, 1> gamma,
DeviceTensor<DType, 1> beta,
DeviceTensor<DType, 1> gradMean,
DeviceTensor<DType, 1> gradStd,
bool train) {
/* declarations of the variables */
/* Get the index and channels */
int c = blockIdx.x;
/* main operation */
GradOp<DType, DType, DeviceTensor<DType, 3>> g(mean[c], input, gradoutput);
Float2<DType, DType> res = reduce<Float2<DType, DType>,
GradOp<DType, DType, DeviceTensor<DType, 3>>,
DeviceTensor<DType, 3>>(g, gradoutput, c);
DType gradOutputSum = res.v1;
DType dotP = res.v2;
DType invstd = DType(1.0) / std[c];
DType gradScale = invstd * gamma[c];
if (train && threadIdx.x == 0) {
gradMean[c] = - gradOutputSum * gamma[c] * invstd;
gradStd[c] = - dotP * gamma[c] * invstd * invstd;
}
if (gradinput.numElements() > 0) {
for (int batch = 0; batch < gradoutput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradoutput.getSize(2); x += blockDim.x) {
gradinput[batch][c][x] = gradoutput[batch][c][x] * gradScale;
}
}
}
if (gradgamma.numElements() > 0) {
if (threadIdx.x == 0) {
gradgamma[c] += dotP * invstd;
}
}
if (gradbeta.numElements() > 0) {
if (threadIdx.x == 0) {
gradbeta[c] += gradOutputSum;
}
}
}
template <typename DType>
__global__ void Sum_Square_Forward_kernel (
DeviceTensor<DType, 3> input,
DeviceTensor<DType, 1> sum,
DeviceTensor<DType, 1> square) {
int c = blockIdx.x;
/* main operation */
SumOp<DType, DType> g(input);
Float2<DType, DType> res = reduce<Float2<DType, DType>,
SumOp<DType, DType>, DeviceTensor<DType, 3>>(g, input, c);
DType xsum = res.v1;
DType xsquare = res.v2;
if (threadIdx.x == 0) {
sum[c] = xsum;
square[c] = xsquare;
}
}
template <typename DType>
__global__ void Sum_Square_Backward_kernel (
DeviceTensor<DType, 3> gradInput,
DeviceTensor<DType, 3> input,
DeviceTensor<DType, 1> gradSum,
DeviceTensor<DType, 1> gradSquare) {
int c = blockIdx.x;
/* main operation */
for (int batch = 0; batch < gradInput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradInput.getSize(2); x += blockDim.x)
{
gradInput[batch][c][x] = gradSum[c] + 2 * gradSquare[c] *
input[batch][c][x];
}
}
}
} // namespace
at::Tensor BatchNorm_Forward_CUDA(
const at::Tensor input_,
const at::Tensor mean_,
const at::Tensor std_,
const at::Tensor gamma_,
const at::Tensor beta_) {
auto output_ = at::zeros_like(input_);
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
dim3 blocks(input_.size(1));
dim3 threads(getNumThreads(input_.size(2)));
AT_DISPATCH_FLOATING_TYPES(input_.type(), "BatchNorm_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> output = devicetensor<scalar_t, 3>(output_);
DeviceTensor<scalar_t, 3> input = devicetensor<scalar_t, 3>(input_);
DeviceTensor<scalar_t, 1> mean = devicetensor<scalar_t, 1>(mean_);
DeviceTensor<scalar_t, 1> std = devicetensor<scalar_t, 1>(std_);
DeviceTensor<scalar_t, 1> gamma = devicetensor<scalar_t, 1>(gamma_);
DeviceTensor<scalar_t, 1> beta = devicetensor<scalar_t, 1>(beta_);
/* kernel function */
BatchNorm_Forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
output, input, mean, std, gamma, beta);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return output_;
}
std::vector<at::Tensor> BatchNorm_Backward_CUDA(
const at::Tensor gradoutput_,
const at::Tensor input_,
const at::Tensor mean_,
const at::Tensor std_,
const at::Tensor gamma_,
const at::Tensor beta_,
bool train) {
/* outputs*/
at::Tensor gradinput_ = at::zeros_like(input_);
at::Tensor gradgamma_ = at::zeros_like(gamma_);
at::Tensor gradbeta_ = at::zeros_like(beta_);
at::Tensor gradMean_ = at::zeros_like(mean_);
at::Tensor gradStd_ = at::zeros_like(std_);
/* cuda utils*/
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
dim3 blocks(input_.size(1));
dim3 threads(getNumThreads(input_.size(2)));
AT_DISPATCH_FLOATING_TYPES(input_.type(), "BatchNorm_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> gradoutput = devicetensor<scalar_t, 3>(gradoutput_);
DeviceTensor<scalar_t, 3> input = devicetensor<scalar_t, 3>(input_);
DeviceTensor<scalar_t, 3> gradinput = devicetensor<scalar_t, 3>(gradinput_);
DeviceTensor<scalar_t, 1> gradgamma = devicetensor<scalar_t, 1>(gradgamma_);
DeviceTensor<scalar_t, 1> gradbeta = devicetensor<scalar_t, 1>(gradbeta_);
DeviceTensor<scalar_t, 1> mean = devicetensor<scalar_t, 1>(mean_);
DeviceTensor<scalar_t, 1> std = devicetensor<scalar_t, 1>(std_);
DeviceTensor<scalar_t, 1> gamma = devicetensor<scalar_t, 1>(gamma_);
DeviceTensor<scalar_t, 1> beta = devicetensor<scalar_t, 1>(beta_);
DeviceTensor<scalar_t, 1> gradMean = devicetensor<scalar_t, 1>(gradMean_);
DeviceTensor<scalar_t, 1> gradStd = devicetensor<scalar_t, 1>(gradStd_);
/* kernel function */
BatchNorm_Backward_kernel<scalar_t>
<<<blocks, threads, 0, stream>>>(
gradoutput, input, gradinput, gradgamma, gradbeta, mean, std,
gamma, beta, gradMean, gradStd, train);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {gradinput_, gradMean_, gradStd_, gradgamma_, gradbeta_};
}
std::vector<at::Tensor> Sum_Square_Forward_CUDA(
const at::Tensor input_) {
/* outputs */
at::Tensor sum_ = input_.type().tensor({input_.size(1)}).zero_();
at::Tensor square_ = input_.type().tensor({input_.size(1)}).zero_();
/* cuda utils*/
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
dim3 blocks(input_.size(1));
dim3 threads(getNumThreads(input_.size(2)));
AT_DISPATCH_FLOATING_TYPES(input_.type(), "Sum_Square_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> input = devicetensor<scalar_t, 3>(input_);
DeviceTensor<scalar_t, 1> sum = devicetensor<scalar_t, 1>(sum_);
DeviceTensor<scalar_t, 1> square = devicetensor<scalar_t, 1>(square_);
/* kernel function */
Sum_Square_Forward_kernel<scalar_t>
<<<blocks, threads, 0, stream>>>(input, sum, square);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {sum_, square_};
}
at::Tensor Sum_Square_Backward_CUDA(
const at::Tensor input_,
const at::Tensor gradSum_,
const at::Tensor gradSquare_) {
/* outputs */
at::Tensor gradInput_ = at::zeros_like(input_);
/* cuda utils*/
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
dim3 blocks(input_.size(1));
dim3 threads(getNumThreads(input_.size(2)));
AT_DISPATCH_FLOATING_TYPES(input_.type(), "Sum_Square_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> gradInput = devicetensor<scalar_t, 3>(gradInput_);
DeviceTensor<scalar_t, 3> input = devicetensor<scalar_t, 3>(input_);
DeviceTensor<scalar_t, 1> gradSum = devicetensor<scalar_t, 1>(gradSum_);
DeviceTensor<scalar_t, 1> gradSquare =devicetensor<scalar_t, 1>(gradSquare_);
/* kernel function */
Sum_Square_Backward_kernel<scalar_t>
<<<blocks, threads, 0, stream>>>(gradInput, input, gradSum, gradSquare);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return gradInput_;
}
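For reference, a sketch of how the per-channel sum / sum-of-squares reductions above can be turned into the mean and std consumed by BatchNorm_Forward_CUDA; the eps value and any cross-device reduction are assumptions, not shown in this file.
import torch

def stats_from_sum_square(xsum, xsquare, count, eps=1e-5):
    # E[x] = sum / count, Var[x] = E[x^2] - E[x]^2
    mean = xsum / count
    var = xsquare / count - mean * mean
    return mean, (var + eps).sqrt()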
import torch
import encoding
# Get the model
model = encoding.models.get_model('fcn_resnet50_ade', pretrained=True).cuda()
model.eval()
# Prepare the image
url = 'https://github.com/zhanghang1989/image-data/blob/master/' + \
'encoding/segmentation/ade20k/ADE_val_00001142.jpg?raw=true'
filename = 'example.jpg'
img = encoding.utils.load_image(
encoding.utils.download(url, filename)).cuda().unsqueeze(0)
# Make prediction
output = model.evaluate(img)
predict = torch.max(output, 1)[1].cpu().numpy() + 1
# Get color palette for visualization
mask = encoding.utils.get_mask_pallete(predict, 'ade20k')
mask.save('output.png')
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import argparse
import torch
class Options():
def __init__(self):
parser = argparse.ArgumentParser(description='PyTorch Segmentation')
# model and dataset
parser.add_argument('--model', type=str, default='encnet',
help='model name (default: encnet)')
parser.add_argument('--backbone', type=str, default='resnet50',
help='backbone name (default: resnet50)')
parser.add_argument('--dataset', type=str, default='ade20k',
help='dataset name (default: ade20k)')
parser.add_argument('--data-folder', type=str,
default=os.path.join(os.environ['HOME'], 'data'),
help='training dataset folder (default: $HOME/data)')
parser.add_argument('--workers', type=int, default=4,
metavar='N', help='dataloader threads')
# training hyper params
parser.add_argument('--aux', action='store_true', default=False,
help='Auxiliary loss')
parser.add_argument('--se-loss', action='store_true', default=False,
help='Semantic Encoding Loss (SE-loss)')
parser.add_argument('--epochs', type=int, default=None, metavar='N',
help='number of epochs to train (default: auto)')
parser.add_argument('--start_epoch', type=int, default=0,
metavar='N', help='start epochs (default:0)')
parser.add_argument('--batch-size', type=int, default=None,
metavar='N', help='input batch size for training (default: auto)')
parser.add_argument('--test-batch-size', type=int, default=16,
metavar='N', help='input batch size for testing (default: 16)')
parser.add_argument('--lr', type=float, default=None, metavar='LR',
help='learning rate (default: auto)')
parser.add_argument('--lr-scheduler', type=str, default='poly',
help='learning rate scheduler (default: poly)')
parser.add_argument('--momentum', type=float, default=0.9,
metavar='M', help='momentum (default: 0.9)')
parser.add_argument('--weight-decay', type=float, default=1e-4,
metavar='M', help='w-decay (default: 1e-4)')
# cuda, seed and logging
parser.add_argument('--no-cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
# checking point
parser.add_argument('--resume', type=str, default=None,
help='put the path to resuming file if needed')
parser.add_argument('--checkname', type=str, default='default',
help='set the checkpoint name')
parser.add_argument('--model-zoo', type=str, default=None,
help='evaluating on model zoo model')
# finetuning pre-trained models
parser.add_argument('--ft', action='store_true', default=False,
help='finetuning on a different dataset')
parser.add_argument('--pre-class', type=int, default=None,
help='number of pre-trained classes (default: None)')
# evaluation option
parser.add_argument('--eval', action='store_true', default=False,
help='evaluating mIoU')
parser.add_argument('--no-val', action='store_true', default=False,
help='skip validation during training')
# test option
parser.add_argument('--test-folder', type=str, default=None,
help='path to test image folder')
# the parser
self.parser = parser
def parse(self):
args = self.parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
# default settings for epochs, batch_size and lr
if args.epochs is None:
            epochs = {
                'pascal_voc': 50,
                'pascal_aug': 50,
                'pcontext': 50,
                'ade20k': 120,
            }
            args.epochs = epochs[args.dataset.lower()]
if args.batch_size is None:
args.batch_size = 4 * torch.cuda.device_count()
if args.lr is None:
lrs = {
'pascal_voc': 0.0001,
'pascal_aug': 0.001,
'pcontext': 0.001,
'ade20k': 0.01,
}
args.lr = lrs[args.dataset.lower()] / 16 * args.batch_size
return args
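# Usage sketch (illustration only; the script name and the 4-GPU setup below are
# assumptions, not part of this file):
#
#   args = Options().parse()   # e.g. `python train.py --dataset ade20k` on 4 GPUs
#   # -> args.epochs == 120, args.batch_size == 4 * 4 == 16,
#   #    args.lr == 0.01 / 16 * 16 == 0.01  (the base lr is scaled by batch size)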
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import numpy as np
from tqdm import tqdm
import torch
from torch.utils import data
import torchvision.transforms as transform
from torch.nn.parallel.scatter_gather import gather
import encoding.utils as utils
from encoding.nn import SegmentationLosses, BatchNorm2d
from encoding.parallel import DataParallelModel, DataParallelCriterion
from encoding.datasets import get_segmentation_dataset, test_batchify_fn
from encoding.models import get_model, get_segmentation_model, MultiEvalModule
from option import Options
torch_ver = torch.__version__[:3]
if torch_ver == '0.3':
from torch.autograd import Variable
def test(args):
# output folder
outdir = 'outdir'
if not os.path.exists(outdir):
os.makedirs(outdir)
# data transforms
input_transform = transform.Compose([
transform.ToTensor(),
transform.Normalize([.485, .456, .406], [.229, .224, .225])])
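    # the mean/std above are the standard ImageNet statistics expected by the
    # ImageNet-pretrained backbone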
# dataset
if args.eval:
testset = get_segmentation_dataset(args.dataset, split='val', mode='testval',
transform=input_transform)
else:
testset = get_segmentation_dataset(args.dataset, split='test', mode='test',
transform=input_transform)
# dataloader
kwargs = {'num_workers': args.workers, 'pin_memory': True} \
if args.cuda else {}
    test_data = data.DataLoader(testset, batch_size=args.test_batch_size,
drop_last=False, shuffle=False,
collate_fn=test_batchify_fn, **kwargs)
    # model
    if args.model_zoo is not None:
        model = get_model(args.model_zoo, pretrained=True)
    else:
        model = get_segmentation_model(args.model, dataset=args.dataset,
                                       backbone=args.backbone, aux=args.aux,
                                       se_loss=args.se_loss, norm_layer=BatchNorm2d)
        # resuming checkpoint (only needed when not using a model-zoo model)
        if args.resume is None or not os.path.isfile(args.resume):
            raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        # strict=False, so that it is compatible with old pytorch saved models
        model.load_state_dict(checkpoint['state_dict'], strict=False)
        print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
print(model)
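    # MultiEvalModule (as used below) runs evaluation across the available GPUs;
    # parallel_forward takes a list of image tensors, one per GPU, as produced by
    # test_batchify_fn, and returns one output per input image.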
evaluator = MultiEvalModule(model, testset.num_class).cuda()
evaluator.eval()
tbar = tqdm(test_data)
def eval_batch(image, dst, evaluator, eval_mode):
if eval_mode:
# evaluation mode on validation set
targets = dst
outputs = evaluator.parallel_forward(image)
batch_inter, batch_union, batch_correct, batch_label = 0, 0, 0, 0
for output, target in zip(outputs, targets):
correct, labeled = utils.batch_pix_accuracy(output.data.cpu(), target)
inter, union = utils.batch_intersection_union(
output.data.cpu(), target, testset.num_class)
batch_correct += correct
batch_label += labeled
batch_inter += inter
batch_union += union
return batch_correct, batch_label, batch_inter, batch_union
else:
# test mode, dump the results
im_paths = dst
outputs = evaluator.parallel_forward(image)
predicts = [torch.max(output, 1)[1].cpu().numpy() + testset.pred_offset
for output in outputs]
for predict, impath in zip(predicts, im_paths):
mask = utils.get_mask_pallete(predict, args.dataset)
outname = os.path.splitext(impath)[0] + '.png'
mask.save(os.path.join(outdir, outname))
            # dummy outputs to stay compatible with eval mode
return 0, 0, 0, 0
total_inter, total_union, total_correct, total_label = \
np.int64(0), np.int64(0), np.int64(0), np.int64(0)
for i, (image, dst) in enumerate(tbar):
if torch_ver == "0.3":
image = Variable(image, volatile=True)
correct, labeled, inter, union = eval_batch(image, dst, evaluator, args.eval)
else:
with torch.no_grad():
correct, labeled, inter, union = eval_batch(image, dst, evaluator, args.eval)
if args.eval:
total_correct += correct
total_label += labeled
total_inter += inter.astype('int64')
total_union += union.astype('int64')
pixAcc = np.float64(1.0) * total_correct / (np.spacing(1, dtype=np.float64) + total_label)
IoU = np.float64(1.0) * total_inter / (np.spacing(1, dtype=np.float64) + total_union)
mIoU = IoU.mean()
tbar.set_description(
'pixAcc: %.4f, mIoU: %.4f' % (pixAcc, mIoU))
if __name__ == "__main__":
args = Options().parse()
torch.manual_seed(args.seed)
args.test_batch_size = torch.cuda.device_count()
test(args)
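# Example invocations (a sketch; the script name test.py and the checkpoint path
# are assumptions):
#
#   # report pixAcc/mIoU on the validation set
#   python test.py --dataset ade20k --model encnet --eval --resume <checkpoint.pth.tar>
#   # dump color-coded predictions for the test split into ./outdir
#   python test.py --dataset ade20k --model encnet --resume <checkpoint.pth.tar>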
import importlib
import torch
import encoding
from option import Options
from torch.autograd import Variable
if __name__ == "__main__":
args = Options().parse()
model = encoding.models.get_segmentation_model(args.model, dataset=args.dataset, aux=args.aux,
se_loss=args.se_loss, norm_layer=torch.nn.BatchNorm2d)
print('Creating the model:')
print(model)
model.cuda()
    x = Variable(torch.randn(4, 3, 480, 480)).cuda()  # random input (avoids uninitialized memory)
with torch.no_grad():
out = model(x)
for y in out:
print(y.size())
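# Expected behaviour (an assumption based on the flags): with --aux/--se-loss
# enabled the forward pass returns a tuple whose first element has shape
# (4, nclass, 480, 480); the remaining elements come from the auxiliary and
# SE-loss heads, so the loop above prints one size per head.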
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import numpy as np
from tqdm import tqdm
import torch
from torch.utils import data
import torchvision.transforms as transform
from torch.nn.parallel.scatter_gather import gather
import encoding.utils as utils
from encoding.nn import SegmentationLosses, BatchNorm2d
from encoding.parallel import DataParallelModel, DataParallelCriterion
from encoding.datasets import get_segmentation_dataset
from encoding.models import get_segmentation_model
from option import Options
torch_ver = torch.__version__[:3]
if torch_ver == '0.3':
from torch.autograd import Variable
class Trainer():
def __init__(self, args):
self.args = args
# data transforms
input_transform = transform.Compose([
transform.ToTensor(),
transform.Normalize([.485, .456, .406], [.229, .224, .225])])
# dataset
trainset = get_segmentation_dataset(args.dataset, split='train',
transform=input_transform)
testset = get_segmentation_dataset(args.dataset, split='val',
transform=input_transform)
# dataloader
kwargs = {'num_workers': args.workers, 'pin_memory': True} \
if args.cuda else {}
self.trainloader = data.DataLoader(trainset, batch_size=args.batch_size,
drop_last=True, shuffle=True, **kwargs)
self.valloader = data.DataLoader(testset, batch_size=args.batch_size,
drop_last=False, shuffle=False, **kwargs)
self.nclass = trainset.num_class
# model
model = get_segmentation_model(args.model, dataset=args.dataset,
                                       backbone=args.backbone, aux=args.aux,
                                       se_loss=args.se_loss, norm_layer=BatchNorm2d)
print(model)
# optimizer using different LR
params_list = [{'params': model.pretrained.parameters(), 'lr': args.lr},]
if hasattr(model, 'head'):
params_list.append({'params': model.head.parameters(), 'lr': args.lr*10})
if hasattr(model, 'auxlayer'):
params_list.append({'params': model.auxlayer.parameters(), 'lr': args.lr*10})
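        # the randomly initialized head and auxiliary layer are trained with
        # 10x the backbone learning rate, a common fine-tuning heuristic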
optimizer = torch.optim.SGD(params_list,
lr=args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)
# resuming checkpoint
if args.resume is not None:
if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
checkpoint = torch.load(args.resume)
args.start_epoch = checkpoint['epoch']
model.load_state_dict(checkpoint['state_dict'])
if not args.ft:
optimizer.load_state_dict(checkpoint['optimizer'])
best_pred = checkpoint['best_pred']
print("=> loaded checkpoint '{}' (epoch {})"
.format(args.resume, checkpoint['epoch']))
# clear start epoch if fine-tuning
if args.ft:
args.start_epoch = 0
# criterions
self.criterion = SegmentationLosses(se_loss=args.se_loss, aux=args.aux, nclass=self.nclass)
self.model, self.optimizer = model, optimizer
# using cuda
if args.cuda:
self.model = DataParallelModel(self.model).cuda()
self.criterion = DataParallelCriterion(self.criterion).cuda()
# lr scheduler
self.scheduler = utils.LR_Scheduler(args, len(self.trainloader))
self.best_pred = 0.0
def training(self, epoch):
train_loss = 0.0
self.model.train()
tbar = tqdm(self.trainloader)
for i, (image, target) in enumerate(tbar):
self.scheduler(self.optimizer, i, epoch, self.best_pred)
self.optimizer.zero_grad()
if torch_ver == "0.3":
image = Variable(image)
target = Variable(target)
outputs = self.model(image)
loss = self.criterion(outputs, target)
loss.backward()
self.optimizer.step()
train_loss += loss.item()
tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
# save checkpoint every epoch
is_best = False
utils.save_checkpoint({
'epoch': epoch + 1,
'state_dict': self.model.module.state_dict(),
'optimizer': self.optimizer.state_dict(),
'best_pred': self.best_pred,
}, self.args, is_best)
def validation(self, epoch):
# Fast test during the training
def eval_batch(image, target):
outputs = self.model(image)
outputs = gather(outputs, 0, dim=0)
pred = outputs[0]
target = target.cuda()
correct, labeled = utils.batch_pix_accuracy(pred.data, target)
inter, union = utils.batch_intersection_union(pred.data, target, self.nclass)
return correct, labeled, inter, union
is_best = False
self.model.eval()
total_inter, total_union, total_correct, total_label = 0, 0, 0, 0
tbar = tqdm(self.valloader, desc='\r')
for i, (image, target) in enumerate(tbar):
if torch_ver == "0.3":
image = Variable(image, volatile=True)
correct, labeled, inter, union = eval_batch(image, target)
else:
with torch.no_grad():
correct, labeled, inter, union = eval_batch(image, target)
total_correct += correct
total_label += labeled
total_inter += inter
total_union += union
pixAcc = 1.0 * total_correct / (np.spacing(1) + total_label)
IoU = 1.0 * total_inter / (np.spacing(1) + total_union)
mIoU = IoU.mean()
tbar.set_description(
'pixAcc: %.3f, mIoU: %.3f' % (pixAcc, mIoU))
new_pred = (pixAcc + mIoU)/2
if new_pred > self.best_pred:
is_best = True
self.best_pred = new_pred
utils.save_checkpoint({
'epoch': epoch + 1,
'state_dict': self.model.module.state_dict(),
'optimizer': self.optimizer.state_dict(),
'best_pred': self.best_pred,
}, self.args, is_best)
if __name__ == "__main__":
args = Options().parse()
torch.manual_seed(args.seed)
trainer = Trainer(args)
print('Starting Epoch:', args.start_epoch)
    print('Total Epochs:', args.epochs)
for epoch in range(args.start_epoch, args.epochs):
trainer.training(epoch)
if not args.no_val:
trainer.validation(epoch)
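# Example invocation (a sketch; the script name train.py and the flag choices are
# assumptions):
#
#   CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset ade20k \
#       --model encnet --backbone resnet50 --aux --se-loss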