Unverified commit f8919197 authored by Hang Zhang, committed by GitHub
parent d4e19553
# ninja log v5
1 1662 1531167904 encoding_cpu.o 6992bc0e0fec6bba
1 1662 1531167904 syncbn_cpu.o f23a0b2f19307f1b
0 2948 1531167905 operator.o 44f004abdc4bd
ninja_required_version = 1.3
cxx = c++
cflags = -DTORCH_EXTENSION_NAME=enclib_cpu -I/anaconda3/lib/python3.6/site-packages/torch/lib/include -I/anaconda3/lib/python3.6/site-packages/torch/lib/include/TH -I/anaconda3/lib/python3.6/site-packages/torch/lib/include/THC -I/anaconda3/include/python3.6m -fPIC -std=c++11
ldflags = -shared -undefined dynamic_lookup
rule compile
command = $cxx -MMD -MF $out.d $cflags -c $in -o $out
depfile = $out.d
deps = gcc
rule link
command = $cxx $ldflags $in -o $out
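# Note on the two rules above: `compile` invokes $cxx with the cflags defined
# earlier and records header dependencies via -MMD/-MF (consumed through
# `deps = gcc`), while `link` combines the listed object files into the shared
# extension module using the ldflags above.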
build operator.o: compile /Users/hzaws/git/PyTorch-Encoding-Layer-/encoding/lib/cpu/operator.cpp
build encoding_cpu.o: compile /Users/hzaws/git/PyTorch-Encoding-Layer-/encoding/lib/cpu/encoding_cpu.cpp
build syncbn_cpu.o: compile /Users/hzaws/git/PyTorch-Encoding-Layer-/encoding/lib/cpu/syncbn_cpu.cpp
build roi_align_cpu.o: compile /Users/hzaws/git/PyTorch-Encoding-Layer-/encoding/lib/cpu/roi_align_cpu.cpp
build nms_cpu.o: compile /Users/hzaws/git/PyTorch-Encoding-Layer-/encoding/lib/cpu/nms_cpu.cpp
build enclib_cpu.so: link operator.o encoding_cpu.o syncbn_cpu.o roi_align_cpu.o nms_cpu.o
default enclib_cpu.so
#include <ATen/ATen.h>
#include <vector>
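// Shape conventions, inferred from the expand() calls below: X is B x N x D
// (descriptors), C is K x D (codewords), A is B x N x K (assignment weights),
// and S has K entries (scaling factors). Aggregate_Forward_CPU therefore
// returns a B x K x D tensor and ScaledL2_Forward_CPU a B x N x K tensor.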
at::Tensor Aggregate_Forward_CPU(
const at::Tensor A,
const at::Tensor X,
const at::Tensor C) {
auto E = (A.unsqueeze(3) * (X.unsqueeze(2).expand({X.size(0), X.size(1),
C.size(0), C.size(1)}) - C.unsqueeze(0).unsqueeze(0))).sum(1);
return E;
}
std::vector<at::Tensor> Aggregate_Backward_CPU(
const at::Tensor GE,
const at::Tensor A,
const at::Tensor X,
const at::Tensor C) {
auto gradA = (GE.unsqueeze(1) * (X.unsqueeze(2).expand({X.size(0), X.size(1),
C.size(0), C.size(1)}) - C.unsqueeze(0).unsqueeze(0))).sum(3);
auto gradX = at::bmm(A, GE);
auto gradC = (-GE * A.sum(1).unsqueeze(2)).sum(0);
return {gradA, gradX, gradC};
}
at::Tensor ScaledL2_Forward_CPU(
const at::Tensor X,
const at::Tensor C,
const at::Tensor S) {
auto SL = S.view({1, 1, C.size(0)}) * (X.unsqueeze(2).expand({X.size(0), X.size(1),
C.size(0), C.size(1)}) - C.unsqueeze(0).unsqueeze(0)).pow(2).sum(3);
return SL;
}
std::vector<at::Tensor> ScaledL2_Backward_CPU(
const at::Tensor GSL,
const at::Tensor X,
const at::Tensor C,
const at::Tensor S,
const at::Tensor SL) {
auto tmp = (2 * GSL * S.view({1, 1, C.size(0)})).unsqueeze(3) *
(X.unsqueeze(2).expand({X.size(0), X.size(1), C.size(0), C.size(1)}) -
C.unsqueeze(0).unsqueeze(0));
auto GX = tmp.sum(2);
auto GC = tmp.sum(0).sum(0);
auto GS = (GSL * (SL / S.view({1, 1, C.size(0)}))).sum(0).sum(0);
return {GX, GC, GS};
}
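// Illustrative sketch (not part of the commit): a standalone shape check for the
// two forward paths above, assuming B x N x K assignments, B x N x D descriptors,
// K x D codewords and K scaling factors. It only needs ATen plus the functions
// defined above; the dimension values are arbitrary.
#include <ATen/ATen.h>
#include <iostream>

at::Tensor Aggregate_Forward_CPU(const at::Tensor A, const at::Tensor X, const at::Tensor C);
at::Tensor ScaledL2_Forward_CPU(const at::Tensor X, const at::Tensor C, const at::Tensor S);

int main() {
  const int64_t B = 2, N = 8, K = 4, D = 16;
  auto A = at::randn({B, N, K});
  auto X = at::randn({B, N, D});
  auto C = at::randn({K, D});
  auto S = at::randn({K});
  std::cout << Aggregate_Forward_CPU(A, X, C).sizes() << std::endl; // expect [2, 4, 16]
  std::cout << ScaledL2_Forward_CPU(X, C, S).sizes() << std::endl;  // expect [2, 8, 4]
  return 0;
}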
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#ifdef _OPENMP
#include <omp.h>
#endif
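// Boxes are assumed to be stored as (x, y, w, h) quadruples: rawInput[idx*4] and
// rawInput[idx*4+1] are the top-left corner, rawInput[idx*4+2] and rawInput[idx*4+3]
// are the width and height, which is why the right/bottom edges below are x + w and y + h.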
template<typename scalar>
inline scalar IoU(scalar* rawInput, int idx_x, int idx_y) {
scalar lr = std::fmin(rawInput[idx_x*4] + rawInput[idx_x*4+2],
rawInput[idx_y*4] + rawInput[idx_y*4+2]);
scalar rl = std::fmax(rawInput[idx_x*4], rawInput[idx_y*4]);
scalar tb = std::fmin(rawInput[idx_x*4+1] + rawInput[idx_x*4+3],
rawInput[idx_y*4+1] + rawInput[idx_y*4+3]);
scalar bt = std::fmax(rawInput[idx_x*4+1], rawInput[idx_y*4+1]);
scalar inter = std::fmax(0, lr-rl)*std::fmax(0, tb-bt);
scalar uni = (rawInput[idx_x*4+2]*rawInput[idx_x*4+3]
+ rawInput[idx_y*4+2]*rawInput[idx_y*4+3] - inter);
return inter/uni;
}
std::vector<at::Tensor> Non_Max_Suppression_CPU(
const at::Tensor& input,
const at::Tensor& scores,
double thresh) {
AT_ASSERT(input.ndimension() == 3);
AT_ASSERT(scores.ndimension() == 2);
AT_ASSERT(input.size(0) == scores.size(0));
AT_ASSERT(input.size(1) == scores.size(1));
AT_ASSERT(input.size(2) == 4);
AT_ASSERT(input.is_contiguous());
AT_ASSERT(scores.is_contiguous());
AT_ASSERT(input.type().scalarType() == at::kFloat || input.type().scalarType() == at::kDouble);
AT_ASSERT(scores.type().scalarType() == at::kFloat || scores.type().scalarType() == at::kDouble);
at::Tensor sorted_inds = std::get<1>(scores.sort(-1, true));
//at::Tensor rawIdx = std::get<1>(scores.sort(-1, true));
auto num_boxes = input.size(1);
auto batch_size = input.size(0);
auto mask = input.type().toScalarType(at::kByte).tensor({batch_size, num_boxes});
mask.fill_(1);
auto *rawMask = mask.data<unsigned char>();
auto *rawIdx = sorted_inds.data<int64_t>();
if (input.type().scalarType() == at::kFloat)
{
auto *rawInput = input.data<float>();
for(int batch=0; batch<batch_size; ++batch)
{
int pos=batch*num_boxes;
while(pos < (1+batch)*num_boxes-1)
{
#pragma omp parallel for
for(int i=pos+1; i<num_boxes*(1+batch); ++i)
{
int idx_x = rawIdx[pos]+num_boxes*batch;
int idx_y = rawIdx[i]+num_boxes*batch;
if (IoU(rawInput, idx_x, idx_y) > thresh)
rawMask[i] = 0;
}
++pos;
while(pos < (1+batch)*num_boxes-1 && (rawMask[pos] == 0))
++pos;
}
}
}
else
{
auto *rawInput = input.data<double>();
for(int batch=0; batch<batch_size; ++batch)
{
int pos=batch*num_boxes;
while(pos < (1+batch)*num_boxes-1)
{
#pragma omp parallel for
for(int i=pos+1; i<num_boxes*(1+batch); ++i)
{
int idx_x = rawIdx[pos]+num_boxes*batch;
int idx_y = rawIdx[i]+num_boxes*batch;
if (IoU(rawInput, idx_x, idx_y) > thresh)
rawMask[i] = 0;
}
++pos;
while(pos < (1+batch)*num_boxes-1 && (rawMask[pos] == 0))
++pos;
}
}
}
//see ./cuda/NonMaxSuppression.cu for comment about return value.
return {mask, sorted_inds};
}
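// Illustrative sketch (not part of the commit) of consuming the {mask, sorted_inds}
// pair returned above for one batch element b: the mask lives in score-sorted order,
// so surviving boxes are gathered through sorted_inds at the positions still set to 1.
#include <ATen/ATen.h>

static at::Tensor keep_surviving_boxes(const at::Tensor& boxes,        // B x N x 4
                                       const at::Tensor& mask,         // B x N, uint8
                                       const at::Tensor& sorted_inds,  // B x N, int64
                                       int64_t b) {
  auto surviving_positions = mask[b].nonzero().squeeze(1);          // positions in sorted order
  auto keep = sorted_inds[b].index_select(0, surviving_positions);  // original box indices
  return boxes[b].index_select(0, keep);
}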
#include "operator.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("roi_align_forward", &ROIAlign_Forward_CPU, "ROI Align forward (CPU)");
m.def("roi_align_backward", &ROIAlign_Backward_CPU, "ROI Align backward (CPU)");
m.def("aggregate_forward", &Aggregate_Forward_CPU, "Aggregate forward (CPU)");
m.def("aggregate_backward", &Aggregate_Backward_CPU, "Aggregate backward (CPU)");
m.def("scaled_l2_forward", &ScaledL2_Forward_CPU, "ScaledL2 forward (CPU)");
m.def("scaled_l2_backward", &ScaledL2_Backward_CPU, "ScaledL2 backward (CPU)");
m.def("batchnorm_forward", &BatchNorm_Forward_CPU, "BatchNorm forward (CPU)");
m.def("batchnorm_backward", &BatchNorm_Backward_CPU, "BatchNorm backward (CPU)");
m.def("sumsquare_forward", &Sum_Square_Forward_CPU, "SumSqu forward (CPU)");
m.def("sumsquare_backward", &Sum_Square_Backward_CPU, "SumSqu backward (CPU)");
m.def("non_max_suppression", &Non_Max_Suppression_CPU, "NMS (CPU)");
}
#include <torch/torch.h>
#include <vector>
at::Tensor ROIAlign_Forward_CPU(
const at::Tensor& input,
const at::Tensor& bottom_rois,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio);
at::Tensor ROIAlign_Backward_CPU(
const at::Tensor& bottom_rois,
const at::Tensor& grad_output,
int64_t b_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio);
at::Tensor Aggregate_Forward_CPU(
const at::Tensor A,
const at::Tensor X,
const at::Tensor C);
std::vector<at::Tensor> Aggregate_Backward_CPU(
const at::Tensor GE,
const at::Tensor A,
const at::Tensor X,
const at::Tensor C);
at::Tensor ScaledL2_Forward_CPU(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_);
std::vector<at::Tensor> ScaledL2_Backward_CPU(
const at::Tensor GSL_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_,
const at::Tensor SL_);
at::Tensor BatchNorm_Forward_CPU(
const at::Tensor input_,
const at::Tensor mean_,
const at::Tensor std_,
const at::Tensor gamma_,
const at::Tensor beta_);
std::vector<at::Tensor> BatchNorm_Backward_CPU(
const at::Tensor gradoutput_,
const at::Tensor input_,
const at::Tensor mean_,
const at::Tensor std_,
const at::Tensor gamma_,
const at::Tensor beta_,
bool train);
std::vector<at::Tensor> Sum_Square_Forward_CPU(
const at::Tensor input_);
at::Tensor Sum_Square_Backward_CPU(
const at::Tensor input_,
const at::Tensor gradSum_,
const at::Tensor gradSquare_);
std::vector<at::Tensor> Non_Max_Suppression_CPU(
const at::Tensor& input,
const at::Tensor& scores,
double thresh);
@@ -377,7 +377,7 @@ void ROIAlignBackwardCompute(
} // ROIAlignBackward
-at::Tensor ROIAlignForwardCPU(
+at::Tensor ROIAlign_Forward_CPU(
const at::Tensor& input,
const at::Tensor& bottom_rois,
int64_t pooled_height,
@@ -409,7 +409,7 @@ at::Tensor ROIAlignForwardCPU(
AT_ASSERT(input.is_contiguous());
AT_ASSERT(bottom_rois.is_contiguous());
-AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlignForwardCPU", ([&] {
+AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_Forward_CPU", ([&] {
ROIAlignForwardCompute<scalar_t>(
output.numel(),
input.data<scalar_t>(),
@@ -429,7 +429,7 @@ at::Tensor ROIAlignForwardCPU(
}
-at::Tensor ROIAlignBackwardCPU(
+at::Tensor ROIAlign_Backward_CPU(
const at::Tensor& bottom_rois,
const at::Tensor& grad_output, // gradient of the output of the layer
int64_t b_size,
@@ -455,7 +455,7 @@ at::Tensor ROIAlignBackwardCPU(
AT_ASSERT(bottom_rois.is_contiguous());
-AT_DISPATCH_FLOATING_TYPES(bottom_rois.type(), "ROIAlignBackwardCPU", ([&] {
+AT_DISPATCH_FLOATING_TYPES(bottom_rois.type(), "ROIAlign_Backward_CPU", ([&] {
ROIAlignBackwardCompute<scalar_t>(
grad_output.numel(),
grad_output.data<scalar_t>(),
...
@@ -5,8 +5,11 @@ setup(
name='enclib_cpu',
ext_modules=[
CppExtension('enclib_cpu', [
-'roi_align.cpp',
+'operator.cpp',
'roi_align_cpu.cpp',
+'encoding_cpu.cpp',
+'syncbn_cpu.cpp',
+'nms_cpu.cpp',
]),
],
cmdclass={
...
#include <ATen/ATen.h>
#include <vector>
at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
if (x.ndimension() == 2) {
return v;
} else {
std::vector<int64_t> broadcast_size = {1, -1};
for (int64_t i = 2; i < x.ndimension(); ++i)
broadcast_size.push_back(1);
return v.view(broadcast_size);
}
}
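// For example, with a 4-D NCHW input a per-channel vector of length C is viewed
// as {1, C, 1, 1}, so the elementwise expressions below broadcast it over the
// batch and spatial dimensions; 2-D (N x C) inputs already broadcast correctly.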
at::Tensor BatchNorm_Forward_CPU(
const at::Tensor input,
const at::Tensor mean,
const at::Tensor std,
const at::Tensor gamma,
const at::Tensor beta) {
auto output = (input - broadcast_to(mean, input)) / broadcast_to(std, input);
output = output * broadcast_to(gamma, input) + broadcast_to(beta, input);
return output;
}
// Not implementing CPU backward for now
std::vector<at::Tensor> BatchNorm_Backward_CPU(
const at::Tensor gradoutput,
const at::Tensor input,
const at::Tensor mean,
const at::Tensor std,
const at::Tensor gamma,
const at::Tensor beta,
bool train) {
/* outputs*/
at::Tensor gradinput = at::zeros_like(input);
at::Tensor gradgamma = at::zeros_like(gamma);
at::Tensor gradbeta = at::zeros_like(beta);
at::Tensor gradMean = at::zeros_like(mean);
at::Tensor gradStd = at::zeros_like(std);
return {gradinput, gradMean, gradStd, gradgamma, gradbeta};
}
std::vector<at::Tensor> Sum_Square_Forward_CPU(
const at::Tensor input) {
/* outputs */
at::Tensor sum = input.type().tensor({input.size(1)}).zero_();
at::Tensor square = input.type().tensor({input.size(1)}).zero_();
return {sum, square};
}
at::Tensor Sum_Square_Backward_CPU(
const at::Tensor input,
const at::Tensor gradSum,
const at::Tensor gradSquare) {
/* outputs */
at::Tensor gradInput = at::zeros_like(input);
return gradInput;
}
@@ -77,3 +77,148 @@ static __device__ __forceinline__ Float2<DType, Acctype> warpSum(Float2<DType, A
return value;
}
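// The four reduce* helpers below share one pattern: each thread strides over a
// single tensor dimension accumulating a partial sum, warpSum() reduces within a
// warp, the per-warp results are staged in shared memory, a single warp reduces
// those again, and the final value broadcast through shared[0] is returned to
// every thread in the block.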
template<typename T, typename Op>
__device__ T reduceD(
Op op, int b, int i, int k, int D) {
T sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
template<typename T, typename Op>
__device__ T reduceN(
Op op, int b, int k, int d, int N) {
T sum = 0;
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(b,x,k,d);
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
template<typename T, typename Op>
__device__ T reduceK(
Op op, int b, int i, int d, int K) {
T sum = 0;
for (int x = threadIdx.x; x < K; x += blockDim.x) {
sum += op(b,i,x,d);
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
template<typename T, typename Op>
__device__ T reduceBN(
Op op,
int k, int d, int B, int N) {
T sum = 0;
for (int batch = 0; batch < B; ++batch) {
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(batch,x,k,d);
}
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
-#include <ATen/ATen.h>
#include <vector>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
#include "common.h"
#include "device_tensor.h"
@@ -64,153 +65,6 @@ struct SL2GradXOp {
DeviceTensor<DType, 1> S;
};
template<typename DType, typename Acctype>
__global__ void Aggregate_Forward_kernel (
DeviceTensor<DType, 3> E,
@@ -225,7 +79,7 @@ __global__ void Aggregate_Forward_kernel (
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
-AggOp<DType, Acctype> g(A,X,C);
+AggOp<DType, Acctype> g(A, X, C);
E[b][k][d] = reduceN<Acctype>(g, b, k, d, N);
}
@@ -244,7 +98,7 @@ __global__ void Aggregate_Backward_kernel (
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
-AggBackOp<DType, Acctype> g(GE,X,C);
+AggBackOp<DType, Acctype> g(GE, X, C);
GA[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}
@@ -312,7 +166,7 @@ at::Tensor Aggregate_Forward_CUDA(
const at::Tensor C_) {
/* Device tensors */
auto E_ = A_.type().tensor({A_.size(0), C_.size(0), C_.size(1)}).zero_();
-cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
+cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(1), C_.size(0), X_.size(0));
dim3 threads(getNumThreads(X_.size(1)));
@@ -338,7 +192,7 @@ std::vector<at::Tensor> Aggregate_Backward_CUDA(
auto gradA_ = at::zeros_like(A_);
auto gradX_ = at::bmm(A_, GE_);
auto gradC_ = (-GE_ * A_.sum(1).unsqueeze(2)).sum(0);
-cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
+cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
@@ -361,7 +215,7 @@ at::Tensor ScaledL2_Forward_CUDA(
const at::Tensor C_,
const at::Tensor S_) {
auto SL_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
-cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
+cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
@@ -388,13 +242,11 @@ std::vector<at::Tensor> ScaledL2_Backward_CUDA(
auto GX_ = at::zeros_like(X_);
auto GC_ = at::zeros_like(C_);
/* kernel function */
-cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
+cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
dim3 threads1(getNumThreads(C_.size(0)));
dim3 blocks2(C_.size(1), C_.size(0));
dim3 threads2(getNumThreads(X_.size(1)));
-//std::vector<int> size{ 1, 1, K};
-//auto GS_ = GSL_ * (SL_ / at::_unsafe_view(S_, size))
auto GS_ = (GSL_ * (SL_ / S_.view({1, 1, C_.size(0)}))).sum(0).sum(0);
AT_DISPATCH_FLOATING_TYPES(X_.type(), "ScaledL2_Backward_CUDA", ([&] {
/* Device tensors */
...
#include <vector>
#include <ATen/ATen.h>
#include <ATen/Functions.h>
#include <ATen/cuda/CUDAContext.h>
#include "common.h"
#include "device_tensor.h"
namespace {
template<typename DType, typename Acctype>
struct KD2Op {
__device__ KD2Op(DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d)
{
DType r = (X[b][i][d] - C[k][d]) / STD[k][d];
return ScalarConvert<DType, Acctype>::to(r * r);
}
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void Encoding_Dist_Forward_kernel (
DeviceTensor<DType, 3> KD,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x;
i = blockIdx.y;
D = X.getSize(2);
/* main operation */
KD2Op<DType, Acctype> g(X, C, STD);
KD[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}
template<typename DType, typename Acctype>
struct EncGradXOp {
__device__ EncGradXOp(
DeviceTensor<DType, 3> gkd,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : GKD(gkd), X(x), C(c), STD(std) {}
// DeviceTensor<DType, 1> s, S(s)
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(
2 * GKD[b][i][k] * (X[b][i][d] - C[k][d]) /
(STD[k][d] * STD[k][d]));
}
DeviceTensor<DType, 3> GKD;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
// DeviceTensor<DType, 1> S;
};
template<typename DType, typename Acctype>
__global__ void Encoding_GradX_kernel (
DeviceTensor<DType, 3> GKD,
DeviceTensor<DType, 3> GX,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
// DeviceTensor<DType, 1> S
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
d = blockIdx.x;
K = C.getSize(0);
/* main operation */
EncGradXOp<DType, Acctype> g(GKD, X, C, STD);
GX[b][i][d] = reduceK<Acctype>(g, b, i, d, K);
}
template<typename DType, typename Acctype>
struct EncGradSTDOp {
__device__ EncGradSTDOp(
DeviceTensor<DType, 3> gkd,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : GKD(gkd), X(x), C(c), STD(std) {}
// DeviceTensor<DType, 1> s, S(s)
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(
-2 * GKD[b][i][k] * (X[b][i][d] - C[k][d]) *
(X[b][i][d] - C[k][d]) / (STD[k][d] * STD[k][d] * STD[k][d]));
}
DeviceTensor<DType, 3> GKD;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
// DeviceTensor<DType, 1> S;
};
template<typename DType, typename Acctype>
__global__ void Encoding_GradCSTD_kernel (
DeviceTensor<DType, 3> GKD,
DeviceTensor<DType, 2> GC,
DeviceTensor<DType, 2> GSTD,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int k, d, B, N;
/* Get the index and channels */
d = blockIdx.x;
k = blockIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* main operation */
EncGradXOp<DType, Acctype> g1(GKD, X, C, STD);
EncGradSTDOp<DType, Acctype> g2(GKD, X, C, STD);
GC[k][d] = -reduceBN<Acctype>(g1, k, d, B, N);
GSTD[k][d] += reduceBN<Acctype>(g2, k, d, B, N);
}
template<typename DType, typename Acctype>
struct EncGradSTDXOp {
__device__ EncGradSTDXOp(
DeviceTensor<DType, 2> gstd,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : GSTD(gstd), X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(
GSTD[k][d] * (X[b][i][d] - C[k][d]) / STD[k][d]);
}
DeviceTensor<DType, 2> GSTD;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void Encoding_GradSTDX_kernel (
DeviceTensor<DType, 2> GSTD,
DeviceTensor<DType, 3> GX,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD,
int N) {
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
d = blockIdx.x;
K = C.getSize(0);
/* main operation */
EncGradSTDXOp<DType, Acctype> g(GSTD, X, C, STD);
GX[b][i][d] += reduceK<Acctype>(g, b, i, d, K) / N;
}
template<typename DType, typename Acctype>
struct AggOpV2 {
__device__ AggOpV2(DeviceTensor<DType, 3> a,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : A(a), X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(A[b][i][k] * (X[b][i][d] - C[k][d]) /
STD[k][d]);
}
DeviceTensor<DType, 3> A;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void AggregateV2_Forward_kernel (
DeviceTensor<DType, 3> E,
DeviceTensor<DType, 3> A,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int b, k, d, N;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
AggOpV2<DType, Acctype> g(A, X, C, STD);
E[b][k][d] = reduceN<Acctype>(g, b, k, d, N);
}
template<typename DType, typename Acctype>
struct AggV2BackOp {
__device__ AggV2BackOp(DeviceTensor<DType, 3> g,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : G(g), X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(G[b][k][d] * (X[b][i][d] - C[k][d]) /
STD[k][d]);
}
DeviceTensor<DType, 3> G;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void AggregateV2_Backward_kernel (
DeviceTensor<DType, 3> GA,
DeviceTensor<DType, 3> GE,
DeviceTensor<DType, 3> A,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
AggV2BackOp<DType, Acctype> g(GE, X, C, STD);
GA[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}
} // namespace
at::Tensor Encoding_Dist_Inference_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
// const at::Tensor S_,
// X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
auto KD_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
// E(x), E(x^2)
int N = X_.size(0) * X_.size(1);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
// calculate the kernel distance
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Inference_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> KD = devicetensor<scalar_t, 3>(KD_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
/* kernel function */
Encoding_Dist_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (KD, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return KD_;
}
std::vector<at::Tensor> Encoding_Dist_Inference_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
auto GX_ = at::zeros_like(X_);
auto GC_ = at::zeros_like(C_);
auto GSTD_ = at::zeros_like(STD_);
/* kernel function */
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
dim3 threads1(getNumThreads(C_.size(0)));
dim3 blocks2(C_.size(1), C_.size(0));
dim3 threads2(getNumThreads(X_.size(1)));
int N = X_.size(0) * X_.size(1);
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GKD = devicetensor<scalar_t, 3>(GKD_);
DeviceTensor<scalar_t, 2> GSTD = devicetensor<scalar_t, 2>(GSTD_);
DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
DeviceTensor<scalar_t, 2> GC = devicetensor<scalar_t, 2>(GC_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
Encoding_GradX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GKD, GX, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
Encoding_GradCSTD_kernel<scalar_t, scalar_t>
<<<blocks2, threads2, 0, stream>>> (GKD, GC, GSTD, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}));
return {GX_, GC_, GSTD_};
}
std::vector<at::Tensor> Encoding_Dist_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
double eps) {
// const at::Tensor S_,
// X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
auto KD_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
// E(x), E(x^2)
int N = X_.size(0) * X_.size(1);
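// SVar_[k][d] accumulates sum over all B*N samples of (X[b][i][d] - C[k][d])^2,
// written in the expanded form sum(x^2) - 2*c*sum(x) + N*c^2; STD_ below is then
// the per-(k, d) standard deviation sqrt(SVar_/N + eps) fed to the distance kernel.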
auto SVar_ = (X_.pow(2).sum(0).sum(0).view({1, X_.size(2)}) -
2 * C_ * X_.sum(0).sum(0).view({1, X_.size(2)})).expand_as(C_) +
C_.pow(2) * N;
auto STD_ = at::sqrt(SVar_ / N + eps);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
// calculate the kernel distance
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> KD = devicetensor<scalar_t, 3>(KD_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
/* kernel function */
Encoding_Dist_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (KD, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {KD_, STD_, SVar_ / (N - 1)};
}
std::vector<at::Tensor> Encoding_Dist_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor GSTD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
auto GX_ = at::zeros_like(X_);
auto GC_ = at::zeros_like(C_);
auto GSTD2_ = GSTD_.clone();
/* kernel function */
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
dim3 threads1(getNumThreads(C_.size(0)));
dim3 blocks2(C_.size(1), C_.size(0));
dim3 threads2(getNumThreads(X_.size(1)));
int N = X_.size(0) * X_.size(1);
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GKD = devicetensor<scalar_t, 3>(GKD_);
DeviceTensor<scalar_t, 2> GSTD = devicetensor<scalar_t, 2>(GSTD2_);
DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
DeviceTensor<scalar_t, 2> GC = devicetensor<scalar_t, 2>(GC_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
Encoding_GradX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GKD, GX, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
Encoding_GradCSTD_kernel<scalar_t, scalar_t>
<<<blocks2, threads2, 0, stream>>> (GKD, GC, GSTD, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
Encoding_GradSTDX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GSTD, GX, X, C, STD, N);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}));
// d_sigma/d_c
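// With std^2 = mean_{b,i}((x - c)^2) + eps, d(std)/d(c) = (c - mean(x)) / std,
// so the chain-rule term below subtracts GSTD2_ * (mean(x) - c) / std from GC_.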
GC_ = GC_ - GSTD2_ * (X_.mean(0).mean(0) - C_) / STD_;
return {GX_, GC_};
}
at::Tensor AggregateV2_Forward_CUDA(
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
/* Device tensors */
auto E_ = A_.type().tensor({A_.size(0), C_.size(0), C_.size(1)}).zero_();
// auto IS_ = 1.0f / (S_ + eps).sqrt();
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(1), C_.size(0), X_.size(0));
dim3 threads(getNumThreads(X_.size(1)));
AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Forward_CUDA", ([&] {
DeviceTensor<scalar_t, 3> E = devicetensor<scalar_t, 3>(E_);
DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
/* kernel function */
AggregateV2_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>>(E, A, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return E_;
}
std::vector<at::Tensor> AggregateV2_Backward_CUDA(
const at::Tensor GE_,
const at::Tensor E_,
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
auto gradA_ = at::zeros_like(A_);
auto gradX_ = at::bmm(A_ , (GE_ / STD_.unsqueeze(0)));
auto gradC_ = -(A_.sum(1).unsqueeze(2) * GE_ / STD_.unsqueeze(0)).sum(0);
auto gradSTD_ = -(GE_ * E_).sum(0) / STD_;
// auto gradS_ = -0.5 * (GE_ * E_).sum(2).sum(0) / S_;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GA = devicetensor<scalar_t, 3>(gradA_);
DeviceTensor<scalar_t, 3> GE = devicetensor<scalar_t, 3>(GE_);
DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
AggregateV2_Backward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (GA, GE, A, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {gradA_, gradX_, gradC_, gradSTD_};
}
#include <ATen/ATen.h>
#include "ATen/NativeFunctions.h"
#include <ATen/cuda/CUDAContext.h>
template <typename scalar>
__device__ __forceinline__ scalar fmin(scalar a, scalar b) {
return a > b ? b : a;
}
template <typename scalar>
__device__ __forceinline__ scalar fmax(scalar a, scalar b) {
return a > b ? a : b;
}
template <typename scalar>
__device__ __forceinline__ scalar IoU(const scalar* box_x, const scalar* box_y) {
// Calculate IoU between the boxes.
scalar rightmost_l = fmax(box_x[0], box_y[0]);
scalar leftmost_r = fmin(box_x[0] + box_x[2], box_y[0] + box_y[2]);
scalar delta_x = fmax((scalar)0., leftmost_r - rightmost_l);
scalar bottommost_tp = fmax(box_x[1], box_y[1]);
scalar topmost_b = fmin(box_x[1] + box_x[3], box_y[1] + box_y[3]);
scalar delta_y = fmax((scalar)0., topmost_b - bottommost_tp);
scalar uni = box_x[2] * box_x[3] + box_y[2] * box_y[3];
return delta_x * delta_y / (uni - delta_x * delta_y);
}
template <typename scalar>
__global__ void nms_kernel(unsigned char* mask,
const scalar* boxes,
const int64_t* inds,
const int64_t num_boxes,
double thresh) {
//A pretty straightforward implementation, analogous to the standard serial
//version but with the IoUs computed and mask updated in parallel. We access
//the box data through an array of sorted indices rather than physically
//sorting it: unless one has an inordinate number of boxes (O(10^5), whereas
//for example in the faster rcnn paper they feed 6000 per batch) the
//data will fit in L2 so sorting it won't actually reduce the number of
//messy reads from global.
int col = 0;
while(col < num_boxes-1)
{
for(int i = threadIdx.x; i < num_boxes-1; i+=blockDim.x)
if(i >= col)
{
scalar iou = IoU(&boxes[4*inds[i+1+num_boxes*blockIdx.x] + 4*num_boxes*blockIdx.x],
&boxes[4*inds[col+num_boxes*blockIdx.x] + 4*num_boxes*blockIdx.x]);
mask[i+1+blockIdx.x*num_boxes] *= (iou>thresh) ? 0 : 1;
}
__syncthreads();
++col;
while((col < num_boxes - 1) && (mask[col+blockIdx.x*num_boxes]==0))
++col;
}
}
std::vector<at::Tensor> Non_Max_Suppression_CUDA(
const at::Tensor& input,
const at::Tensor& scores,
double thresh) {
AT_ASSERT(input.ndimension() == 3);
AT_ASSERT(scores.ndimension() == 2);
AT_ASSERT(input.size(0) == scores.size(0));
AT_ASSERT(input.size(1) == scores.size(1));
AT_ASSERT(input.size(2) == 4);
AT_ASSERT(input.is_contiguous());
AT_ASSERT(scores.is_contiguous());
AT_ASSERT(input.type().scalarType() == at::kFloat || input.type().scalarType() == at::kDouble);
AT_ASSERT(scores.type().scalarType() == at::kFloat || scores.type().scalarType() == at::kDouble);
auto num_boxes = input.size(1);
auto batch_size = input.size(0);
auto mask = input.type().toScalarType(at::kByte).tensor({batch_size, num_boxes});
mask.fill_(1);
//need the indices of the boxes sorted by score.
at::Tensor sorted_inds = std::get<1>(scores.sort(-1, true));
dim3 mask_block(512); //would be nice to have 1024 here for gpus that support it,
//but not sure how to do this cleanly without calling
//cudaGetDeviceProperties in the function body...
dim3 mask_grid(batch_size);
if(input.type().scalarType() == at::kFloat)
{
nms_kernel<<<mask_grid, mask_block, 0, at::cuda::getCurrentCUDAStream()>>>(
mask.data<unsigned char>(),
input.data<float>(),
sorted_inds.data<int64_t>(),
num_boxes,
thresh);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}
else
{
nms_kernel<<<mask_grid, mask_block, 0, at::cuda::getCurrentCUDAStream()>>>(
mask.data<unsigned char>(),
input.data<double>(),
sorted_inds.data<int64_t>(),
num_boxes,
thresh);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}
//It's not entirely clear what the best thing to return is here. The algorithm will
//produce a different number of boxes for each batch, so there is no obvious way of
//returning the surviving boxes/indices as a tensor. Returning a mask on the
//sorted boxes together with the sorted indices seems reasonable; that way, the user
//can easily take the N highest-scoring surviving boxes to form a tensor if they wish.
return {mask, sorted_inds};
}
#include "operator.h" #include "operator.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("roi_align_forward", &ROIAlignForwardCUDA, "ROI Align forward (CUDA)"); m.def("roi_align_forward", &ROIAlign_Forward_CUDA, "ROI Align forward (CUDA)");
m.def("roi_align_backward", &ROIAlignBackwardCUDA, "ROI Align backward (CUDA)"); m.def("roi_align_backward", &ROIAlign_Backward_CUDA, "ROI Align backward (CUDA)");
m.def("non_max_suppression", &Non_Max_Suppression_CUDA, "NMS (CUDA)");
m.def("aggregate_forward", &Aggregate_Forward_CUDA, "Aggregate forward (CUDA)"); m.def("aggregate_forward", &Aggregate_Forward_CUDA, "Aggregate forward (CUDA)");
m.def("aggregate_backward", &Aggregate_Backward_CUDA, "Aggregate backward (CUDA)"); m.def("aggregate_backward", &Aggregate_Backward_CUDA, "Aggregate backward (CUDA)");
m.def("scaled_l2_forward", &ScaledL2_Forward_CUDA, "ScaledL2 forward (CUDA)"); m.def("scaled_l2_forward", &ScaledL2_Forward_CUDA, "ScaledL2 forward (CUDA)");
...@@ -11,4 +12,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ...@@ -11,4 +12,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("batchnorm_backward", &BatchNorm_Backward_CUDA, "BatchNorm backward (CUDA)"); m.def("batchnorm_backward", &BatchNorm_Backward_CUDA, "BatchNorm backward (CUDA)");
m.def("sumsquare_forward", &Sum_Square_Forward_CUDA, "SumSqu forward (CUDA)"); m.def("sumsquare_forward", &Sum_Square_Forward_CUDA, "SumSqu forward (CUDA)");
m.def("sumsquare_backward", &Sum_Square_Backward_CUDA, "SumSqu backward (CUDA)"); m.def("sumsquare_backward", &Sum_Square_Backward_CUDA, "SumSqu backward (CUDA)");
m.def("encoding_dist_forward", &Encoding_Dist_Forward_CUDA, "EncDist forward (CUDA)");
m.def("encoding_dist_backward", &Encoding_Dist_Backward_CUDA, "Assign backward (CUDA)");
m.def("encoding_dist_inference_forward", &Encoding_Dist_Inference_Forward_CUDA,
"EncDist Inference forward (CUDA)");
m.def("encoding_dist_inference_backward", &Encoding_Dist_Inference_Backward_CUDA,
"Assign Inference backward (CUDA)");
m.def("aggregatev2_forward", &AggregateV2_Forward_CUDA, "AggregateV2 forward (CUDA)");
m.def("aggregatev2_backward", &AggregateV2_Backward_CUDA, "AggregateV2 backward (CUDA)");
} }
#include <torch/torch.h> #include <torch/torch.h>
#include <vector> #include <vector>
at::Tensor ROIAlignForwardCUDA( at::Tensor ROIAlign_Forward_CUDA(
const at::Tensor input, const at::Tensor input,
const at::Tensor rois, const at::Tensor rois,
int64_t pooled_height, int64_t pooled_height,
...@@ -9,7 +9,7 @@ at::Tensor ROIAlignForwardCUDA( ...@@ -9,7 +9,7 @@ at::Tensor ROIAlignForwardCUDA(
double spatial_scale, double spatial_scale,
int64_t sample_ratio); int64_t sample_ratio);
at::Tensor ROIAlignBackwardCUDA( at::Tensor ROIAlign_Backward_CUDA(
const at::Tensor rois, const at::Tensor rois,
const at::Tensor grad_output, const at::Tensor grad_output,
int64_t b_size, int64_t b_size,
...@@ -21,6 +21,11 @@ at::Tensor ROIAlignBackwardCUDA( ...@@ -21,6 +21,11 @@ at::Tensor ROIAlignBackwardCUDA(
double spatial_scale, double spatial_scale,
int64_t sampling_ratio); int64_t sampling_ratio);
std::vector<at::Tensor> Non_Max_Suppression_CUDA(
const at::Tensor& input,
const at::Tensor& scores,
double thresh);
at::Tensor Aggregate_Forward_CUDA( at::Tensor Aggregate_Forward_CUDA(
const at::Tensor A_, const at::Tensor A_,
const at::Tensor X_, const at::Tensor X_,
...@@ -67,3 +72,42 @@ at::Tensor Sum_Square_Backward_CUDA( ...@@ -67,3 +72,42 @@ at::Tensor Sum_Square_Backward_CUDA(
const at::Tensor input_, const at::Tensor input_,
const at::Tensor gradSum_, const at::Tensor gradSum_,
const at::Tensor gradSquare_); const at::Tensor gradSquare_);
at::Tensor Encoding_Dist_Inference_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
std::vector<at::Tensor> Encoding_Dist_Inference_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
std::vector<at::Tensor> Encoding_Dist_Forward_CUDA(
const at::Tensor X,
const at::Tensor C,
double eps);
std::vector<at::Tensor> Encoding_Dist_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor GSTD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
at::Tensor AggregateV2_Forward_CUDA(
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
std::vector<at::Tensor> AggregateV2_Backward_CUDA(
const at::Tensor GE_,
const at::Tensor E_,
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
@@ -346,7 +347,7 @@ __global__ void RoIAlignBackwardKernel(
} // namespace
-at::Tensor ROIAlignForwardCUDA(
+at::Tensor ROIAlign_Forward_CUDA(
const at::Tensor input,
const at::Tensor rois,
int64_t pooled_height,
@@ -370,12 +371,12 @@ at::Tensor ROIAlignForwardCUDA(
auto count = output.numel();
-AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlignForwardCUDA", ([&] {
+AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_Forward_CUDA", ([&] {
RoIAlignForwardKernel<scalar_t>
<<<ROI_GET_BLOCKS(count),
ROI_CUDA_NUM_THREADS,
0,
-at::globalContext().getCurrentCUDAStream()>>>(
+at::cuda::getCurrentCUDAStream()>>>(
count,
input.data<scalar_t>(),
static_cast<scalar_t>(spatial_scale),
@@ -392,7 +393,7 @@ at::Tensor ROIAlignForwardCUDA(
return output;
}
-at::Tensor ROIAlignBackwardCUDA(
+at::Tensor ROIAlign_Backward_CUDA(
const at::Tensor rois,
const at::Tensor grad_output,
int64_t b_size,
@@ -417,12 +418,12 @@ at::Tensor ROIAlignBackwardCUDA(
auto num_rois = rois.size(0);
auto count = grad_output.numel();
-AT_DISPATCH_FLOATING_TYPES(rois.type(), "ROIAlignBackwardCUDA", ([&] {
+AT_DISPATCH_FLOATING_TYPES(rois.type(), "ROIAlign_Backward_CUDA", ([&] {
RoIAlignBackwardKernel<scalar_t>
<<<ROI_GET_BLOCKS(count),
ROI_CUDA_NUM_THREADS,
0,
-at::globalContext().getCurrentCUDAStream()>>>(
+at::cuda::getCurrentCUDAStream()>>>(
count,
grad_output.data<scalar_t>(),
num_rois,
...