v0.4.2

2d21747a · Zhang · 7e19143c · 7e19143c · 7e19143c · 7e19143c
Commit 2d21747a authored Jun 04, 2018 by Zhang
20 changed files
--- a/encoding/kernel/generic/syncbn_kernel.h
+++ b/encoding/kernel/generic/syncbn_kernel.h
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- * Created by: Hang Zhang
- * ECE Department, Rutgers University
- * Email: zhang.hang@rutgers.edu
- * Copyright (c) 2017
- *
- * This source code is licensed under the MIT-style license found in the
- * LICENSE file in the root directory of this source tree 
- *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- */
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "generic/syncbn_kernel.h"
-#else
-
-void Encoding_(BatchNorm_Forward)(THCState *state, 
-    THCTensor *output_, THCTensor *input_, 
-    THCTensor *mean_, THCTensor *invstd_,
-    THCTensor *gamma_, THCTensor *beta_);
-
-void Encoding_(BatchNorm_Backward)(THCState *state, 
-    THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_, 
-    THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_, 
-    THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_, 
-    THCTensor *gradMean_, THCTensor *gradStd_, int train);
-
-void Encoding_(Sum_Square_Forward)(THCState *state, 
-    THCTensor *input_, THCTensor *sum_, THCTensor *square_);
-
-void Encoding_(Sum_Square_Backward)(THCState *state, 
-    THCTensor *gradInput, THCTensor *input_, 
-    THCTensor *gradSum_, THCTensor *gradSquare_);
-
-#endif
--- a/encoding/kernel/include/README.rst
+++ b/encoding/kernel/include/README.rst
-These headers are copied from the PyTorch library to make compilation easier for users, in response to many questions and requests.
--- a/encoding/kernel/include/THCDeviceTensor-inl.cuh
+++ b/encoding/kernel/include/THCDeviceTensor-inl.cuh
-#include <assert.h>
-
-namespace detail {
-
-/// Copies the first `N` elements of `from` into `to`, one by one.
-template <typename T, int N>
-__host__ __device__ void copy(T to[N], T from[N]) {
-  int idx = 0;
-  while (idx < N) {
-    to[idx] = from[idx];
-    ++idx;
-  }
-}
-
-} // namespace detail
-
-/// Default constructor: null data pointer; every dimension gets size 0
-/// and stride 1.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor()
-    : data_(NULL) {
-  thc_static_assert(Dim > 0);
-
-  int d = Dim;
-  while (d-- > 0) {
-    stride_[d] = (IndexT) 1;
-    size_[d] = 0;
-  }
-}
-
-/// Sizes-only constructor. Copies `sizes` and derives packed strides:
-/// the innermost dimension gets stride 1 and every outer stride is the
-/// product of the next-inner stride and the next-inner size.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::
-#ifdef _MSC_VER
-THCDeviceTensor(DataPtrType data, const IndexT (&sizes)[Dim])
-#else
-THCDeviceTensor(DataPtrType data, const IndexT sizes[Dim])
-#endif
-    : data_(data) {
-  thc_static_assert(Dim > 0);
-
-  // Single pass from the innermost dimension outwards.
-  for (int d = Dim - 1; d >= 0; --d) {
-    size_[d] = sizes[d];
-    if (d == Dim - 1) {
-      stride_[d] = (IndexT) 1;
-    } else {
-      stride_[d] = stride_[d + 1] * sizes[d + 1];
-    }
-  }
-}
-
-/// Constructor taking explicit size and stride arrays; both are copied
-/// verbatim without any validation.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor(
-#ifdef _MSC_VER
-  DataPtrType data, const IndexT (&sizes)[Dim], const IndexT (&strides)[Dim])
-#else
-  DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
-#endif
-    : data_(data) {
-  thc_static_assert(Dim > 0);
-
-  int d = 0;
-  while (d < Dim) {
-    stride_[d] = strides[d];
-    size_[d] = sizes[d];
-    ++d;
-  }
-}
-
-/// Returns true only if `rhs` has the same rank as this tensor and every
-/// dimension matches in both size and stride.
-///
-/// `rhs` is read through its public `sizes()` / `strides()` accessors:
-/// when `OtherDim != Dim`, `rhs` is an object of a different class
-/// template specialization, so its private `size_` / `stride_` members
-/// are not accessible from here.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int OtherDim>
-__host__ __device__ bool
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isSameSizeAndStride(
-  const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const {
-  // Different rank can never match; this also guarantees the loop below
-  // only indexes `rhs` arrays that hold `Dim` entries.
-  if (Dim != OtherDim) {
-    return false;
-  }
-
-  const IndexT* rhsSizes = rhs.sizes();
-  const IndexT* rhsStrides = rhs.strides();
-
-  for (int i = 0; i < Dim; ++i) {
-    if (size_[i] != rhsSizes[i]) {
-      return false;
-    }
-
-    if (stride_[i] != rhsStrides[i]) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-/// Reinterprets the data pointer as `U*` and returns a tensor with the
-/// same sizes and strides. `U` must have the same size as `T`.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <typename U>
-__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() {
-  thc_static_assert(sizeof(U) == sizeof(T));
-
-  typedef THCDeviceTensor<U, Dim, IndexT, PtrTraits> CastTensor;
-  U* reinterpreted = reinterpret_cast<U*>(data_);
-  return CastTensor(reinterpreted, size_, stride_);
-}
-
-/// Const overload of `cast`: same reinterpretation of the data pointer,
-/// returned as a const tensor. `U` must have the same size as `T`.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <typename U>
-__host__ __device__ const THCDeviceTensor<U, Dim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const {
-  thc_static_assert(sizeof(U) == sizeof(T));
-
-  typedef THCDeviceTensor<U, Dim, IndexT, PtrTraits> CastTensor;
-  U* reinterpreted = reinterpret_cast<U*>(data_);
-  return CastTensor(reinterpreted, size_, stride_);
-}
-
-/// Returns the product of the sizes of all `Dim` dimensions.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ ptrdiff_t
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const {
-  ptrdiff_t total = 1;
-
-  int d = 0;
-  while (d < Dim) {
-    total *= getSize(d);
-    ++d;
-  }
-
-  return total;
-}
-
-/// Contiguity over the whole tensor is contiguity of the range [0, Dim).
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ bool
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguous() const {
-  const int firstDim = 0;
-  const int pastLastDim = Dim;
-  return isContiguousRange(firstDim, pastLastDim);
-}
-
-/// Per-dimension consistency check.
-///  - `i == 0`: stride and size must both be positive.
-///  - `0 < i < Dim`: stride must be positive and
-///    `stride(i - 1) / stride(i) >= size(i)`.
-///  - any other `i`: false.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ bool
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized(int i) const {
-  if (i == 0) {
-    return getStride(0) > 0 && getSize(0) > 0;
-  }
-
-  // Out-of-range indices are never consistent.
-  if (i < 0 || i >= Dim) {
-    return false;
-  }
-
-  if (getStride(i) <= 0) {
-    return false;
-  }
-
-  return (getStride(i - 1) / getStride(i)) >= getSize(i);
-}
-
-/// True when every dimension passes the per-dimension
-/// `isConsistentlySized(i)` check.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ bool
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized() const {
-  int d = 0;
-  // Stop at the first dimension that fails the check.
-  while (d < Dim && isConsistentlySized(d)) {
-    ++d;
-  }
-
-  return d == Dim;
-}
-
-/// Checks that dimensions [first, last) are laid out without padding.
-/// Walks from `last - 1` down to `first`, requiring each dimension's
-/// stride to equal the extent covered by the dimensions inside it.
-/// Dimensions of size 1 are skipped.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ bool
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguousRange(
-  int first, int last) const {
-
-  // Extent spanned by dimension `last`, or 1 if the range ends at the
-  // innermost dimension.
-  int64_t expectedStride = 1;
-  if (last < Dim) {
-    expectedStride = getStride(last) * getSize(last);
-  }
-
-  for (int d = last - 1; d >= first; --d) {
-    if (getSize(d) == (IndexT) 1) {
-      continue;
-    }
-
-    if (getStride(d) != expectedStride) {
-      return false;
-    }
-
-    expectedStride *= getSize(d);
-  }
-
-  return true;
-}
-
-/// Returns a tensor over the same data with dimensions `dim1` and `dim2`
-/// exchanged. No elements are moved: only the size and stride entries of
-/// the two dimensions are swapped.
-///
-/// Both indices must lie in `[0, Dim)`. In device code a violation trips
-/// an `assert`; in host code it is reported through `THError`.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::transpose(int dim1,
-                                                      int dim2) const {
-#ifdef __CUDA_ARCH__
-  // Device code
-  assert(dim1 >= 0 && dim1 < Dim);
-  // Lower bound is checked on dim2 itself (it was previously re-checking
-  // dim1, letting a negative dim2 through).
-  assert(dim2 >= 0 && dim2 < Dim);
-#else
-  // Host code
-  if (dim1 < 0 || dim1 >= Dim) {
-    THError("dim1 out of bounds");
-  }
-
-  if (dim2 < 0 || dim2 >= Dim) {
-    THError("dim2 out of bounds");
-  }
-#endif
-
-  IndexT newSize[Dim];
-  IndexT newStride[Dim];
-
-  for (int i = 0; i < Dim; ++i) {
-    newSize[i] = size_[i];
-    newStride[i] = stride_[i];
-  }
-
-  // Swap the size entries of the two dimensions.
-  IndexT tmp = newSize[dim1];
-  newSize[dim1] = newSize[dim2];
-  newSize[dim2] = tmp;
-
-  // Swap the stride entries of the two dimensions.
-  tmp = newStride[dim1];
-  newStride[dim1] = newStride[dim2];
-  newStride[dim2] = tmp;
-
-  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(data_, newSize, newStride);
-}
-
-/// Produces a `NewDim`-rank tensor (NewDim > Dim) by prepending
-/// `NewDim - Dim` dimensions of size 1. Each prepended dimension gets
-/// stride `size(0) * stride(0)`; the existing dimensions follow
-/// unchanged.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int NewDim>
-__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastOuter() {
-  // Can only create tensors of greater dimension
-  thc_static_assert(NewDim > Dim);
-
-  IndexT outSize[NewDim];
-  IndexT outStride[NewDim];
-
-  const int added = NewDim - Dim;
-
-  // The newly added leading dimensions
-  for (int d = 0; d < added; ++d) {
-    outSize[d] = (IndexT) 1;
-    outStride[d] = size_[0] * stride_[0];
-  }
-
-  // The original dimensions, shifted right by `added`
-  for (int d = added; d < NewDim; ++d) {
-    outSize[d] = size_[d - added];
-    outStride[d] = stride_[d - added];
-  }
-
-  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
-    data_, outSize, outStride);
-}
-
-/// Produces a `NewDim`-rank tensor (NewDim > Dim) by appending
-/// `NewDim - Dim` trailing dimensions of size 1 and stride 1; the
-/// existing dimensions keep their position, size and stride.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int NewDim>
-__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastInner() {
-  // Can only create tensors of greater dimension
-  thc_static_assert(NewDim > Dim);
-
-  IndexT outSize[NewDim];
-  IndexT outStride[NewDim];
-
-  // Original dimensions are copied as they are
-  for (int d = 0; d < Dim; ++d) {
-    outSize[d] = size_[d];
-    outStride[d] = stride_[d];
-  }
-
-  // Appended trailing dimensions
-  for (int d = Dim; d < NewDim; ++d) {
-    outSize[d] = (IndexT) 1;
-    outStride[d] = (IndexT) 1;
-  }
-
-  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
-    data_, outSize, outStride);
-}
-
-/// Produces a `NewDim`-rank tensor (NewDim < Dim) by folding the
-/// leading `Dim - NewDim` dimensions into the first remaining one.
-/// The folded dimensions must be contiguous: device code asserts,
-/// host code raises through `THError`.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int NewDim>
-__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastOuter() {
-  // Can only create tensors of lesser dimension
-  thc_static_assert(NewDim < Dim);
-
-  // Number of leading dimensions that get folded away.
-  const int folded = Dim - NewDim;
-
-  // Folding dimensions that carry padding would expose garbage
-  // elements, so the folded range has to be contiguous.
-  bool cont = isContiguousRange(0, folded);
-#ifdef __CUDA_ARCH__
-  // Device code
-  assert(cont);
-#else
-  // Host code
-  if (!cont) {
-    THError("Can only downcast contiguous tensors");
-  }
-#endif
-
-  IndexT outSize[NewDim];
-  IndexT outStride[NewDim];
-
-  // Element count of the folded dimensions
-  IndexT foldedSize = 1;
-  for (int d = 0; d < folded; ++d) {
-    foldedSize *= getSize(d);
-  }
-
-  // First surviving dimension absorbs the folded ones
-  outSize[0] = foldedSize * getSize(folded);
-  outStride[0] = getStride(folded);
-
-  // Remaining dimensions carry over unchanged
-  for (int d = folded + 1; d < Dim; ++d) {
-    outSize[d - folded] = getSize(d);
-    outStride[d - folded] = getStride(d);
-  }
-
-  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
-    data_, outSize, outStride);
-}
-
-/// Produces a `NewDim`-rank tensor (NewDim < Dim) by folding the
-/// trailing `Dim - NewDim` dimensions into the last remaining one.
-/// The folded dimensions must be contiguous: device code asserts,
-/// host code raises through `THError`.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int NewDim>
-__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastInner() {
-  // Can only create tensors of lesser dimension
-  thc_static_assert(NewDim < Dim);
-
-  // Folding dimensions that carry padding would expose garbage
-  // elements, so the folded range has to be contiguous.
-  bool cont = isContiguousRange(NewDim, Dim);
-#ifdef __CUDA_ARCH__
-  // Device code
-  assert(cont);
-#else
-  // Host code
-  if (!cont) {
-    THError("Can only downcast contiguous tensors");
-  }
-#endif
-
-  IndexT outSize[NewDim];
-  IndexT outStride[NewDim];
-
-  // Element count of the folded trailing dimensions
-  IndexT foldedSize = 1;
-  for (int d = Dim - 1; d >= NewDim; --d) {
-    foldedSize *= getSize(d);
-  }
-
-  // Last surviving dimension absorbs the folded ones and takes the
-  // stride of the original innermost dimension
-  outSize[NewDim - 1] = foldedSize * getSize(NewDim - 1);
-  outStride[NewDim - 1] = getStride(Dim - 1);
-
-  // Leading dimensions carry over unchanged
-  for (int d = NewDim - 2; d >= 0; --d) {
-    outSize[d] = getSize(d);
-    outStride[d] = getStride(d);
-  }
-
-  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
-    data_, outSize, outStride);
-}
-
-/// Builds a `SubDim`-rank tensor starting at `at` that reuses the sizes
-/// and strides of this tensor's innermost `SubDim` dimensions.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int SubDim>
-__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view(DataPtrType at) {
-  thc_static_assert(SubDim >= 1 && SubDim < Dim);
-
-  IndexT subSizes[SubDim];
-  IndexT subStrides[SubDim];
-
-  // Index of the first dimension that is kept in the view
-  const int firstKept = Dim - SubDim;
-
-  for (int d = firstKept; d < Dim; ++d) {
-    subSizes[d - firstKept] = size_[d];
-    subStrides[d - firstKept] = stride_[d];
-  }
-
-  return THCDeviceTensor<T, SubDim, IndexT, PtrTraits>(
-    at, subSizes, subStrides);
-}
-
-/// Convenience overload: the `SubDim`-rank view that starts at the
-/// beginning of this tensor's data.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int SubDim>
-__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view() {
-  DataPtrType start = data_;
-  return view<SubDim>(start);
-}
-
-/// Enqueues an asynchronous memset on `stream` that clears
-/// `numElements() * sizeof(T)` bytes starting at `data()`.
-///
-/// The tensor must be contiguous, because a single linear memset is
-/// issued. A non-contiguous tensor, or a failing `cudaMemsetAsync`
-/// call, is reported through `THError`.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-void
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::zero(cudaStream_t stream) {
-#ifdef __CUDA_ARCH__
-  assert(isContiguous());
-#else
-  if (!isContiguous()) {
-    // Message names this function (it previously referred to "fillAsync").
-    THError("zero only works on contiguous data");
-  }
-#endif
-
-  // Do not drop the return code: an unchecked failure here would leave
-  // the buffer untouched without any diagnostic.
-  cudaError_t err =
-    cudaMemsetAsync(data(), 0, numElements() * sizeof(T), stream);
-  if (err != cudaSuccess) {
-    THError("cudaMemsetAsync failed: %s", cudaGetErrorString(err));
-  }
-}
--- a/encoding/kernel/include/THCDeviceTensor.cuh
+++ b/encoding/kernel/include/THCDeviceTensor.cuh
-#ifndef THC_DEVICE_TENSOR_INC
-#define THC_DEVICE_TENSOR_INC
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-// A CUDA 6.5 compatible version of static_assert. Remove once on CUDA 7.0.
-//
-// The primary template is only declared, never defined, and only the
-// `true` specialization has a definition. `thc_static_assert(expr)`
-// creates a temporary `THCStaticAssert<(expr) != 0>`, so a false
-// expression names an incomplete type and fails to compile.
-template <bool>
-struct THCStaticAssert;
-
-// The only complete specialization: the assertion holds.
-template <>
-struct THCStaticAssert<true> {
-};
-
-// Usable as an expression statement inside function bodies.
-#define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>())
-
-/// Our tensor type
-template <typename T,
-          int Dim,
-          typename IndexT,
-          template <typename U> class PtrTraits>
-class THCDeviceTensor;
-
-/// Type of a subspace of a tensor
-namespace detail {
-template <typename TensorType,
-          int SubDim,
-          template <typename U> class PtrTraits>
-class THCDeviceSubTensor;
-}
-
-/// Pointer traits that declare the data pointer as `T* __restrict__`.
-/// Can be passed as the `PtrTraits` template argument of
-/// `THCDeviceTensor`.
-template <typename T>
-struct RestrictPtrTraits {
-  typedef T* __restrict__ PtrType;
-};
-
-/// Pointer traits that use a plain `T*`. This is the default `PtrTraits`
-/// argument of `THCDeviceTensor`.
-template <typename T>
-struct DefaultPtrTraits {
-  typedef T* PtrType;
-};
-
-/**
-   Templated multi-dimensional array that supports strided access of
-   elements. Main access is through `operator[]`; e.g.,
-   `tensor[x][y][z]`.
-
- `T` is the contained type (e.g., `float`)
- `Dim` is the tensor rank
- `IndexT` is the integer type used for size/stride arrays, and for
- all indexing math. Default is `int`, but for large tensors, `int64_t`
- can be used instead.
- `PtrTraits` are traits applied to our data pointer (T*). By default,
- this is just T*, but RestrictPtrTraits can be used to apply T*
- __restrict__ for alias-free analysis.
-*/
-template <typename T,
-          int Dim,
-          typename IndexT = int,
-          template <typename U> class PtrTraits = DefaultPtrTraits>
-class THCDeviceTensor {
- public:
-  enum { NumDim = Dim };
-  typedef T DataType;
-  typedef IndexT IndexType;
-  typedef typename PtrTraits<T>::PtrType DataPtrType;
-  typedef THCDeviceTensor<T, Dim, IndexT, PtrTraits> TensorType;
-
-  /// Default constructor
-  __host__ __device__ THCDeviceTensor();
-
-  /// Constructor that calculates strides with no padding
-  __host__ __device__ THCDeviceTensor(DataPtrType data,
-#ifdef _MSC_VER
-                                      const IndexT (&sizes)[Dim]);
-#else
-                                      const IndexT sizes[Dim]);
-#endif
-
-  /// Constructor that takes arbitrary size/stride arrays
-  __host__ __device__ THCDeviceTensor(DataPtrType data,
-#ifdef _MSC_VER
-                                      const IndexT (&sizes)[Dim],
-                                      const IndexT (&strides)[Dim]);
-#else
-                                      const IndexT sizes[Dim],
-                                      const IndexT strides[Dim]);
-#endif
-
-  /// Returns true if the two tensors are of the same dimensionality,
-  /// size and stride.
-  template <int OtherDim>
-  __host__ __device__ bool
-  isSameSizeAndStride(
-    const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const;
-
-  /// Cast to a tensor of a different type of the same size and stride
-  template <typename U>
-  __host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast();
-
-  /// Const version of `cast`
-  template <typename U>
-  __host__ __device__
-  const THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast() const;
-
-  /// Returns a raw pointer to the start of our data.
-  __host__ __device__ __forceinline__ DataPtrType data() {
-    return data_;
-  }
-
-  /// Returns a raw pointer to the start of our data (const).
-  __host__ __device__ __forceinline__
-  const DataPtrType data() const {
-    return data_;
-  }
-
-  /// Cast to a different datatype
-  template <typename U>
-  __host__ __device__ __forceinline__
-  typename PtrTraits<U>::PtrType dataAs() {
-    return reinterpret_cast<typename PtrTraits<U>::PtrType>(data_);
-  }
-
-  /// Cast to a different datatype
-  template <typename U>
-  __host__ __device__ __forceinline__
-  const typename PtrTraits<const U>::PtrType dataAs() const {
-    return reinterpret_cast<typename PtrTraits<const U>::PtrType>(data_);
-  }
-
-  /// Returns a read/write view of a portion of our tensor.
-  __host__ __device__ __forceinline__
-  detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
-    operator[](IndexT);
-
-  /// Returns a read/write view of a portion of our tensor (const).
-  __host__ __device__ __forceinline__
-  const detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
-    operator[](IndexT) const;
-
-  /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds
-  /// checking. Returned as `IndexT` so that values stored with a wide
-  /// index type (e.g. `int64_t`) are not truncated to `int`.
-  __host__ __device__ __forceinline__ IndexT getSize(int i) const {
-    return size_[i];
-  }
-
-  /// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds
-  /// checking. Returned as `IndexT` so that values stored with a wide
-  /// index type (e.g. `int64_t`) are not truncated to `int`.
-  __host__ __device__ __forceinline__ IndexT getStride(int i) const {
-    return stride_[i];
-  }
-
-  /// Returns the total number of elements contained within our data
-  /// (product of `getSize(i)`)
-  __host__ __device__ ptrdiff_t numElements() const;
-
-  /// Returns the size array.
-  __host__ __device__ __forceinline__ const IndexT* sizes() const {
-    return size_;
-  }
-
-  /// Returns the stride array.
-  __host__ __device__ __forceinline__ const IndexT* strides() const {
-    return stride_;
-  }
-
-  /// Returns true if there is no padding within the tensor and no
-  /// re-ordering of the dimensions.
-  /// ~~~
-  /// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 1
-  /// ~~~
-  __host__ __device__ bool isContiguous() const;
-
-  /// Returns whether a given dimension has only increasing stride
-  /// from the previous dimension. A tensor that was permuted by
-  /// exchanging size and stride only will fail this check.
-  /// If `i == 0` just check `stride > 0` and `size > 0`. Returns `false`
-  /// if `stride` is `<= 0` or `i` is outside `[0, Dim - 1]`.
-  __host__ __device__ bool isConsistentlySized(int i) const;
-
-  // Returns whether every dimension passes `isConsistentlySized(i)`,
-  // i.e. for `i > 0`, `stride(i - 1) / stride(i) >= size(i)`.
-  // If this is not the case then iterating once over the size space will
-  // touch the same memory locations multiple times.
-  __host__ __device__ bool isConsistentlySized() const;
-
-  /// Returns true if the given dimension range [first, last) has no padding.
-  __host__ __device__ bool isContiguousRange(int first, int last) const;
-
-  /// Returns a tensor of the same dimension after transposing the two
-  /// dimensions given. Does not actually move elements; transposition
-  /// is made by permuting the size/stride arrays.
-  /// If the dimensions are not valid, asserts.
-  __host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-  transpose(int dim1, int dim2) const;
-
-  /// Upcast a tensor of dimension `D` to some tensor of dimension
-  /// D' > D by padding the leading dimensions by 1
-  /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
-  template <int NewDim>
-  __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-  upcastOuter();
-
-  /// Upcast a tensor of dimension `D` to some tensor of dimension
-  /// D' > D by padding the lowest/most varying dimensions by 1
-  /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
-  template <int NewDim>
-  __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-  upcastInner();
-
-  /// Downcast a tensor of dimension `D` to some tensor of dimension
-  /// D' < D by collapsing the leading dimensions. asserts if there is
-  /// padding on the leading dimensions.
-  template <int NewDim>
-  __host__ __device__
-  THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastOuter();
-
-  /// Downcast a tensor of dimension `D` to some tensor of dimension
-  /// D' < D by collapsing the trailing (innermost) dimensions. asserts
-  /// if there is padding on the trailing dimensions.
-  template <int NewDim>
-  __host__ __device__
-  THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastInner();
-
-  /// Returns a tensor that is a view of the `SubDim`-dimensional slice
-  /// of this tensor, starting at `at`.
-  template <int SubDim>
-  __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
-  view(DataPtrType at);
-
-  /// Returns a tensor that is a view of the `SubDim`-dimensional slice
-  /// of this tensor, starting where our data begins
-  template <int SubDim>
-  __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
-  view();
-
-  /// Zeroes out the tensor asynchronously. Asserts if the contents
-  /// in question are not contiguous.
-  void zero(cudaStream_t stream = 0);
-
- private:
-  /// Raw pointer to where the tensor data begins
-  DataPtrType data_;
-
-  /// Array of strides (in sizeof(T) terms) per each dimension
-  IndexT stride_[Dim];
-
-  /// Size per each dimension
-  IndexT size_[Dim];
-};
-
-namespace detail {
-
-/// Specialization for a view of a single value (0-dimensional).
-/// Holds a reference to the parent tensor and a pointer to one element;
-/// it converts implicitly to a reference to that element and can be
-/// assigned from a value of the element type. Only the parent tensor and
-/// the rank-1 sub-tensor (both friends) can construct it.
-template <typename TensorType, template <typename U> class PtrTraits>
-class THCDeviceSubTensor<TensorType, 0, PtrTraits> {
- public:
-  /// Stores `val` into the referenced element. Returns a copy of this
-  /// proxy (by value).
-  __host__ __device__ THCDeviceSubTensor<TensorType, 0, PtrTraits>
-  operator=(typename TensorType::DataType val) {
-    *data_ = val;
-    return *this;
-  }
-
-  // operator T&
-  __host__ __device__ operator typename TensorType::DataType&() {
-    return *data_;
-  }
-
-  // const operator T& returning const T&
-  __host__ __device__ operator const typename TensorType::DataType&() const {
-    return *data_;
-  }
-
-  // operator& returning T*
-  __host__ __device__ typename TensorType::DataType* operator&() {
-    return data_;
-  }
-
-  // const operator& returning const T*
-  __host__ __device__ const typename TensorType::DataType* operator&() const {
-    return data_;
-  }
-
-  /// Returns a raw accessor to our slice.
-  __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
-    return data_;
-  }
-
-  /// Returns a raw accessor to our slice (const).
-  __host__ __device__ __forceinline__
-  const typename TensorType::DataPtrType data() const {
-    return data_;
-  }
-
-  /// Cast to a different datatype.
-  template <typename T>
-  __host__ __device__ T& as() {
-    return *dataAs<T>();
-  }
-
-  /// Cast to a different datatype (const).
-  template <typename T>
-  __host__ __device__ const T& as() const {
-    return *dataAs<T>();
-  }
-
-  /// Cast to a different datatype
-  template <typename T>
-  __host__ __device__ __forceinline__
-  typename PtrTraits<T>::PtrType dataAs() {
-    return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
-  }
-
-  /// Cast to a different datatype (const)
-  template <typename T>
-  __host__ __device__ __forceinline__
-  typename PtrTraits<const T>::PtrType dataAs() const {
-    return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
-  }
-
-  /// Use the texture cache for reads. `__ldg` is only used when
-  /// `__CUDA_ARCH__ >= 350`; otherwise this is a plain dereference.
-  __device__ __forceinline__ typename TensorType::DataType ldg() const {
-#if __CUDA_ARCH__ >= 350
-    return __ldg(data_);
-#else
-    return *data_;
-#endif
-  }
-
-  /// Use the texture cache for reads; cast as a particular type.
-  /// Falls back to `as<T>()` when `__CUDA_ARCH__ < 350`.
-  template <typename T>
-  __device__ __forceinline__ T ldgAs() const {
-#if __CUDA_ARCH__ >= 350
-    return __ldg(dataAs<T>());
-#else
-    return as<T>();
-#endif
-  }
-
-  private:
-  /// One dimension greater can create us
-  friend class THCDeviceSubTensor<TensorType, 1, PtrTraits>;
-
-  /// Our parent tensor can create us
-  friend class THCDeviceTensor<typename TensorType::DataType,
-                               1,
-                               typename TensorType::IndexType,
-                               PtrTraits>;
-
-  /// Private: instances are only created by the friends declared above.
-  __host__ __device__ __forceinline__ THCDeviceSubTensor(
-    TensorType& t,
-    typename TensorType::DataPtrType data)
-      : tensor_(t),
-        data_(data) {
-  }
-
-  /// The tensor we're referencing
-  TensorType& tensor_;
-
-  /// Where our value is located
-  typename TensorType::DataPtrType const data_;
-};
-
-/// A `SubDim`-rank slice of a parent THCDeviceTensor.
-/// Holds a reference to the parent tensor and a pointer to the start of
-/// the slice. Indexing with `operator[]` advances the pointer by
-/// `index * stride` of the parent dimension being consumed and yields a
-/// slice of rank `SubDim - 1`. Only the parent tensor and the slice of
-/// rank `SubDim + 1` (both friends) can construct it.
-template <typename TensorType,
-          int SubDim,
-          template <typename U> class PtrTraits>
-class THCDeviceSubTensor {
- public:
-  /// Returns a view of the data located at our offset (the dimension
-  /// `SubDim` - 1 tensor).
-  __host__ __device__ __forceinline__
-  THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
-    operator[](typename TensorType::IndexType index) {
-    return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
-      tensor_,
-      data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
-  }
-
-  /// Returns a view of the data located at our offset (the dimension
-  /// `SubDim` - 1 tensor) (const).
-  __host__ __device__ __forceinline__
-  const THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
-    operator[](typename TensorType::IndexType index) const {
-    return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
-      tensor_,
-      data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
-  }
-
-  // operator& returning T*
-  __host__ __device__ typename TensorType::DataType* operator&() {
-    return data_;
-  }
-
-  // const operator& returning const T*
-  __host__ __device__ const typename TensorType::DataType* operator&() const {
-    return data_;
-  }
-
-  /// Returns a raw accessor to our slice.
-  __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
-    return data_;
-  }
-
-  /// Returns a raw accessor to our slice (const).
-  __host__ __device__ __forceinline__
-  const typename TensorType::DataPtrType data() const {
-    return data_;
-  }
-
-  /// Cast to a different datatype.
-  template <typename T>
-  __host__ __device__ T& as() {
-    return *dataAs<T>();
-  }
-
-  /// Cast to a different datatype (const).
-  template <typename T>
-  __host__ __device__ const T& as() const {
-    return *dataAs<T>();
-  }
-
-  /// Cast to a different datatype
-  template <typename T>
-  __host__ __device__ __forceinline__
-  typename PtrTraits<T>::PtrType dataAs() {
-    return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
-  }
-
-  /// Cast to a different datatype (const)
-  template <typename T>
-  __host__ __device__ __forceinline__
-  typename PtrTraits<const T>::PtrType dataAs() const {
-    return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
-  }
-
-  /// Use the texture cache for reads. `__ldg` is only used when
-  /// `__CUDA_ARCH__ >= 350`; otherwise this is a plain dereference.
-  __device__ __forceinline__ typename TensorType::DataType ldg() const {
-#if __CUDA_ARCH__ >= 350
-    return __ldg(data_);
-#else
-    return *data_;
-#endif
-  }
-
-  /// Use the texture cache for reads; cast as a particular type.
-  /// Falls back to `as<T>()` when `__CUDA_ARCH__ < 350`.
-  template <typename T>
-  __device__ __forceinline__ T ldgAs() const {
-#if __CUDA_ARCH__ >= 350
-    return __ldg(dataAs<T>());
-#else
-    return as<T>();
-#endif
-  }
-
-  /// Returns a tensor that is a view of the SubDim-dimensional slice
-  /// of this tensor, starting where our data begins.
-  /// Marked `__host__ __device__` like every other member, so the view
-  /// can also be taken from device code (it only forwards to the
-  /// parent's `__host__ __device__` `view<SubDim>(at)`).
-  __host__ __device__
-  THCDeviceTensor<typename TensorType::DataType,
-               SubDim,
-               typename TensorType::IndexType,
-               PtrTraits> view() {
-    return tensor_.template view<SubDim>(data_);
-  }
-
- private:
-  /// One dimension greater can create us
-  friend class THCDeviceSubTensor<TensorType, SubDim + 1, PtrTraits>;
-
-  /// Our parent tensor can create us
-  friend class
-  THCDeviceTensor<typename TensorType::DataType,
-               TensorType::NumDim,
-               typename TensorType::IndexType,
-               PtrTraits>;
-
-  /// Private: instances are only created by the friends declared above.
-  __host__ __device__ __forceinline__ THCDeviceSubTensor(
-    TensorType& t,
-    typename TensorType::DataPtrType data)
-      : tensor_(t),
-        data_(data) {
-  }
-
-  /// The tensor we're referencing
-  TensorType& tensor_;
-
-  /// The start of our sub-region
-  typename TensorType::DataPtrType const data_;
-};
-
-} // namespace detail
-
-/// Indexes the outermost dimension: wraps the whole tensor in a
-/// `Dim`-rank sub-tensor and returns the result of indexing that, which
-/// is a slice of rank `Dim - 1`.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ __forceinline__
-detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
-                        Dim - 1, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) {
-  typedef detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits> WholeView;
-
-  WholeView whole(*this, data_);
-  return whole[index];
-}
-
-/// Const overload of outermost-dimension indexing. The sub-tensor
-/// stores a non-const reference to its parent, so constness is cast
-/// away when the wrapping `Dim`-rank view is built; the returned slice
-/// itself is const.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ __forceinline__
-const detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
-                              Dim - 1, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) const {
-  typedef detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits> WholeView;
-
-  WholeView whole(const_cast<TensorType&>(*this), data_);
-  return whole[index];
-}
-
-#include "THCDeviceTensor-inl.cuh"
-
-#endif // THC_DEVICE_TENSOR_INC
--- a/encoding/kernel/include/THCDeviceTensorUtils-inl.cuh
+++ b/encoding/kernel/include/THCDeviceTensorUtils-inl.cuh
-namespace detail {
-
-// Add a layer of SFINAE to support static_assert
-// Root of the upcast helpers: only declares `make`; the definition that
-// is used lives in the `B == true` specialization of UpcastTHC below.
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim, bool B>
-struct UpcastTHCRoot {
-  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-  make(THCState* state, THCudaTensor* t);
-};
-
-// Primary template; the `bool B` parameter selects one of the two
-// partial specializations that follow.
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim, bool B>
-struct UpcastTHC :
-      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
-};
-
-// Never instantiated; exists for SFINAE purposes only.
-// Provides no `make` body, so the static assert in the `true` branch is
-// not reached when `NewDim > Dim` does not hold.
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim>
-struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
-      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
-};
-
-// Selected when `NewDim > Dim`: converts `t` to a `Dim`-rank device
-// tensor and pads it with leading dimensions via `upcastOuter<NewDim>()`.
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim>
-struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
-      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true>  {
-  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-  make(THCState* state, THCudaTensor* t) {
-    thc_static_assert(NewDim > Dim);
-    return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
-      template upcastOuter<NewDim>();
-  }
-};
-
-// Add a layer of SFINAE to support static_assert
-// Root of the downcast helpers: only declares `make`; the definition
-// that is used lives in the `B == true` specialization of DowncastTHC
-// below.
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim, bool B>
-struct DowncastTHCRoot {
-  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-  make(THCState* state, THCudaTensor* t);
-};
-
-// Primary template; the `bool B` parameter selects one of the two
-// partial specializations that follow.
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim, bool B>
-struct DowncastTHC :
-      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
-};
-
-// Never instantiated; exists for SFINAE purposes only.
-// Provides no `make` body, so the static assert in the `true` branch is
-// not reached when `NewDim < Dim` does not hold.
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim>
-struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
-      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
-};
-
-// Selected when `NewDim < Dim`: converts `t` to a `Dim`-rank device
-// tensor and collapses its leading dimensions via
-// `downcastOuter<NewDim>()`.
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim>
-struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
-      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true>  {
-  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-  make(THCState* state, THCudaTensor* t) {
-    thc_static_assert(NewDim < Dim);
-    return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
-      template downcastOuter<NewDim>();
-  }
-};
-
-} // namespace detail
-
-#define SWITCH_UNROLL_CUDA_CAST_FACTORY(i)                              \
-  case i:                                                               \
-  if (NewDim > i) {                                                     \
-    return detail::UpcastTHC<T, i, IndexT,                              \
-                             PtrTraits, NewDim, (NewDim > i)>::         \
-      make(state, t);                                                   \
-  } else if (NewDim == i) {                                             \
-    return toDeviceTensor<T, NewDim, IndexT, PtrTraits>(state, t);      \
-  } else {                                                              \
-    return detail::DowncastTHC<T, i, IndexT,                            \
-                               PtrTraits, NewDim, (NewDim < i)>::       \
-      make(state, t);                                                   \
-  }                                                                     \
-  /* break; */
-
-template <typename T, int NewDim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-toDeviceTensorCast(THCState* state, THCudaTensor* t) {
-  switch (THCudaTensor_nDimension(state, t)) {
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(1);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(2);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(3);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(4);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(5);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(6);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(7);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(8);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(9);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(10);
-    default:
-      ;
-  }
-
-  // Not implemented
-  THError("THCDeviceTensor dimension size not supported");
-  return NULL; /* never enters this piece, appeasing compiler warnings */
-}
-
-#undef SWITCH_UNROLL_CUDA_CAST_FACTORY
--- a/encoding/kernel/include/THCDeviceTensorUtils.cu
+++ b/encoding/kernel/include/THCDeviceTensorUtils.cu
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "generic/THCDeviceTensorUtils.cu"
-#else
-
-/// Constructs a THCDeviceTensor initialized from a THCudaTensor. Will
-/// error if the dimensionality does not match exactly.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t);
-
-template <typename T, int Dim, typename IndexT>
-THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  return toDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim>
-THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  return toDeviceTensor<T, Dim, int, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  if (Dim != THCTensor_(nDimension)(state, t)) {
-    THError("THCudaTensor dimension mismatch");
-  }
-  // Determine the maximum offset into the tensor achievable; `IndexT`
-  // must be smaller than this type in order to use it.
-  ptrdiff_t maxOffset = 0;
-  IndexT sizes[Dim];
-  IndexT strides[Dim];
-
-  for (int i = 0; i < Dim; ++i) {
-    int64_t size = THCTensor_(size)(state, t, i);
-    int64_t stride = THCTensor_(stride)(state, t, i);
-
-    maxOffset += (size - 1) * stride;
-
-    sizes[i] = (IndexT) size;
-    strides[i] = (IndexT) stride;
-  }
-
-  if (maxOffset > std::numeric_limits<IndexT>::max()) {
-    THError("THCudaTensor sizes too large for THCDeviceTensor conversion");
-  }
-
-  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(
-    THCTensor_(data)(state, t), sizes, strides);
-}
-
-#endif
--- a/encoding/kernel/include/THCDeviceTensorUtils.cuh
+++ b/encoding/kernel/include/THCDeviceTensorUtils.cuh
-#ifndef THC_DEVICE_TENSOR_UTILS_INC
-#define THC_DEVICE_TENSOR_UTILS_INC
-
-#include "THCDeviceTensor.cuh"
-#include "THCTensor.h"
-#include <limits>
-
-/// Constructs a DeviceTensor initialized from a THCudaTensor by
-/// upcasting or downcasting the tensor to that of a different
-/// dimension.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensorCast(THCState* state, THCudaTensor* t);
-
-template <typename T, int Dim, typename IndexT>
-THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
-toDeviceTensorCast(THCState* state, THCudaTensor* t) {
-  return toDeviceTensorCast<T, Dim, IndexT, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim>
-THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
-toDeviceTensorCast(THCState* state, THCudaTensor* t) {
-  return toDeviceTensorCast<T, Dim, int, DefaultPtrTraits>(state, t);
-}
-
-#include "generic/THCDeviceTensorUtils.cu"
-#include "THCGenerateAllTypes.h"
-
-#include "THCDeviceTensorUtils-inl.cuh"
-
-#endif // THC_DEVICE_TENSOR_UTILS_INC
--- a/encoding/kernel/include/generic/THCDeviceTensorUtils.cu
+++ b/encoding/kernel/include/generic/THCDeviceTensorUtils.cu
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "generic/THCDeviceTensorUtils.cu"
-#else
-
-/// Constructs a THCDeviceTensor initialized from a THCudaTensor. Will
-/// error if the dimensionality does not match exactly.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t);
-
-template <typename T, int Dim, typename IndexT>
-THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  return toDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim>
-THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  return toDeviceTensor<T, Dim, int, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  if (Dim != THCTensor_(nDimension)(state, t)) {
-    THError("THCudaTensor dimension mismatch");
-  }
-  // Determine the maximum offset into the tensor achievable; `IndexT`
-  // must be smaller than this type in order to use it.
-  ptrdiff_t maxOffset = 0;
-  IndexT sizes[Dim];
-  IndexT strides[Dim];
-
-  for (int i = 0; i < Dim; ++i) {
-    int64_t size = THCTensor_(size)(state, t, i);
-    int64_t stride = THCTensor_(stride)(state, t, i);
-
-    maxOffset += (size - 1) * stride;
-
-    sizes[i] = (IndexT) size;
-    strides[i] = (IndexT) stride;
-  }
-
-  if (maxOffset > std::numeric_limits<IndexT>::max()) {
-    THError("THCudaTensor sizes too large for THCDeviceTensor conversion");
-  }
-
-  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(
-    THCTensor_(data)(state, t), sizes, strides);
-}
-
-#endif
--- a/encoding/kernel/thc_encoding.cu
+++ b/encoding/kernel/thc_encoding.cu
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- * Created by: Hang Zhang
- * ECE Department, Rutgers University
- * Email: zhang.hang@rutgers.edu
- * Copyright (c) 2017
- *
- * This source code is licensed under the MIT-style license found in the
- * LICENSE file in the root directory of this source tree 
- *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- */
-#include "thc_encoding.h"
-#include "common.h"
-
-#include "generic/device_tensor.h"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/device_tensor.h"
-#include "THC/THCGenerateDoubleType.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// float
-#include "generic/encoding_utils.c"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/encoding_kernel.c"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/syncbn_kernel.c"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/pooling_kernel.c"
-#include "THC/THCGenerateFloatType.h"
-
-// double
-#include "generic/encoding_utils.c"
-#include "THC/THCGenerateDoubleType.h"
-
-#include "generic/encoding_kernel.c"
-#include "THC/THCGenerateDoubleType.h"
-
-#include "generic/syncbn_kernel.c"
-#include "THC/THCGenerateDoubleType.h"
-
-#include "generic/pooling_kernel.c"
-#include "THC/THCGenerateDoubleType.h"
-
-#ifdef __cplusplus
-}
-#endif
--- a/encoding/kernel/thc_encoding.h
+++ b/encoding/kernel/thc_encoding.h
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- * Created by: Hang Zhang
- * ECE Department, Rutgers University
- * Email: zhang.hang@rutgers.edu
- * Copyright (c) 2017
- *
- * This source code is licensed under the MIT-style license found in the
- * LICENSE file in the root directory of this source tree 
- *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- */
-#include <THC.h>
-#include "THCDeviceTensor.cuh"
-#include "THCDeviceTensorUtils.cuh"
-
-// this symbol will be resolved automatically from PyTorch libs
-extern THCState *state;
-
-#define Encoding_(NAME) TH_CONCAT_4(Encoding_, Real, _, NAME)
-#define THCTensor        TH_CONCAT_3(TH,CReal,Tensor)
-#define THCTensor_(NAME) TH_CONCAT_4(TH,CReal,Tensor_,NAME)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// float
-#include "generic/encoding_kernel.h"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/syncbn_kernel.h"
-#include "THC/THCGenerateFloatType.h"
-
-#include "generic/pooling_kernel.h"
-#include "THC/THCGenerateFloatType.h"
-
-// double
-#include "generic/encoding_kernel.h"
-#include "THC/THCGenerateDoubleType.h"
-
-#include "generic/syncbn_kernel.h"
-#include "THC/THCGenerateDoubleType.h"
-
-#include "generic/pooling_kernel.h"
-#include "THC/THCGenerateDoubleType.h"
-
-#ifdef __cplusplus
-}
-#endif
--- a/encoding/make.sh
+++ b/encoding/make.sh
-#!/usr/bin/env bash
-mkdir -p encoding/lib && cd encoding/lib
-# compile and install
-cmake ..
-make
--- a/encoding/models/__init__.py
+++ b/encoding/models/__init__.py
+from .model_zoo import get_model
+from .base import *
+from .fcn import *
+from .encnet import *
+
+def get_segmentation_model(name, **kwargs):
+    from .fcn import get_fcn
+    models = {
+        'fcn': get_fcn,
+        'encnet': get_encnet,
+    }
+    return models[name.lower()](**kwargs)
--- a/encoding/models/base.py
+++ b/encoding/models/base.py
+###########################################################################
+# Created by: Hang Zhang 
+# Email: zhang.hang@rutgers.edu 
+# Copyright (c) 2017
+###########################################################################
+
+import math
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.functional import upsample
+from torch.nn.parallel.data_parallel import DataParallel
+from torch.nn.parallel.parallel_apply import parallel_apply
+from torch.nn.parallel.scatter_gather import scatter
+
+from .. import dilated as resnet
+from ..utils import batch_pix_accuracy, batch_intersection_union
+
+up_kwargs = {'mode': 'bilinear', 'align_corners': True}
+
+__all__ = ['BaseNet', 'EvalModule', 'MultiEvalModule']
+
+class BaseNet(nn.Module):
+    def __init__(self, nclass, backbone, aux, se_loss, dilated=True, norm_layer=None,
+                 mean=[.485, .456, .406], std=[.229, .224, .225]):
+        super(BaseNet, self).__init__()
+        self.nclass = nclass
+        self.aux = aux
+        self.se_loss = se_loss
+        self.mean = mean
+        self.std = std
+        # copying modules from pretrained models
+        if backbone == 'resnet50':
+            self.pretrained = resnet.resnet50(pretrained=True, dilated=dilated, norm_layer=norm_layer)
+        elif backbone == 'resnet101':
+            self.pretrained = resnet.resnet101(pretrained=True, dilated=dilated, norm_layer=norm_layer)
+        elif backbone == 'resnet152':
+            self.pretrained = resnet.resnet152(pretrained=True, dilated=dilated, norm_layer=norm_layer)
+        else:
+            raise RuntimeError('unknown backbone: {}'.format(backbone))
+        # bilinear upsample options
+        self._up_kwargs = up_kwargs
+
+    def base_forward(self, x):
+        x = self.pretrained.conv1(x)
+        x = self.pretrained.bn1(x)
+        x = self.pretrained.relu(x)
+        x = self.pretrained.maxpool(x)
+        c1 = self.pretrained.layer1(x)
+        c2 = self.pretrained.layer2(c1)
+        c3 = self.pretrained.layer3(c2)
+        c4 = self.pretrained.layer4(c3)
+        return c1, c2, c3, c4
+
+    def evaluate(self, x, target=None):
+        pred = self.forward(x)
+        if isinstance(pred, (tuple, list)):
+            pred = pred[0]
+        if target is None:
+            return pred
+        correct, labeled = batch_pix_accuracy(pred.data, target.data)
+        inter, union = batch_intersection_union(pred.data, target.data, self.nclass)
+        return correct, labeled, inter, union
+
+
+class EvalModule(nn.Module):
+    """Segmentation Eval Module"""
+    def __init__(self, module):
+        super(EvalModule, self).__init__()
+        self.module = module
+
+    def forward(self, *inputs, **kwargs):
+        return self.module.evaluate(*inputs, **kwargs)
+
+
+class MultiEvalModule(DataParallel):
+    """Multi-size Segmentation Eavluator"""
+    def __init__(self, module, nclass, device_ids=None,
+                 base_size=520, crop_size=480, flip=True,
+                 scales=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75]):
+        super(MultiEvalModule, self).__init__(module, device_ids)
+        self.nclass = nclass
+        self.base_size = base_size
+        self.crop_size = crop_size
+        self.scales = scales
+        self.flip = flip
+
+    def parallel_forward(self, inputs, **kwargs):
+        """Multi-GPU Mult-size Evaluation
+
+        Args:
+            inputs: list of Tensors
+        """
+        inputs = [(input.unsqueeze(0).cuda(device),) for input, device in zip(inputs, self.device_ids)]
+        replicas = self.replicate(self, self.device_ids[:len(inputs)])
+        kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
+        if len(inputs) < len(kwargs):
+            inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
+        elif len(kwargs) < len(inputs):
+            kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
+        outputs = self.parallel_apply(replicas, inputs, kwargs)
+        return outputs
+
+    def forward(self, image):
+        """Mult-size Evaluation"""
+        # only single image is supported for evaluation
+        batch, _, h, w = image.size()
+        assert(batch == 1)
+        stride_rate = 2.0/3.0
+        crop_size = self.crop_size
+        stride = int(crop_size * stride_rate)
+        with torch.cuda.device_of(image):
+            scores = image.new().resize_(batch,self.nclass,h,w).zero_().cuda()
+
+        for scale in self.scales:
+            long_size = int(math.ceil(self.base_size * scale))
+            if h > w:
+                height = long_size
+                width = int(1.0 * w * long_size / h + 0.5)
+                short_size = width
+            else:
+                width = long_size
+                height = int(1.0 * h * long_size / w + 0.5)
+                short_size = height
+            # resize image to current size
+            cur_img = resize_image(image, height, width)
+            if scale <= 1.25 or long_size <= crop_size:# #
+                pad_img = pad_image(cur_img, self.module.mean,
+                                    self.module.std, crop_size)
+                outputs = self.module_inference(pad_img)
+                outputs = crop_image(outputs, 0, height, 0, width)
+            else:
+                if short_size < crop_size:
+                    # pad if needed
+                    pad_img = pad_image(cur_img, self.module.mean,
+                                        self.module.std, crop_size)
+                else:
+                    pad_img = cur_img
+                _,_,ph,pw = pad_img.size()
+                assert(ph >= height and pw >= width)
+                # grid forward and normalize
+                h_grids = int(math.ceil(1.0*(ph-crop_size)/stride)) + 1
+                w_grids = int(math.ceil(1.0*(pw-crop_size)/stride)) + 1
+                with torch.cuda.device_of(image):
+                    outputs = image.new().resize_(batch,self.nclass,ph,pw).zero_().cuda()
+                    count_norm = image.new().resize_(batch,1,ph,pw).zero_().cuda()
+                # grid evaluation
+                for idh in range(h_grids):
+                    for idw in range(w_grids):
+                        h0 = idh * stride
+                        w0 = idw * stride
+                        h1 = min(h0 + crop_size, ph)
+                        w1 = min(w0 + crop_size, pw)
+                        crop_img = crop_image(pad_img, h0, h1, w0, w1)
+                        # pad if needed
+                        pad_crop_img = pad_image(crop_img, self.module.mean,
+                                                 self.module.std, crop_size)
+                        output = self.module_inference(pad_crop_img)
+                        outputs[:,:,h0:h1,w0:w1] += crop_image(output,
+                            0, h1-h0, 0, w1-w0)
+                        count_norm[:,:,h0:h1,w0:w1] += 1
+                assert((count_norm==0).sum()==0)
+                outputs = outputs / count_norm
+                outputs = outputs[:,:,:height,:width]
+
+            score = resize_image(outputs, h, w)
+            scores += score
+
+        return scores
+
+    def module_inference(self, image):
+        output = self.module.evaluate(image)
+        if self.flip:
+            fimg = flip_image(image)
+            foutput = self.module.evaluate(fimg)
+            output += flip_image(foutput)
+        return output.exp()
+
+
+def resize_image(img, h, w, mode='bilinear'):
+    return F.upsample(img, (h, w), **up_kwargs)
+
+def pad_image(img, mean, std, crop_size):
+    b,c,h,w = img.size()
+    assert(c==3)
+    padh = crop_size - h if h < crop_size else 0
+    padw = crop_size - w if w < crop_size else 0
+    pad_values = -np.array(mean) / np.array(std)
+    img_pad = img.new().resize_(b,c,h+padh,w+padw)
+    #img_pad = F.pad(img, (0,padw,0,padh))
+    for i in range(c):
+        # note that pytorch pad params is in reversed orders
+        img_pad[:,i,:,:] = F.pad(img[:,i,:,:], (0, padw, 0, padh), 
+            value=pad_values[i])
+    assert(img_pad.size(2)>=crop_size and img_pad.size(3)>=crop_size)
+    return img_pad
+
+def crop_image(img, h0, h1, w0, w1):
+    return img[:,:,h0:h1,w0:w1]
+
+def flip_image(img):
+    assert(img.dim()==4)
+    with torch.cuda.device_of(img):
+        idx = torch.arange(img.size(3)-1, -1, -1).type_as(img).long()
+    return img.index_select(3, idx)
--- a/encoding/models/encnet.py
+++ b/encoding/models/encnet.py
+###########################################################################
+# Created by: Hang Zhang 
+# Email: zhang.hang@rutgers.edu 
+# Copyright (c) 2017
+###########################################################################
+
+import torch
+from torch.autograd import Variable
+import torch.nn as nn
+from torch.nn.functional import upsample
+
+import encoding
+from .base import BaseNet
+from .fcn import FCNHead
+
+__all__ = ['EncNet', 'EncModule', 'get_encnet', 'get_encnet_resnet50_pcontext']
+
+class EncNet(BaseNet):
+    def __init__(self, nclass, backbone, aux=True, se_loss=True,
+                 norm_layer=nn.BatchNorm2d, **kwargs):
+        super(EncNet, self).__init__(nclass, backbone, aux, se_loss, norm_layer=norm_layer)
+        self.head = EncHead(self.nclass, in_channels=2048, se_loss=se_loss,
+                            norm_layer=norm_layer, up_kwargs=self._up_kwargs)
+        if aux:
+            self.auxlayer = FCNHead(1024, nclass, norm_layer=norm_layer)
+
+    def forward(self, x):
+        imsize = x.size()[2:]
+        #features = self.base_forward(x)
+        _, _, c3, c4 = self.base_forward(x)
+
+        x = list(self.head(c4))
+        x[0] = upsample(x[0], imsize, **self._up_kwargs)
+        if self.aux:
+            auxout = self.auxlayer(c3)
+            auxout = upsample(auxout, imsize, **self._up_kwargs)
+            x.append(auxout)
+        return tuple(x)
+
+
+class EncModule(nn.Module):
+    def __init__(self, in_channels, nclass, ncodes=32, se_loss=True, norm_layer=None):
+        super(EncModule, self).__init__()
+        if isinstance(norm_layer, encoding.nn.BatchNorm2d):
+            norm_layer = encoding.nn.BatchNorm1d
+        else:
+            norm_layer = nn.BatchNorm1d
+        self.se_loss = se_loss
+        self.encoding = nn.Sequential(
+            encoding.nn.Encoding(D=in_channels, K=ncodes),
+            norm_layer(ncodes),
+            nn.ReLU(inplace=True),
+            encoding.nn.Sum(dim=1))
+        self.fc = nn.Sequential(
+            nn.Linear(in_channels, in_channels),
+            nn.Sigmoid())
+        if self.se_loss:
+            self.selayer = nn.Linear(in_channels, nclass)
+
+    def forward(self, x):
+        en = self.encoding(x)
+        b, c, _, _ = x.size()
+        gamma = self.fc(en)
+        y = gamma.view(b, c, 1, 1)
+        # residual ?
+        outputs = [x + x * y]
+        if self.se_loss:
+            outputs.append(self.selayer(en))
+        return tuple(outputs)
+
+
+class EncHead(nn.Module):
+    def __init__(self, out_channels, in_channels, se_loss=True,
+                 norm_layer=None, up_kwargs=None):
+        super(EncHead, self).__init__()
+        self.conv5 = nn.Sequential(
+            nn.Conv2d(in_channels, 512, 3, padding=1, bias=False),
+            norm_layer(512),
+            nn.ReLU(True))
+        self.encmodule = EncModule(512, out_channels, ncodes=32,
+            se_loss=se_loss, norm_layer=norm_layer)
+        self.dropout = nn.Dropout2d(0.1, False)
+        self.conv6 = nn.Conv2d(512, out_channels, 1)
+        self.se_loss = se_loss
+
+    def forward(self, x):
+        x = self.conv5(x)
+        outs = list(self.encmodule(x))
+        outs[0] = self.conv6(self.dropout(outs[0]))
+        return tuple(outs)
+
+
+def get_encnet(dataset='pascal_voc', backbone='resnet50', pretrained=False,
+               root='~/.encoding/models', **kwargs):
+    r"""EncNet model from the paper `"Context Encoding for Semantic Segmentation"
+    <https://arxiv.org/pdf/1803.08904.pdf>`_
+
+    Parameters
+    ----------
+    dataset : str, default pascal_voc
+        The dataset that model pretrained on. (pascal_voc, ade20k)
+    backbone : str, default resnet50
+        The backbone network. (resnet50, 101, 152)
+    pretrained : bool, default False
+        Whether to load the pretrained weights for model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+
+
+    Examples
+    --------
+    >>> model = get_encnet(dataset='pascal_voc', backbone='resnet50', pretrained=False)
+    >>> print(model)
+    """
+    acronyms = {
+        'pascal_voc': 'voc',
+        'ade20k': 'ade',
+        'pcontext': 'pcontext',
+    }
+    # infer number of classes
+    from ..datasets import datasets, VOCSegmentation, VOCAugSegmentation, ADE20KSegmentation
+    model = EncNet(datasets[dataset.lower()].NUM_CLASS, backbone=backbone, **kwargs)
+    if pretrained:
+        from .model_store import get_model_file
+        model.load_state_dict(torch.load(
+            get_model_file('encnet_%s_%s'%(backbone, acronyms[dataset]), root=root)))
+    return model
+
+def get_encnet_resnet50_pcontext(pretrained=False, root='~/.encoding/models', **kwargs):
+    r"""EncNet-PSP model from the paper `"Context Encoding for Semantic Segmentation"
+    <https://arxiv.org/pdf/1803.08904.pdf>`_
+
+    Parameters
+    ----------
+    pretrained : bool, default False
+        Whether to load the pretrained weights for model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+
+
+    Examples
+    --------
+    >>> model = get_encnet_resnet50_pcontext(pretrained=True)
+    >>> print(model)
+    """
+    return get_encnet('pcontext', 'resnet50', pretrained)
--- a/encoding/models/fcn.py
+++ b/encoding/models/fcn.py
+###########################################################################
+# Created by: Hang Zhang 
+# Email: zhang.hang@rutgers.edu 
+# Copyright (c) 2017
+###########################################################################
+from __future__ import division
+import os
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn.functional import upsample
+
+from .base import BaseNet
+
+__all__ = ['FCN', 'get_fcn', 'get_fcn_resnet50_pcontext', 'get_fcn_resnet50_ade']
+
+class FCN(BaseNet):
+    r"""Fully Convolutional Networks for Semantic Segmentation
+
+    Parameters
+    ----------
+    nclass : int
+        Number of categories for the training dataset.
+    backbone : string
+        Pre-trained dilated backbone network type (default:'resnet50'; 'resnet50',
+        'resnet101' or 'resnet152').
+    norm_layer : object
+        Normalization layer used in backbone network (default: :class:`mxnet.gluon.nn.BatchNorm`;
+
+
+    Reference:
+
+        Long, Jonathan, Evan Shelhamer, and Trevor Darrell. "Fully convolutional networks
+        for semantic segmentation." *CVPR*, 2015
+
+    Examples
+    --------
+    >>> model = FCN(nclass=21, backbone='resnet50')
+    >>> print(model)
+    """
+    def __init__(self, nclass, backbone, aux=True, se_loss=False, norm_layer=nn.BatchNorm2d, **kwargs):
+        super(FCN, self).__init__(nclass, backbone, aux, se_loss, norm_layer=norm_layer)
+        self.head = FCNHead(2048, nclass, norm_layer)
+        if aux:
+            self.auxlayer = FCNHead(1024, nclass, norm_layer)
+
+    def forward(self, x):
+        imsize = x.size()[2:]
+        _, _, c3, c4 = self.base_forward(x)
+
+        x = self.head(c4)
+        x = upsample(x, imsize, **self._up_kwargs)
+        outputs = [x]
+        if self.aux:
+            auxout = self.auxlayer(c3)
+            auxout = upsample(auxout, imsize, **self._up_kwargs)
+            outputs.append(auxout)
+        return tuple(outputs)
+
+        
+class FCNHead(nn.Module):
+    def __init__(self, in_channels, out_channels, norm_layer):
+        super(FCNHead, self).__init__()
+        inter_channels = in_channels // 4
+        self.conv5 = nn.Sequential(nn.Conv2d(in_channels, inter_channels, 3, padding=1),
+                                   norm_layer(inter_channels),
+                                   nn.ReLU(),
+                                   nn.Dropout2d(0.1, False),
+                                   nn.Conv2d(inter_channels, out_channels, 1))
+
+    def forward(self, x):
+        return self.conv5(x)
+
+
+def get_fcn(dataset='pascal_voc', backbone='resnet50', pretrained=False,
+            root='~/.encoding/models', **kwargs):
+    r"""FCN model from the paper `"Fully Convolutional Network for semantic segmentation"
+    <https://people.eecs.berkeley.edu/~jonlong/long_shelhamer_fcn.pdf>`_
+    Parameters
+    ----------
+    dataset : str, default pascal_voc
+        The dataset that model pretrained on. (pascal_voc, ade20k)
+    pretrained : bool, default False
+        Whether to load the pretrained weights for model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+    Examples
+    --------
+    >>> model = get_fcn(dataset='pascal_voc', backbone='resnet50', pretrained=False)
+    >>> print(model)
+    """
+    acronyms = {
+        'pascal_voc': 'voc',
+        'pascal_aug': 'voc',
+        'pcontext': 'pcontext',
+        'ade20k': 'ade',
+    }
+    # infer number of classes
+    from ..datasets import datasets, VOCSegmentation, VOCAugSegmentation, ADE20KSegmentation
+    model = FCN(datasets[dataset.lower()].NUM_CLASS, backbone=backbone, **kwargs)
+    if pretrained:
+        from .model_store import get_model_file
+        model.load_state_dict(torch.load(
+            get_model_file('fcn_%s_%s'%(backbone, acronyms[dataset]), root=root)),
+            strict= False)
+    return model
+
+def get_fcn_resnet50_pcontext(pretrained=False, root='~/.encoding/models', **kwargs):
+    r"""EncNet-PSP model from the paper `"Context Encoding for Semantic Segmentation"
+    <https://arxiv.org/pdf/1803.08904.pdf>`_
+
+    Parameters
+    ----------
+    pretrained : bool, default False
+        Whether to load the pretrained weights for model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+
+
+    Examples
+    --------
+    >>> model = get_fcn_resnet50_pcontext(pretrained=True)
+    >>> print(model)
+    """
+    return get_fcn('pcontext', 'resnet50', pretrained)
+
+def get_fcn_resnet50_ade(pretrained=False, root='~/.encoding/models', **kwargs):
+    r"""EncNet-PSP model from the paper `"Context Encoding for Semantic Segmentation"
+    <https://arxiv.org/pdf/1803.08904.pdf>`_
+
+    Parameters
+    ----------
+    pretrained : bool, default False
+        Whether to load the pretrained weights for model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+
+
+    Examples
+    --------
+    >>> model = get_fcn_resnet50_ade(pretrained=True)
+    >>> print(model)
+    """
+    return get_fcn('ade20k', 'resnet50', pretrained)
--- a/encoding/models/model_store.py
+++ b/encoding/models/model_store.py
+"""Model store which provides pretrained models."""
+from __future__ import print_function
+__all__ = ['get_model_file', 'purge']
+import os
+import zipfile
+
+from ..utils import download, check_sha1
+
+_model_sha1 = {name: checksum for checksum, name in [
+    ('eeed8e582f0fdccdba8579e7490570adc6d85c7c', 'fcn_resnet50_pcontext'),
+    ('969062a5aad2d1d983bae2f9e412578b62610114', 'encnet_resnet50_pcontext'),
+    ('fc8c0b795abf0133700c2d4265d2f9edab7eb6cc', 'fcn_resnet50_ade'),
+    ]}
+
+encoding_repo_url = 'https://hangzh.s3.amazonaws.com/'
+_url_format = '{repo_url}encoding/models/{file_name}.zip'
+
+def short_hash(name):
+    if name not in _model_sha1:
+        raise ValueError('Pretrained model for {name} is not available.'.format(name=name))
+    return _model_sha1[name][:8]
+
+def get_model_file(name, root=os.path.join('~', '.encoding', 'models')):
+    r"""Return location for the pretrained on local file system.
+
+    This function will download from online model zoo when model cannot be found or has mismatch.
+    The root directory will be created if it doesn't exist.
+
+    Parameters
+    ----------
+    name : str
+        Name of the model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+
+    Returns
+    -------
+    file_path
+        Path to the requested pretrained model file.
+    """
+    file_name = '{name}-{short_hash}'.format(name=name, short_hash=short_hash(name))
+    root = os.path.expanduser(root)
+    file_path = os.path.join(root, file_name+'.pth')
+    sha1_hash = _model_sha1[name]
+    if os.path.exists(file_path):
+        if check_sha1(file_path, sha1_hash):
+            return file_path
+        else:
+            print('Mismatch in the content of model file detected. Downloading again.')
+    else:
+        print('Model file is not found. Downloading.')
+
+    if not os.path.exists(root):
+        os.makedirs(root)
+
+    zip_file_path = os.path.join(root, file_name+'.zip')
+    repo_url = os.environ.get('ENCODING_REPO', encoding_repo_url)
+    if repo_url[-1] != '/':
+        repo_url = repo_url + '/'
+    download(_url_format.format(repo_url=repo_url, file_name=file_name),
+             path=zip_file_path,
+             overwrite=True)
+    with zipfile.ZipFile(zip_file_path) as zf:
+        zf.extractall(root)
+    os.remove(zip_file_path)
+
+    if check_sha1(file_path, sha1_hash):
+        return file_path
+    else:
+        raise ValueError('Downloaded file has different hash. Please try again.')
+
+def purge(root=os.path.join('~', '.encoding', 'models')):
+    r"""Purge all pretrained model files in local file store.
+
+    Parameters
+    ----------
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+    """
+    root = os.path.expanduser(root)
+    files = os.listdir(root)
+    for f in files:
+        if f.endswith(".pth"):
+            os.remove(os.path.join(root, f))
+
+def pretrained_model_list():
+    return list(_model_sha1.keys())
--- a/encoding/models/model_zoo.py
+++ b/encoding/models/model_zoo.py
+# pylint: disable=wildcard-import, unused-wildcard-import
+
+from .fcn import *
+from .encnet import *
+
+__all__ = ['get_model']
+
+
+def get_model(name, **kwargs):
+    """Returns a pre-defined model by name
+
+    Parameters
+    ----------
+    name : str
+        Name of the model.
+    pretrained : bool
+        Whether to load the pretrained weights for model.
+    root : str, default '~/.encoding/models'
+        Location for keeping the model parameters.
+
+    Returns
+    -------
+    Module:
+        The model.
+    """
+    models = {
+        'fcn_resnet50_pcontext': get_fcn_resnet50_pcontext,
+        'encnet_resnet50_pcontext': get_encnet_resnet50_pcontext,
+        'fcn_resnet50_ade': get_fcn_resnet50_ade,
+        }
+    name = name.lower()
+    if name not in models:
+        raise ValueError('%s\n\t%s' % (str(e), '\n\t'.join(sorted(models.keys()))))
+    net = models[name](**kwargs)
+    return net
--- a/encoding/nn/customize.py
+++ b/encoding/nn/customize.py
@@ -11,13 +11,15 @@
 """Encoding Custermized NN Module"""
 import torch
 from torch.nn import Module, Sequential, Conv2d, ReLU, AdaptiveAvgPool2d, \
-    NLLLoss, BCELoss, CrossEntropyLoss
+    NLLLoss, BCELoss, CrossEntropyLoss, AvgPool2d, MaxPool2d, Parameter
 from torch.nn import functional as F
-
+from torch.autograd import Variable
 from .syncbn import BatchNorm2d

+torch_ver = torch.__version__[:3]
+
 __all__ = ['GramMatrix', 'SegmentationLosses', 'View', 'Sum', 'Mean',
-           'Normalize', 'PyramidPooling']
+           'Normalize']


 class GramMatrix(Module):
@@ -39,39 +41,51 @@ def softmax_crossentropy(input, target, weight, size_average, ignore_index, redu

 class SegmentationLosses(CrossEntropyLoss):
    """2D Cross Entropy Loss with Auxilary Loss"""
-    def __init__(self, aux, aux_weight=0.2, weight=None, size_average=True, ignore_index=-1):
+    def __init__(self, se_loss=False, se_weight=0.1, nclass=-1,
+                 aux=False, aux_weight=0.2, weight=None,
+                 size_average=True, ignore_index=-1):
        super(SegmentationLosses, self).__init__(weight, size_average, ignore_index)
+        self.se_loss = se_loss
        self.aux = aux
+        self.nclass = nclass
+        self.se_weight = se_weight
        self.aux_weight = aux_weight
+        self.bceloss = BCELoss(weight, size_average) 

    def forward(self, *inputs):
-        if not self.aux:
+        if not self.se_loss and not self.aux:
            return super(SegmentationLosses, self).forward(*inputs)
-        pred1, pred2, target = tuple(inputs)
-        loss1 = super(SegmentationLosses, self).forward(pred1, target)
-        loss2 = super(SegmentationLosses, self).forward(pred2, target)
-        return loss1 + self.aux_weight * loss2
-
-"""
-class SegmentationLosses(Module):
-    def __init__(self, aux, aux_weight=0.2, weight=None, size_average=True, ignore_index=-1):
-        super(SegmentationLosses, self).__init__()
-        self.aux = aux
-        self.aux_weight = aux_weight
-        # Somehow the size averge is not handled correctly on multi-gpu, so we average by ourself.
-        self.nll_loss = NLLLoss(weight, ignore_index=ignore_index, reduce=True)
-
-    def _forward_each(self, inputs, targets):
-        return self.nll_loss(F.log_softmax(inputs, dim=1), targets)
-
-    def forward(self, *inputs):
-        if not self.aux:
-            return self._forward_each(*inputs)
-        pred1, pred2, target = tuple(inputs)
-        loss1 = self._forward_each(pred1, target)
-        loss2 = self._forward_each(pred2, target)
-        return loss1 + self.aux_weight * loss2
-"""
+        elif not self.se_loss:
+            pred1, pred2, target = tuple(inputs)
+            loss1 = super(SegmentationLosses, self).forward(pred1, target)
+            loss2 = super(SegmentationLosses, self).forward(pred2, target)
+            return loss1 + self.aux_weight * loss2
+        elif not self.aux:
+            pred, se_pred, target = tuple(inputs)
+            se_target = self._get_batch_label_vector(target, nclass=self.nclass).type_as(pred)
+            loss1 = super(SegmentationLosses, self).forward(pred, target)
+            loss2 = self.bceloss(F.sigmoid(se_pred), se_target)
+            return loss1 + self.se_weight * loss2
+        else:
+            pred1, se_pred, pred2, target = tuple(inputs)
+            se_target = self._get_batch_label_vector(target, nclass=self.nclass).type_as(pred1)
+            loss1 = super(SegmentationLosses, self).forward(pred1, target)
+            loss2 = super(SegmentationLosses, self).forward(pred2, target)
+            loss3 = self.bceloss(F.sigmoid(se_pred), se_target)
+            return loss1 + self.aux_weight * loss2 + self.se_weight * loss3
+
+    @staticmethod
+    def _get_batch_label_vector(target, nclass):
+        # target is a 3D Variable BxHxW, output is 2D BxnClass
+        batch = target.size(0)
+        tvect = Variable(torch.zeros(batch, nclass))
+        for i in range(batch):
+            hist = torch.histc(target[i].cpu().data.float(), 
+                               bins=nclass, min=0,
+                               max=nclass-1)
+            vect = hist>0
+            tvect[i] = vect
+        return tvect


 class View(Module):
@@ -135,45 +149,3 @@ class Normalize(Module):

    def forward(self, x):
        return F.normalize(x, self.p, self.dim, eps=1e-10)
-
-
-class PyramidPooling(Module):
-    """
-    Reference:
-        Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
-    """
-    def __init__(self, in_channels):
-        super(PyramidPooling, self).__init__()
-        self.pool1 = AdaptiveAvgPool2d(1)
-        self.pool2 = AdaptiveAvgPool2d(2)
-        self.pool3 = AdaptiveAvgPool2d(3)
-        self.pool4 = AdaptiveAvgPool2d(6)
-
-        out_channels = int(in_channels/4)
-        self.conv1 = Sequential(Conv2d(in_channels, out_channels, 1),
-                                BatchNorm2d(out_channels),
-                                ReLU(True))
-        self.conv2 = Sequential(Conv2d(in_channels, out_channels, 1),
-                                BatchNorm2d(out_channels),
-                                ReLU(True))
-        self.conv3 = Sequential(Conv2d(in_channels, out_channels, 1),
-                                BatchNorm2d(out_channels),
-                                ReLU(True))
-        self.conv4 = Sequential(Conv2d(in_channels, out_channels, 1),
-                                BatchNorm2d(out_channels),
-                                ReLU(True))
-
-    def _cat_each(self, x, feat1, feat2, feat3, feat4):
-        assert(len(x) == len(feat1))
-        z = []
-        for i in range(len(x)):
-            z.append(torch.cat((x[i], feat1[i], feat2[i], feat3[i], feat4[i]), 1))
-        return z
-
-    def forward(self, x):
-        _, _, h, w = x.size()
-        feat1 = F.upsample(self.conv1(self.pool1(x)), (h, w), mode='bilinear')
-        feat2 = F.upsample(self.conv2(self.pool2(x)), (h, w), mode='bilinear')
-        feat3 = F.upsample(self.conv3(self.pool3(x)), (h, w), mode='bilinear')
-        feat4 = F.upsample(self.conv4(self.pool4(x)), (h, w), mode='bilinear')
-        return torch.cat((x, feat1, feat2, feat3, feat4), 1)
--- a/encoding/nn/encoding.py
+++ b/encoding/nn/encoding.py
@@ -15,9 +15,9 @@ import torch.nn.functional as F
 from torch.autograd import Variable
 from torch.nn.modules.utils import _pair

-from ..functions import scaledL2, aggregate, dilatedavgpool2d
+from ..functions import scaledL2, aggregate

-__all__ = ['Encoding', 'EncodingDrop', 'Inspiration', 'DilatedAvgPool2d', 'UpsampleConv2d']
+__all__ = ['Encoding', 'EncodingDrop', 'Inspiration', 'UpsampleConv2d']

 class Encoding(Module):
    r"""
@@ -203,82 +203,6 @@ class Inspiration(Module):
            + 'N x ' + str(self.C) + ')'


-class DilatedAvgPool2d(Module):
-    r"""We provide Dilated Average Pooling for the dilation of Densenet as
-    in :class:`encoding.dilated.DenseNet`.
-
-    Reference:
-
-        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
-        Amit Agrawal. “Context Encoding for Semantic Segmentation.
-        *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*
-
-    Applies a 2D average pooling over an input signal composed of several input planes.
-
-    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
-    output :math:`(B, C, H_{out}, W_{out})`, :attr:`kernel_size` :math:`(k_H,k_W)`,
-    :attr:`stride` :math:`(s_H,s_W)` :attr:`dilation` :math:`(d_H,d_W)`
-    can be precisely described as:
-
-    .. math::
-
-        \begin{array}{ll}
-        out(b, c, h, w)  = 1 / (k_H \cdot k_W) \cdot
-        \sum_{{m}=0}^{k_H-1} \sum_{{n}=0}^{k_W-1}
-        input(b, c, s_H \cdot h + d_H \cdot m, s_W \cdot w + d_W \cdot n)
-        \end{array}
-
-    | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides
-      for :attr:`padding` number of points
-
-    | The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`,
-      :attr:`dilation` can either be:
-
-        - a single ``int`` -- in which case the same value is used for the height
-          and width dimension
-        - a ``tuple`` of two ints -- in which case, the first `int` is used for
-          the height dimension, and the second `int` for the width dimension
-
-    Args:
-        kernel_size: the size of the window
-        stride: the stride of the window. Default value is :attr:`kernel_size`
-        padding: implicit zero padding to be added on both sides
-        dilation: the dilation parameter similar to Conv2d
-
-    Shape:
-        - Input: :math:`(B, C, H_{in}, W_{in})`
-        - Output: :math:`(B, C, H_{out}, W_{out})` where
-          :math:`H_{out} = floor((H_{in}  + 2 * padding[0] - kernel\_size[0]) / stride[0] + 1)`
-          :math:`W_{out} = floor((W_{in}  + 2 * padding[1] - kernel\_size[1]) / stride[1] + 1)`
-          For :attr:`stride=1`, the output featuremap preserves the same size as input.
-
-    Examples::
-
-        >>> # pool of square window of size=3, stride=2, dilation=2
-        >>> m = nn.DilatedAvgPool2d(3, stride=2, dilation=2)
-        >>> input = autograd.Variable(torch.randn(20, 16, 50, 32))
-        >>> output = m(input)
-
-    """
-    def __init__(self, kernel_size, stride=None, padding=0, dilation=1):
-        super(DilatedAvgPool2d, self).__init__()
-        self.kernel_size = kernel_size
-        self.stride = stride or kernel_size
-        self.padding = padding
-        self.dilation = dilation
-
-    def forward(self, input):
-        return dilatedavgpool2d(input, self.kernel_size, self.stride,
-                                self.padding, self.dilation)
-
-    def __repr__(self):
-        return self.__class__.__name__ + ' (' \
-            + 'size=' + str(self.kernel_size) \
-            + ', stride=' + str(self.stride) \
-            + ', padding=' + str(self.padding) \
-            + ', dilation=' + str(self.dilation) + ')'
-
-
 class UpsampleConv2d(Module):
    r"""
    To avoid the checkerboard artifacts of standard Fractionally-strided Convolution,

--- a/encoding/nn/syncbn.py
+++ b/encoding/nn/syncbn.py
@@ -23,34 +23,28 @@ from ..functions import *
 from ..parallel import allreduce
 from .comm import SyncMaster

+
 __all__ = ['BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'Module', 'Sequential', 'Conv1d',
           'Conv2d', 'ConvTranspose2d', 'ReLU', 'Sigmoid', 'MaxPool2d', 'AvgPool2d',
           'AdaptiveAvgPool2d', 'Dropout2d', 'Linear']

-# Adapt from https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
-_ChildMessage = collections.namedtuple('Message', ['sum', 'ssum', 'sum_size'])
-_MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std'])
-
 class _SyncBatchNorm(_BatchNorm):
-    def __init__(self, num_features, eps=1e-5, momentum=0.001, affine=True):
+    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
        super(_SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine)

        self._sync_master = SyncMaster(self._data_parallel_master)
-
-        self._is_parallel = False
        self._parallel_id = None
        self._slave_pipe = None

    def forward(self, input):
-        # If it is not parallel computation or is in evaluation mode, use PyTorch's implementation.
-        if not (self._is_parallel and self.training):
+        if not self.training:
            return batch_norm(
                input, self.running_mean, self.running_var, self.weight, self.bias,
                self.training, self.momentum, self.eps)

        # Resize the input to (B, C, -1).
        input_shape = input.size()
-        input = input.view(input.size(0), self.num_features, -1)
+        input = input.view(input_shape[0], self.num_features, -1)

        # sum(x) and sum(x^2)
        N = input.size(0) * input.size(2)
@@ -62,11 +56,9 @@ class _SyncBatchNorm(_BatchNorm):
        else:
            mean, inv_std = self._slave_pipe.run_slave(_ChildMessage(xsum, xsqsum, N))
        # forward
-        return batchnormtrain(input, self.weight, self.bias, mean, 1.0/inv_std).view(input_shape)
-
+        return batchnormtrain(input, mean, 1.0/inv_std, self.weight, self.bias).view(input_shape)

    def __data_parallel_replicate__(self, ctx, copy_id):
-        self._is_parallel = True
        self._parallel_id = copy_id

        # parallel_id == 0 means master device.
@@ -110,7 +102,12 @@ class _SyncBatchNorm(_BatchNorm):
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data

-        return mean, bias_var.clamp(self.eps) ** -0.5
+        return mean, (bias_var + self.eps) ** -0.5
+
+
+# API adapted from https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
+_ChildMessage = collections.namedtuple('Message', ['sum', 'ssum', 'sum_size'])
+_MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std'])


 class BatchNorm1d(_SyncBatchNorm):
@@ -193,12 +190,11 @@ class BatchNorm3d(_SyncBatchNorm):

 class SharedTensor(object):
    """Shared Tensor for cross GPU all reduce operation"""
-    def __init__(self, nGPUs, op):
+    def __init__(self, nGPUs):
        self.mutex = threading.Lock()
        self.all_tasks_done = threading.Condition(self.mutex)
        self.nGPUs = nGPUs
        self._clear()
-        self.op = op

    def _clear(self):
        self.N = 0
@@ -206,9 +202,7 @@ class SharedTensor(object):
        self.push_tasks = self.nGPUs
        self.reduce_tasks = self.nGPUs

-    def __call__(self, *inputs):
-        if self.nGPUs <= 1:
-            return tuple(inputs)
+    def push(self, *inputs):
        # push from device
        with self.mutex:
            if self.push_tasks == 0:
@@ -223,13 +217,15 @@ class SharedTensor(object):
                self.all_tasks_done.notify_all()
            while self.push_tasks:
                self.all_tasks_done.wait()
+
+    def pull(self, igpu):
        # pull from device
        with self.mutex:
            if igpu == 0:
                assert(len(self.dict) == self.nGPUs)
                # flatten the tensors
                self.list = [t for i in range(len(self.dict)) for t in self.dict[i]]
-                self.outlist = self.op(2, *self.list)
+                self.outlist = allreduce(2, *self.list)
                self.reduce_tasks -= 1
            else:
                self.reduce_tasks -= 1