v0.4.2

2d21747a · Zhang · 7e19143c · 7e19143c · 7e19143c · 7e19143c
Commit 2d21747a authored Jun 04, 2018 by Zhang
20 changed files
--- a/encoding/kernel/generic/syncbn_kernel.h
+++ b/encoding/kernel/generic/syncbn_kernel.h
-/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- * Created by: Hang Zhang
- * ECE Department, Rutgers University
- * Email: zhang.hang@rutgers.edu
- * Copyright (c) 2017
- *
- * This source code is licensed under the MIT-style license found in the
- * LICENSE file in the root directory of this source tree 
- *+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
- */
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "generic/syncbn_kernel.h"
-#else
-
-void Encoding_(BatchNorm_Forward)(THCState *state, 
-    THCTensor *output_, THCTensor *input_, 
-    THCTensor *mean_, THCTensor *invstd_,
-    THCTensor *gamma_, THCTensor *beta_);
-
-void Encoding_(BatchNorm_Backward)(THCState *state, 
-    THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_, 
-    THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_, 
-    THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_, 
-    THCTensor *gradMean_, THCTensor *gradStd_, int train);
-
-void Encoding_(Sum_Square_Forward)(THCState *state, 
-    THCTensor *input_, THCTensor *sum_, THCTensor *square_);
-
-void Encoding_(Sum_Square_Backward)(THCState *state, 
-    THCTensor *gradInput, THCTensor *input_, 
-    THCTensor *gradSum_, THCTensor *gradSquare_);
-
-#endif
--- a/encoding/kernel/include/README.rst
+++ b/encoding/kernel/include/README.rst
-Make a copy from PyTorch lib to make the compilation easier for users, due to so many questions and requests.
--- a/encoding/kernel/include/THCDeviceTensor-inl.cuh
+++ b/encoding/kernel/include/THCDeviceTensor-inl.cuh
-#include <assert.h>
-
-namespace detail {
-
-template <typename T, int N>
-__host__ __device__ void copy(T to[N], T from[N]) {
-  for (int i = 0; i < N; ++i) {
-    to[i] = from[i];
-  }
-}
-
-} // namespace detail
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor()
-    : data_(NULL) {
-  thc_static_assert(Dim > 0);
-
-  for (int i = 0; i < Dim; ++i) {
-    size_[i] = 0;
-    stride_[i] = (IndexT) 1;
-  }
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::
-#ifdef _MSC_VER
-THCDeviceTensor(DataPtrType data, const IndexT (&sizes)[Dim])
-#else
-THCDeviceTensor(DataPtrType data, const IndexT sizes[Dim])
-#endif
-    : data_(data) {
-  thc_static_assert(Dim > 0);
-
-  for (int i = 0; i < Dim; ++i) {
-    size_[i] = sizes[i];
-  }
-
-  stride_[Dim - 1] = (IndexT) 1;
-  for (int i = Dim - 2; i >= 0; --i) {
-    stride_[i] = stride_[i + 1] * sizes[i + 1];
-  }
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor(
-#ifdef _MSC_VER
-  DataPtrType data, const IndexT (&sizes)[Dim], const IndexT (&strides)[Dim])
-#else
-  DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
-#endif
-    : data_(data) {
-  thc_static_assert(Dim > 0);
-
-  for (int i = 0; i < Dim; ++i) {
-    size_[i] = sizes[i];
-    stride_[i] = strides[i];
-  }
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int OtherDim>
-__host__ __device__ bool
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isSameSizeAndStride(
-  const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const {
-  if (Dim != OtherDim) {
-    return false;
-  }
-
-  for (int i = 0; i < Dim; ++i) {
-    if (size_[i] != rhs.size_[i]) {
-      return false;
-    }
-
-    if (stride_[i] != rhs.stride_[i]) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <typename U>
-__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() {
-  thc_static_assert(sizeof(U) == sizeof(T));
-
-  return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
-    reinterpret_cast<U*>(data_), size_, stride_);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <typename U>
-__host__ __device__ const THCDeviceTensor<U, Dim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const {
-  thc_static_assert(sizeof(U) == sizeof(T));
-
-  return THCDeviceTensor<U, Dim, IndexT, PtrTraits>(
-    reinterpret_cast<U*>(data_), size_, stride_);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ ptrdiff_t
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const {
-  ptrdiff_t size = getSize(0);
-
-  for (int i = 1; i < Dim; ++i) {
-    size *= getSize(i);
-  }
-
-  return size;
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ bool
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguous() const {
-  return isContiguousRange(0, Dim);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ bool
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized(int i) const {
-  if (i == 0 && getStride(i) > 0 && getSize(i) > 0) {
-    return true;
-  } else if ((i > 0) && (i < Dim) && (getStride(i) > 0) &&
-             ((getStride(i - 1) / getStride(i)) >= getSize(i))) {
-    return true;
-  }
-
-  return false;
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ bool
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized() const {
-  for (int i = 0; i < Dim; ++i) {
-    if (!isConsistentlySized(i)) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ bool
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguousRange(
-  int first, int last) const {
-
-  int64_t prevSize = last < Dim ? getStride(last) * getSize(last) : 1;
-
-  for (int i = last - 1; i >= first; --i) {
-    if (getSize(i) != (IndexT) 1) {
-      if (getStride(i) == prevSize) {
-        prevSize *= getSize(i);
-      } else {
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::transpose(int dim1,
-                                                      int dim2) const {
-#ifdef __CUDA_ARCH__
-  // Device code
-  assert(dim1 >= 0 && dim1 < Dim);
-  assert(dim1 >= 0 && dim2 < Dim);
-#else
-  // Host code
-  if (dim1 < 0 || dim1 >= Dim) {
-    THError("dim1 out of bounds");
-  }
-
-  if (dim2 < 0 || dim2 >= Dim) {
-    THError("dim2 out of bounds");
-  }
-#endif
-
-  IndexT newSize[Dim];
-  IndexT newStride[Dim];
-
-  for (int i = 0; i < Dim; ++i) {
-    newSize[i] = size_[i];
-    newStride[i] = stride_[i];
-  }
-
-  IndexT tmp = newSize[dim1];
-  newSize[dim1] = newSize[dim2];
-  newSize[dim2] = tmp;
-
-  tmp = newStride[dim1];
-  newStride[dim1] = newStride[dim2];
-  newStride[dim2] = tmp;
-
-  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(data_, newSize, newStride);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int NewDim>
-__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastOuter() {
-  // Can only create tensors of greater dimension
-  thc_static_assert(NewDim > Dim);
-
-  IndexT newSize[NewDim];
-  IndexT newStride[NewDim];
-
-  int shift = NewDim - Dim;
-
-  for (int i = 0; i < NewDim; ++i) {
-    if (i < shift) {
-      // These are the extended dimensions
-      newSize[i] = (IndexT) 1;
-      newStride[i] = size_[0] * stride_[0];
-    } else {
-      // Shift the remaining dimensions
-      newSize[i] = size_[i - shift];
-      newStride[i] = stride_[i - shift];
-    }
-  }
-
-  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
-    data_, newSize, newStride);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int NewDim>
-__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastInner() {
-  // Can only create tensors of greater dimension
-  thc_static_assert(NewDim > Dim);
-
-  IndexT newSize[NewDim];
-  IndexT newStride[NewDim];
-
-  for (int i = 0; i < NewDim; ++i) {
-    if (i < Dim) {
-      // Existing dimensions get copied over
-      newSize[i] = size_[i];
-      newStride[i] = stride_[i];
-    } else {
-      // Extended dimensions
-      newSize[i] = (IndexT) 1;
-      newStride[i] = (IndexT) 1;
-    }
-  }
-
-  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
-    data_, newSize, newStride);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int NewDim>
-__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastOuter() {
-  // Can only create tensors of lesser dimension
-  thc_static_assert(NewDim < Dim);
-
-  // We can't downcast non-contiguous tensors, since it leaves
-  // garbage data in the tensor. The tensor needs to be contiguous
-  // in all of the dimensions we are collapsing (no padding in
-  // them).
-  bool cont = isContiguousRange(0, Dim - NewDim);
-#ifdef __CUDA_ARCH__
-  // Device code
-  assert(cont);
-#else
-  // Host code
-  if (!cont) {
-    THError("Can only downcast contiguous tensors");
-  }
-#endif
-
-  IndexT newSize[NewDim];
-  IndexT newStride[NewDim];
-
-  int ignoredDims = Dim - NewDim;
-  IndexT collapsedSize = 1;
-
-  for (int i = 0; i < Dim; ++i) {
-    if (i < ignoredDims) {
-      // Collapse these dimensions
-      collapsedSize *= getSize(i);
-    } else {
-      // Non-collapsed dimensions
-      if (i == ignoredDims) {
-        // This is the first non-collapsed dimension
-        newSize[i - ignoredDims] = collapsedSize * getSize(i);
-      } else {
-        // Subsequent non-collapsed dimensions
-        newSize[i - ignoredDims] = getSize(i);
-      }
-
-      newStride[i - ignoredDims] = getStride(i);
-    }
-  }
-
-  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
-    data_, newSize, newStride);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int NewDim>
-__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastInner() {
-  // Can only create tensors of lesser dimension
-  thc_static_assert(NewDim < Dim);
-
-  // We can't downcast non-contiguous tensors, since it leaves
-  // garbage data in the tensor. The tensor needs to be contiguous
-  // in all of the dimensions we are collapsing (no padding in
-  // them).
-  bool cont = isContiguousRange(NewDim, Dim);
-#ifdef __CUDA_ARCH__
-  // Device code
-  assert(cont);
-#else
-  // Host code
-  if (!cont) {
-    THError("Can only downcast contiguous tensors");
-  }
-#endif
-
-  IndexT newSize[NewDim];
-  IndexT newStride[NewDim];
-
-  IndexT collapsedSize = 1;
-
-  for (int i = Dim - 1; i >= 0; --i) {
-    if (i >= NewDim) {
-      // Collapse these dimensions
-      collapsedSize *= getSize(i);
-    } else {
-      // Non-collapsed dimensions
-      if (i == NewDim - 1) {
-        // This is the first non-collapsed dimension
-        newSize[i] = collapsedSize * getSize(i);
-        newStride[i] = getStride(Dim - 1);
-      } else {
-        // Subsequent non-collapsed dimensions
-        newSize[i] = getSize(i);
-        newStride[i] = getStride(i);
-      }
-    }
-  }
-
-  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
-    data_, newSize, newStride);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int SubDim>
-__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view(DataPtrType at) {
-  thc_static_assert(SubDim >= 1 && SubDim < Dim);
-
-  IndexT viewSizes[SubDim];
-  IndexT viewStrides[SubDim];
-
-  for (int i = 0; i < SubDim; ++i) {
-    viewSizes[i] = size_[Dim - SubDim + i];
-    viewStrides[i] = stride_[Dim - SubDim + i];
-  }
-
-  return THCDeviceTensor<T, SubDim, IndexT, PtrTraits>(
-    at, viewSizes, viewStrides);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-template <int SubDim>
-__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view() {
-  return view<SubDim>(data_);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-void
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::zero(cudaStream_t stream) {
-#ifdef __CUDA_ARCH__
-  assert(isContiguous());
-#else
-  if (!isContiguous()) {
-    THError("fillAsync only works on contiguous data");
-  }
-#endif
-
-  cudaMemsetAsync(data(), 0, numElements() * sizeof(T), stream);
-}
--- a/encoding/kernel/include/THCDeviceTensor.cuh
+++ b/encoding/kernel/include/THCDeviceTensor.cuh
-#ifndef THC_DEVICE_TENSOR_INC
-#define THC_DEVICE_TENSOR_INC
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-// A CUDA 6.5 compatible version of static_assert. Remove once on CUDA 7.0.
-template <bool>
-struct THCStaticAssert;
-
-template <>
-struct THCStaticAssert<true> {
-};
-
-#define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>())
-
-/// Our tensor type
-template <typename T,
-          int Dim,
-          typename IndexT,
-          template <typename U> class PtrTraits>
-class THCDeviceTensor;
-
-/// Type of a subspace of a tensor
-namespace detail {
-template <typename TensorType,
-          int SubDim,
-          template <typename U> class PtrTraits>
-class THCDeviceSubTensor;
-}
-
-template <typename T>
-struct RestrictPtrTraits {
-  typedef T* __restrict__ PtrType;
-};
-
-template <typename T>
-struct DefaultPtrTraits {
-  typedef T* PtrType;
-};
-
-/**
-   Templated multi-dimensional array that supports strided access of
-   elements. Main access is through `operator[]`; e.g.,
-   `tensor[x][y][z]`.
-
- `T` is the contained type (e.g., `float`)
- `Dim` is the tensor rank
- `IndexT` is the integer type used for size/stride arrays, and for
- all indexing math. Default is `int`, but for large tensors, `int64_t`
- can be used instead.
- `PtrTraits` are traits applied to our data pointer (T*). By default,
- this is just T*, but RestrictPtrTraits can be used to apply T*
- __restrict__ for alias-free analysis.
-*/
-template <typename T,
-          int Dim,
-          typename IndexT = int,
-          template <typename U> class PtrTraits = DefaultPtrTraits>
-class THCDeviceTensor {
- public:
-  enum { NumDim = Dim };
-  typedef T DataType;
-  typedef IndexT IndexType;
-  typedef typename PtrTraits<T>::PtrType DataPtrType;
-  typedef THCDeviceTensor<T, Dim, IndexT, PtrTraits> TensorType;
-
-  /// Default constructor
-  __host__ __device__ THCDeviceTensor();
-
-  /// Constructor that calculates strides with no padding
-  __host__ __device__ THCDeviceTensor(DataPtrType data,
-#ifdef _MSC_VER
-                                      const IndexT (&sizes)[Dim]);
-#else
-                                      const IndexT sizes[Dim]);
-#endif
-
-  /// Constructor that takes arbitrary size/stride arrays
-  __host__ __device__ THCDeviceTensor(DataPtrType data,
-#ifdef _MSC_VER
-                                      const IndexT (&sizes)[Dim],
-                                      const IndexT (&strides)[Dim]);
-#else
-                                      const IndexT sizes[Dim],
-                                      const IndexT strides[Dim]);
-#endif
-
-  /// Returns true if the two tensors are of the same dimensionality,
-  /// size and stride.
-  template <int OtherDim>
-  __host__ __device__ bool
-  isSameSizeAndStride(
-    const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const;
-
-  /// Cast to a tensor of a different type of the same size and stride
-  template <typename U>
-  __host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast();
-
-  /// Const version of `cast`
-  template <typename U>
-  __host__ __device__
-  const THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast() const;
-
-  /// Returns a raw pointer to the start of our data.
-  __host__ __device__ __forceinline__ DataPtrType data() {
-    return data_;
-  }
-
-  /// Returns a raw pointer to the start of our data (const).
-  __host__ __device__ __forceinline__
-  const DataPtrType data() const {
-    return data_;
-  }
-
-  /// Cast to a different datatype
-  template <typename U>
-  __host__ __device__ __forceinline__
-  typename PtrTraits<U>::PtrType dataAs() {
-    return reinterpret_cast<typename PtrTraits<U>::PtrType>(data_);
-  }
-
-  /// Cast to a different datatype
-  template <typename U>
-  __host__ __device__ __forceinline__
-  const typename PtrTraits<const U>::PtrType dataAs() const {
-    return reinterpret_cast<typename PtrTraits<const U>::PtrType>(data_);
-  }
-
-  /// Returns a read/write view of a portion of our tensor.
-  __host__ __device__ __forceinline__
-  detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
-    operator[](IndexT);
-
-  /// Returns a read/write view of a portion of our tensor (const).
-  __host__ __device__ __forceinline__
-  const detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
-    operator[](IndexT) const;
-
-  /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds
-  /// checking.
-  __host__ __device__ __forceinline__ int getSize(int i) const {
-    return size_[i];
-  }
-
-  /// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds
-  /// checking.
-  __host__ __device__ __forceinline__ int getStride(int i) const {
-    return stride_[i];
-  }
-
-  /// Returns the total number of elements contained within our data
-  /// (product of `getSize(i)`)
-  __host__ __device__ ptrdiff_t numElements() const;
-
-  /// Returns the size array.
-  __host__ __device__ __forceinline__ const IndexT* sizes() const {
-    return size_;
-  }
-
-  /// Returns the stride array.
-  __host__ __device__ __forceinline__ const IndexT* strides() const {
-    return stride_;
-  }
-
-  /// Returns true if there is no padding within the tensor and no
-  /// re-ordering of the dimensions.
-  /// ~~~
-  /// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 0
-  /// ~~~
-  __host__ __device__ bool isContiguous() const;
-
-  /// Returns whether a given dimension has only increasing stride
-  /// from the previous dimension. A tensor that was permuted by
-  /// exchanging size and stride only will fail this check.
-  /// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`.
-  __host__ __device__ bool isConsistentlySized(int i) const;
-
-  // Returns whether at each dimension `stride <= size`.
-  // If this is not the case then iterating once over the size space will
-  // touch the same memory locations multiple times.
-  __host__ __device__ bool isConsistentlySized() const;
-
-  /// Returns true if the given dimension range [first, last) has no padding.
-  __host__ __device__ bool isContiguousRange(int first, int last) const;
-
-  /// Returns a tensor of the same dimension after transposing the two
-  /// dimensions given. Does not actually move elements; transposition
-  /// is made by permuting the size/stride arrays.
-  /// If the dimensions are not valid, asserts.
-  __host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-  transpose(int dim1, int dim2) const;
-
-  /// Upcast a tensor of dimension `D` to some tensor of dimension
-  /// D' > D by padding the leading dimensions by 1
-  /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
-  template <int NewDim>
-  __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-  upcastOuter();
-
-  /// Upcast a tensor of dimension `D` to some tensor of dimension
-  /// D' > D by padding the lowest/most varying dimensions by 1
-  /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
-  template <int NewDim>
-  __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-  upcastInner();
-
-  /// Downcast a tensor of dimension `D` to some tensor of dimension
-  /// D' < D by collapsing the leading dimensions. asserts if there is
-  /// padding on the leading dimensions.
-  template <int NewDim>
-  __host__ __device__
-  THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastOuter();
-
-  /// Downcast a tensor of dimension `D` to some tensor of dimension
-  /// D' < D by collapsing the leading dimensions. asserts if there is
-  /// padding on the leading dimensions.
-  template <int NewDim>
-  __host__ __device__
-  THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastInner();
-
-  /// Returns a tensor that is a view of the `SubDim`-dimensional slice
-  /// of this tensor, starting at `at`.
-  template <int SubDim>
-  __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
-  view(DataPtrType at);
-
-  /// Returns a tensor that is a view of the `SubDim`-dimensional slice
-  /// of this tensor, starting where our data begins
-  template <int SubDim>
-  __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
-  view();
-
-  /// Zeroes out the tensor asynchronously. Asserts if the contents
-  /// in question are not contiguous.
-  void zero(cudaStream_t stream = 0);
-
- private:
-  /// Raw pointer to where the tensor data begins
-  DataPtrType data_;
-
-  /// Array of strides (in sizeof(T) terms) per each dimension
-  IndexT stride_[Dim];
-
-  /// Size per each dimension
-  IndexT size_[Dim];
-};
-
-namespace detail {
-
-/// Specialization for a view of a single value (0-dimensional)
-template <typename TensorType, template <typename U> class PtrTraits>
-class THCDeviceSubTensor<TensorType, 0, PtrTraits> {
- public:
-  __host__ __device__ THCDeviceSubTensor<TensorType, 0, PtrTraits>
-  operator=(typename TensorType::DataType val) {
-    *data_ = val;
-    return *this;
-  }
-
-  // operator T&
-  __host__ __device__ operator typename TensorType::DataType&() {
-    return *data_;
-  }
-
-  // const operator T& returning const T&
-  __host__ __device__ operator const typename TensorType::DataType&() const {
-    return *data_;
-  }
-
-  // operator& returning T*
-  __host__ __device__ typename TensorType::DataType* operator&() {
-    return data_;
-  }
-
-  // const operator& returning const T*
-  __host__ __device__ const typename TensorType::DataType* operator&() const {
-    return data_;
-  }
-
-  /// Returns a raw accessor to our slice.
-  __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
-    return data_;
-  }
-
-  /// Returns a raw accessor to our slice (const).
-  __host__ __device__ __forceinline__
-  const typename TensorType::DataPtrType data() const {
-    return data_;
-  }
-
-  /// Cast to a different datatype.
-  template <typename T>
-  __host__ __device__ T& as() {
-    return *dataAs<T>();
-  }
-
-  /// Cast to a different datatype (const).
-  template <typename T>
-  __host__ __device__ const T& as() const {
-    return *dataAs<T>();
-  }
-
-  /// Cast to a different datatype
-  template <typename T>
-  __host__ __device__ __forceinline__
-  typename PtrTraits<T>::PtrType dataAs() {
-    return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
-  }
-
-  /// Cast to a different datatype (const)
-  template <typename T>
-  __host__ __device__ __forceinline__
-  typename PtrTraits<const T>::PtrType dataAs() const {
-    return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
-  }
-
-  /// Use the texture cache for reads
-  __device__ __forceinline__ typename TensorType::DataType ldg() const {
-#if __CUDA_ARCH__ >= 350
-    return __ldg(data_);
-#else
-    return *data_;
-#endif
-  }
-
-  /// Use the texture cache for reads; cast as a particular type
-  template <typename T>
-  __device__ __forceinline__ T ldgAs() const {
-#if __CUDA_ARCH__ >= 350
-    return __ldg(dataAs<T>());
-#else
-    return as<T>();
-#endif
-  }
-
-  private:
-  /// One dimension greater can create us
-  friend class THCDeviceSubTensor<TensorType, 1, PtrTraits>;
-
-  /// Our parent tensor can create us
-  friend class THCDeviceTensor<typename TensorType::DataType,
-                               1,
-                               typename TensorType::IndexType,
-                               PtrTraits>;
-
-  __host__ __device__ __forceinline__ THCDeviceSubTensor(
-    TensorType& t,
-    typename TensorType::DataPtrType data)
-      : tensor_(t),
-        data_(data) {
-  }
-
-  /// The tensor we're referencing
-  TensorType& tensor_;
-
-  /// Where our value is located
-  typename TensorType::DataPtrType const data_;
-};
-
-/// A `SubDim`-rank slice of a parent THCDeviceTensor
-template <typename TensorType,
-          int SubDim,
-          template <typename U> class PtrTraits>
-class THCDeviceSubTensor {
- public:
-  /// Returns a view of the data located at our offset (the dimension
-  /// `SubDim` - 1 tensor).
-  __host__ __device__ __forceinline__
-  THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
-    operator[](typename TensorType::IndexType index) {
-    return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
-      tensor_,
-      data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
-  }
-
-  /// Returns a view of the data located at our offset (the dimension
-  /// `SubDim` - 1 tensor) (const).
-  __host__ __device__ __forceinline__
-  const THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
-    operator[](typename TensorType::IndexType index) const {
-    return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
-      tensor_,
-      data_ + index * tensor_.getStride(TensorType::NumDim - SubDim));
-  }
-
-  // operator& returning T*
-  __host__ __device__ typename TensorType::DataType* operator&() {
-    return data_;
-  }
-
-  // const operator& returning const T*
-  __host__ __device__ const typename TensorType::DataType* operator&() const {
-    return data_;
-  }
-
-  /// Returns a raw accessor to our slice.
-  __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
-    return data_;
-  }
-
-  /// Returns a raw accessor to our slice (const).
-  __host__ __device__ __forceinline__
-  const typename TensorType::DataPtrType data() const {
-    return data_;
-  }
-
-  /// Cast to a different datatype.
-  template <typename T>
-  __host__ __device__ T& as() {
-    return *dataAs<T>();
-  }
-
-  /// Cast to a different datatype (const).
-  template <typename T>
-  __host__ __device__ const T& as() const {
-    return *dataAs<T>();
-  }
-
-  /// Cast to a different datatype
-  template <typename T>
-  __host__ __device__ __forceinline__
-  typename PtrTraits<T>::PtrType dataAs() {
-    return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
-  }
-
-  /// Cast to a different datatype (const)
-  template <typename T>
-  __host__ __device__ __forceinline__
-  typename PtrTraits<const T>::PtrType dataAs() const {
-    return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
-  }
-
-  /// Use the texture cache for reads
-  __device__ __forceinline__ typename TensorType::DataType ldg() const {
-#if __CUDA_ARCH__ >= 350
-    return __ldg(data_);
-#else
-    return *data_;
-#endif
-  }
-
-  /// Use the texture cache for reads; cast as a particular type
-  template <typename T>
-  __device__ __forceinline__ T ldgAs() const {
-#if __CUDA_ARCH__ >= 350
-    return __ldg(dataAs<T>());
-#else
-    return as<T>();
-#endif
-  }
-
-  /// Returns a tensor that is a view of the SubDim-dimensional slice
-  /// of this tensor, starting where our data begins
-  THCDeviceTensor<typename TensorType::DataType,
-               SubDim,
-               typename TensorType::IndexType,
-               PtrTraits> view() {
-    return tensor_.template view<SubDim>(data_);
-  }
-
- private:
-  /// One dimension greater can create us
-  friend class THCDeviceSubTensor<TensorType, SubDim + 1, PtrTraits>;
-
-  /// Our parent tensor can create us
-  friend class
-  THCDeviceTensor<typename TensorType::DataType,
-               TensorType::NumDim,
-               typename TensorType::IndexType,
-               PtrTraits>;
-
-  __host__ __device__ __forceinline__ THCDeviceSubTensor(
-    TensorType& t,
-    typename TensorType::DataPtrType data)
-      : tensor_(t),
-        data_(data) {
-  }
-
-  /// The tensor we're referencing
-  TensorType& tensor_;
-
-  /// The start of our sub-region
-  typename TensorType::DataPtrType const data_;
-};
-
-} // namespace detail
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ __forceinline__
-detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
-                        Dim - 1, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) {
-  return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
-    detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
-      *this, data_)[index]);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-__host__ __device__ __forceinline__
-const detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
-                              Dim - 1, PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) const {
-  return detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>(
-    detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits>(
-      const_cast<TensorType&>(*this), data_)[index]);
-}
-
-#include "THCDeviceTensor-inl.cuh"
-
-#endif // THC_DEVICE_TENSOR_INC
--- a/encoding/kernel/include/THCDeviceTensorUtils-inl.cuh
+++ b/encoding/kernel/include/THCDeviceTensorUtils-inl.cuh
-namespace detail {
-
-// Add a layer of SFINAE to support static_assert
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim, bool B>
-struct UpcastTHCRoot {
-  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-  make(THCState* state, THCudaTensor* t);
-};
-
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim, bool B>
-struct UpcastTHC :
-      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
-};
-
-// Never instantiated SFINAE purposes only
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim>
-struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
-      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
-};
-
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim>
-struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
-      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true>  {
-  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-  make(THCState* state, THCudaTensor* t) {
-    thc_static_assert(NewDim > Dim);
-    return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
-      template upcastOuter<NewDim>();
-  }
-};
-
-// Add a layer of SFINAE to support static_assert
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim, bool B>
-struct DowncastTHCRoot {
-  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-  make(THCState* state, THCudaTensor* t);
-};
-
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim, bool B>
-struct DowncastTHC :
-      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
-};
-
-// Never instantiated SFINAE purposes only
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim>
-struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
-      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
-};
-
-template <typename T, int Dim, typename IndexT,
-          template <typename U> class PtrTraits,
-          int NewDim>
-struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
-      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true>  {
-  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-  make(THCState* state, THCudaTensor* t) {
-    thc_static_assert(NewDim < Dim);
-    return toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t).
-      template downcastOuter<NewDim>();
-  }
-};
-
-} // namespace detail
-
-#define SWITCH_UNROLL_CUDA_CAST_FACTORY(i)                              \
-  case i:                                                               \
-  if (NewDim > i) {                                                     \
-    return detail::UpcastTHC<T, i, IndexT,                              \
-                             PtrTraits, NewDim, (NewDim > i)>::         \
-      make(state, t);                                                   \
-  } else if (NewDim == i) {                                             \
-    return toDeviceTensor<T, NewDim, IndexT, PtrTraits>(state, t);      \
-  } else {                                                              \
-    return detail::DowncastTHC<T, i, IndexT,                            \
-                               PtrTraits, NewDim, (NewDim < i)>::       \
-      make(state, t);                                                   \
-  }                                                                     \
-  /* break; */
-
-template <typename T, int NewDim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
-toDeviceTensorCast(THCState* state, THCudaTensor* t) {
-  switch (THCudaTensor_nDimension(state, t)) {
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(1);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(2);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(3);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(4);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(5);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(6);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(7);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(8);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(9);
-    SWITCH_UNROLL_CUDA_CAST_FACTORY(10);
-    default:
-      ;
-  }
-
-  // Not implemented
-  THError("THCDeviceTensor dimension size not supported");
-  return NULL; /* never enters this piece, appeasing compiler warnings */
-}
-
-#undef SWITCH_UNROLL_CUDA_CAST_FACTORY
--- a/encoding/kernel/include/THCDeviceTensorUtils.cu
+++ b/encoding/kernel/include/THCDeviceTensorUtils.cu
-#ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "generic/THCDeviceTensorUtils.cu"
-#else
-
-/// Constructs a THCDeviceTensor initialized from a THCudaTensor. Will
-/// error if the dimensionality does not match exactly.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t);
-
-template <typename T, int Dim, typename IndexT>
-THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  return toDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim>
-THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  return toDeviceTensor<T, Dim, int, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensor(THCState* state, THCTensor* t) {
-  if (Dim != THCTensor_(nDimension)(state, t)) {
-    THError("THCudaTensor dimension mismatch");
-  }
-  // Determine the maximum offset into the tensor achievable; `IndexT`
-  // must be smaller than this type in order to use it.
-  ptrdiff_t maxOffset = 0;
-  IndexT sizes[Dim];
-  IndexT strides[Dim];
-
-  for (int i = 0; i < Dim; ++i) {
-    int64_t size = THCTensor_(size)(state, t, i);
-    int64_t stride = THCTensor_(stride)(state, t, i);
-
-    maxOffset += (size - 1) * stride;
-
-    sizes[i] = (IndexT) size;
-    strides[i] = (IndexT) stride;
-  }
-
-  if (maxOffset > std::numeric_limits<IndexT>::max()) {
-    THError("THCudaTensor sizes too large for THCDeviceTensor conversion");
-  }
-
-  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(
-    THCTensor_(data)(state, t), sizes, strides);
-}
-
-#endif
--- a/encoding/kernel/include/THCDeviceTensorUtils.cuh
+++ b/encoding/kernel/include/THCDeviceTensorUtils.cuh
-#ifndef THC_DEVICE_TENSOR_UTILS_INC
-#define THC_DEVICE_TENSOR_UTILS_INC
-
-#include "THCDeviceTensor.cuh"
-#include "THCTensor.h"
-#include <limits>
-
-/// Constructs a DeviceTensor initialized from a THCudaTensor by
-/// upcasting or downcasting the tensor to that of a different
-/// dimension.
-template <typename T, int Dim,
-          typename IndexT, template <typename U> class PtrTraits>
-THCDeviceTensor<T, Dim, IndexT, PtrTraits>
-toDeviceTensorCast(THCState* state, THCudaTensor* t);
-
-template <typename T, int Dim, typename IndexT>
-THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
-toDeviceTensorCast(THCState* state, THCudaTensor* t) {
-  return toDeviceTensorCast<T, Dim, IndexT, DefaultPtrTraits>(state, t);
-}
-
-template <typename T, int Dim>
-THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
-toDeviceTensorCast(THCState* state, THCudaTensor* t) {
-  return toDeviceTensorCast<T, Dim, int, DefaultPtrTraits>(state, t);
-}
-
-#include "generic/THCDeviceTensorUtils.cu"
-#include "THCGenerateAllTypes.h"
-
-#include "THCDeviceTensorUtils-inl.cuh"
-
-#endif // THC_DEVICE_TENSOR_UTILS_INC
--- a/encoding/kernel/include/generic/THCDeviceTensorUtils.cu
+++ b/encoding/kernel/include/generic/THCDeviceTensorUtils.cu
--- a/encoding/kernel/thc_encoding.cu
+++ b/encoding/kernel/thc_encoding.cu
--- a/encoding/kernel/thc_encoding.h
+++ b/encoding/kernel/thc_encoding.h
--- a/encoding/make.sh
+++ b/encoding/make.sh
-#!/usr/bin/env bash
-mkdir -p encoding/lib && cd encoding/lib
-# compile and install
-cmake ..
-make
--- a/encoding/models/__init__.py
+++ b/encoding/models/__init__.py
--- a/encoding/models/base.py
+++ b/encoding/models/base.py
--- a/encoding/models/encnet.py
+++ b/encoding/models/encnet.py
--- a/encoding/models/fcn.py
+++ b/encoding/models/fcn.py
--- a/encoding/models/model_store.py
+++ b/encoding/models/model_store.py
--- a/encoding/models/model_zoo.py
+++ b/encoding/models/model_zoo.py
--- a/encoding/nn/customize.py
+++ b/encoding/nn/customize.py
--- a/encoding/nn/encoding.py
+++ b/encoding/nn/encoding.py
--- a/encoding/nn/syncbn.py
+++ b/encoding/nn/syncbn.py