Commit 2d21747a authored by Zhang's avatar Zhang
Browse files

v0.4.2

parent 7e19143c
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/syncbn_kernel.h"
#else
/* Prototype of the generic BatchNorm_Forward entry point; only the
 * declaration is visible here.
 * NOTE(review): judging by the parameter names this presumably normalizes
 * `input_` into `output_` using `mean_`/`invstd_` and scales/shifts with
 * `gamma_`/`beta_` -- confirm against the implementation. */
void Encoding_(BatchNorm_Forward)(THCState *state,
THCTensor *output_, THCTensor *input_,
THCTensor *mean_, THCTensor *invstd_,
THCTensor *gamma_, THCTensor *beta_);
/* Prototype of the generic BatchNorm_Backward entry point.
 * NOTE(review): the `grad*` arguments are presumably outputs and `train`
 * presumably selects training-mode behavior -- neither is visible from this
 * header; confirm against the implementation. */
void Encoding_(BatchNorm_Backward)(THCState *state,
THCTensor *gradoutput_, THCTensor *input_, THCTensor *gradinput_,
THCTensor *gradgamma_, THCTensor *gradbeta_, THCTensor *mean_,
THCTensor *invstd_, THCTensor *gamma_, THCTensor *beta_,
THCTensor *gradMean_, THCTensor *gradStd_, int train);
/* Prototype of the generic Sum_Square_Forward entry point.
 * NOTE(review): presumably reduces `input_` into `sum_` and `square_`; the
 * reduction axes are not visible here -- confirm against the implementation. */
void Encoding_(Sum_Square_Forward)(THCState *state,
THCTensor *input_, THCTensor *sum_, THCTensor *square_);
/* Prototype of the generic Sum_Square_Backward entry point.
 * NOTE(review): presumably fills `gradInput` from `gradSum_`/`gradSquare_`
 * and `input_` -- confirm against the implementation. */
void Encoding_(Sum_Square_Backward)(THCState *state,
THCTensor *gradInput, THCTensor *input_,
THCTensor *gradSum_, THCTensor *gradSquare_);
#endif
This is a copy of code from the PyTorch library, bundled here to make compilation easier for users in response to many questions and requests.
#include <assert.h>
namespace detail {

/// Copies the first `N` elements of `from` into `to`, one element at a
/// time. `N` must be given explicitly since array parameters decay to
/// pointers.
template <typename T, int N>
__host__ __device__ void copy(T to[N], T from[N]) {
  int idx = 0;
  while (idx < N) {
    to[idx] = from[idx];
    ++idx;
  }
}

} // namespace detail
/// Default constructor: null data pointer, every size set to zero and
/// every stride set to one.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor()
    : data_(NULL) {
  thc_static_assert(Dim > 0);

  // The per-dimension writes are independent, so walk from the back.
  int d = Dim;
  while (d-- > 0) {
    stride_[d] = (IndexT) 1;
    size_[d] = 0;
  }
}
/// Constructs a tensor over `data` with the given sizes; strides are
/// derived for a densely packed layout (innermost stride is 1, each outer
/// stride is the next inner stride times the next inner size).
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::
#ifdef _MSC_VER
THCDeviceTensor(DataPtrType data, const IndexT (&sizes)[Dim])
#else
THCDeviceTensor(DataPtrType data, const IndexT sizes[Dim])
#endif
    : data_(data) {
  thc_static_assert(Dim > 0);

  // Single pass from the innermost dimension outwards: each stride only
  // depends on the stride and size of the dimension directly inside it.
  for (int d = Dim - 1; d >= 0; --d) {
    size_[d] = sizes[d];
    if (d == Dim - 1) {
      stride_[d] = (IndexT) 1;
    } else {
      stride_[d] = stride_[d + 1] * sizes[d + 1];
    }
  }
}
/// Constructs a tensor over `data`, copying the caller-supplied size and
/// stride arrays verbatim (no validation is performed).
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::THCDeviceTensor(
#ifdef _MSC_VER
  DataPtrType data, const IndexT (&sizes)[Dim], const IndexT (&strides)[Dim])
#else
  DataPtrType data, const IndexT sizes[Dim], const IndexT strides[Dim])
#endif
    : data_(data) {
  thc_static_assert(Dim > 0);

  int d = 0;
  while (d < Dim) {
    stride_[d] = strides[d];
    size_[d] = sizes[d];
    ++d;
  }
}
/// Returns true when `rhs` has the same rank as this tensor and every
/// per-dimension size and stride matches.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int OtherDim>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isSameSizeAndStride(
  const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const {
  // Different rank can never match.
  if (Dim != OtherDim) {
    return false;
  }

  // Stop scanning at the first dimension that disagrees.
  bool matches = true;
  for (int d = 0; matches && d < Dim; ++d) {
    matches = (size_[d] == rhs.size_[d]) && (stride_[d] == rhs.stride_[d]);
  }

  return matches;
}
/// Reinterprets the tensor's storage as elements of type `U` (which must
/// have the same size as `T`), keeping the sizes and strides.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() {
  thc_static_assert(sizeof(U) == sizeof(T));

  typedef THCDeviceTensor<U, Dim, IndexT, PtrTraits> RetypedTensor;
  U* retyped = reinterpret_cast<U*>(data_);
  return RetypedTensor(retyped, size_, stride_);
}
/// Const overload of `cast`: reinterprets the storage as elements of type
/// `U` (same size as `T`), keeping the sizes and strides.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <typename U>
__host__ __device__ const THCDeviceTensor<U, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::cast() const {
  thc_static_assert(sizeof(U) == sizeof(T));

  typedef THCDeviceTensor<U, Dim, IndexT, PtrTraits> RetypedTensor;
  U* retyped = reinterpret_cast<U*>(data_);
  return RetypedTensor(retyped, size_, stride_);
}
/// Returns the product of the sizes of all `Dim` dimensions.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ ptrdiff_t
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::numElements() const {
  // Seed with the outermost extent, then fold in the remaining ones.
  ptrdiff_t count = getSize(0);

  int d = 1;
  while (d < Dim) {
    count *= getSize(d);
    ++d;
  }

  return count;
}
/// Returns true when the whole dimension range `[0, Dim)` passes
/// `isContiguousRange`.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguous() const {
  const bool packed = isContiguousRange(0, Dim);
  return packed;
}
/// Checks one dimension for a consistent size/stride relationship.
/// - For `i == 0`: both the stride and the size must be positive.
/// - For `0 < i < Dim`: the stride must be positive and the ratio
///   `stride(i - 1) / stride(i)` must be at least `size(i)`.
/// - Any other `i` yields false.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized(int i) const {
  if (i == 0) {
    return getStride(i) > 0 && getSize(i) > 0;
  }

  // Out-of-range indices are rejected before touching the arrays.
  if (i < 0 || i >= Dim) {
    return false;
  }

  // A non-positive stride fails the check (and would break the division).
  if (getStride(i) <= 0) {
    return false;
  }

  return (getStride(i - 1) / getStride(i)) >= getSize(i);
}
/// Returns true when every dimension passes `isConsistentlySized(i)`.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isConsistentlySized() const {
  // Advance while dimensions pass; stopping early means one failed.
  int d = 0;
  while (d < Dim && isConsistentlySized(d)) {
    ++d;
  }

  return d == Dim;
}
/// Returns true when dimensions `[first, last)` have no padding between
/// them. Dimensions of size 1 are ignored; every other dimension must have
/// a stride equal to the accumulated extent of the dimensions inside it.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ bool
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::isContiguousRange(
  int first, int last) const {
  // Stride expected of the next (outer) dimension. When `last` is a real
  // dimension it seeds the expectation; otherwise we start from 1.
  int64_t expected = 1;
  if (last < Dim) {
    expected = getStride(last) * getSize(last);
  }

  int d = last;
  while (--d >= first) {
    if (getSize(d) == (IndexT) 1) {
      // Size-1 dimensions place no constraint on the stride.
      continue;
    }

    if (getStride(d) != expected) {
      return false;
    }

    expected *= getSize(d);
  }

  return true;
}
/// Returns a tensor of the same rank whose dimensions `dim1` and `dim2`
/// are exchanged. No elements are moved: only the size and stride entries
/// of the two dimensions are swapped, and the result aliases the same data.
///
/// Both indices must lie in `[0, Dim)`. In device code this is enforced
/// with `assert`; in host code a violation is reported through `THError`.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::transpose(int dim1,
                                                      int dim2) const {
#ifdef __CUDA_ARCH__
  // Device code
  assert(dim1 >= 0 && dim1 < Dim);
  // Fixed: the lower bound previously re-checked dim1, so a negative dim2
  // slipped through and indexed the arrays out of bounds.
  assert(dim2 >= 0 && dim2 < Dim);
#else
  // Host code
  if (dim1 < 0 || dim1 >= Dim) {
    THError("dim1 out of bounds");
  }

  if (dim2 < 0 || dim2 >= Dim) {
    THError("dim2 out of bounds");
  }
#endif

  // Start from a copy of the current layout...
  IndexT newSize[Dim];
  IndexT newStride[Dim];

  for (int i = 0; i < Dim; ++i) {
    newSize[i] = size_[i];
    newStride[i] = stride_[i];
  }

  // ...then exchange the entries of the two requested dimensions.
  IndexT tmp = newSize[dim1];
  newSize[dim1] = newSize[dim2];
  newSize[dim2] = tmp;

  tmp = newStride[dim1];
  newStride[dim1] = newStride[dim2];
  newStride[dim2] = tmp;

  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(data_, newSize, newStride);
}
/// Returns a `NewDim`-rank view (`NewDim > Dim`) of the same data with
/// extra leading dimensions of size 1. Each added dimension gets stride
/// `size(0) * stride(0)`; the existing dimensions are shifted inwards.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastOuter() {
  // Can only create tensors of greater dimension
  thc_static_assert(NewDim > Dim);

  IndexT paddedSize[NewDim];
  IndexT paddedStride[NewDim];

  const int added = NewDim - Dim;

  // Leading, newly introduced dimensions.
  for (int d = 0; d < added; ++d) {
    paddedSize[d] = (IndexT) 1;
    paddedStride[d] = size_[0] * stride_[0];
  }

  // Original dimensions, moved `added` slots towards the inside.
  for (int d = added; d < NewDim; ++d) {
    paddedSize[d] = size_[d - added];
    paddedStride[d] = stride_[d - added];
  }

  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
    data_, paddedSize, paddedStride);
}
/// Returns a `NewDim`-rank view (`NewDim > Dim`) of the same data with
/// extra trailing dimensions of size 1 and stride 1; the existing
/// dimensions keep their position.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::upcastInner() {
  // Can only create tensors of greater dimension
  thc_static_assert(NewDim > Dim);

  IndexT paddedSize[NewDim];
  IndexT paddedStride[NewDim];

  // Existing dimensions are carried over unchanged.
  for (int d = 0; d < Dim; ++d) {
    paddedSize[d] = size_[d];
    paddedStride[d] = stride_[d];
  }

  // Trailing, newly introduced dimensions.
  for (int d = Dim; d < NewDim; ++d) {
    paddedSize[d] = (IndexT) 1;
    paddedStride[d] = (IndexT) 1;
  }

  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
    data_, paddedSize, paddedStride);
}
/// Returns a `NewDim`-rank view (`NewDim < Dim`) of the same data in which
/// the leading `Dim - NewDim` dimensions are folded into the first
/// remaining dimension. The folded range must pass `isContiguousRange`;
/// otherwise device code asserts and host code raises `THError`.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastOuter() {
  // Can only create tensors of lesser dimension
  thc_static_assert(NewDim < Dim);

  // Folding dimensions that contain padding would expose garbage
  // elements, so the collapsed range has to be free of padding.
  bool foldable = isContiguousRange(0, Dim - NewDim);
#ifdef __CUDA_ARCH__
  // Device code
  assert(foldable);
#else
  // Host code
  if (!foldable) {
    THError("Can only downcast contiguous tensors");
  }
#endif

  IndexT foldedSize[NewDim];
  IndexT foldedStride[NewDim];

  const int dropped = Dim - NewDim;

  // Accumulate the extent of the dimensions being collapsed.
  IndexT outerExtent = 1;
  for (int d = 0; d < dropped; ++d) {
    outerExtent *= getSize(d);
  }

  // The first surviving dimension absorbs the collapsed extent.
  foldedSize[0] = outerExtent * getSize(dropped);
  foldedStride[0] = getStride(dropped);

  // Every later dimension is carried over unchanged.
  for (int k = 1; k < NewDim; ++k) {
    foldedSize[k] = getSize(dropped + k);
    foldedStride[k] = getStride(dropped + k);
  }

  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
    data_, foldedSize, foldedStride);
}
/// Returns a `NewDim`-rank view (`NewDim < Dim`) of the same data in which
/// the trailing dimensions `[NewDim, Dim)` are folded into dimension
/// `NewDim - 1`. The folded range must pass `isContiguousRange`; otherwise
/// device code asserts and host code raises `THError`.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int NewDim>
__host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::downcastInner() {
  // Can only create tensors of lesser dimension
  thc_static_assert(NewDim < Dim);

  // Folding dimensions that contain padding would expose garbage
  // elements, so the collapsed range has to be free of padding.
  bool foldable = isContiguousRange(NewDim, Dim);
#ifdef __CUDA_ARCH__
  // Device code
  assert(foldable);
#else
  // Host code
  if (!foldable) {
    THError("Can only downcast contiguous tensors");
  }
#endif

  IndexT foldedSize[NewDim];
  IndexT foldedStride[NewDim];

  // Accumulate the extent of the trailing dimensions, innermost first.
  IndexT innerExtent = 1;
  for (int d = Dim - 1; d >= NewDim; --d) {
    innerExtent *= getSize(d);
  }

  // The last surviving dimension absorbs the collapsed extent and takes
  // over the stride of the original innermost dimension.
  foldedSize[NewDim - 1] = innerExtent * getSize(NewDim - 1);
  foldedStride[NewDim - 1] = getStride(Dim - 1);

  // Every earlier dimension is carried over unchanged.
  for (int d = NewDim - 2; d >= 0; --d) {
    foldedSize[d] = getSize(d);
    foldedStride[d] = getStride(d);
  }

  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>(
    data_, foldedSize, foldedStride);
}
/// Returns a `SubDim`-rank tensor that starts at pointer `at` and reuses
/// the sizes and strides of this tensor's innermost `SubDim` dimensions.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view(DataPtrType at) {
  thc_static_assert(SubDim >= 1 && SubDim < Dim);

  // Number of outer dimensions left out of the view.
  const int skipped = Dim - SubDim;

  IndexT tailSizes[SubDim];
  IndexT tailStrides[SubDim];

  for (int k = 0; k < SubDim; ++k) {
    tailSizes[k] = size_[skipped + k];
    tailStrides[k] = stride_[skipped + k];
  }

  return THCDeviceTensor<T, SubDim, IndexT, PtrTraits>(
    at, tailSizes, tailStrides);
}
/// Returns the `SubDim`-rank view that starts at the beginning of this
/// tensor's data; forwards to `view<SubDim>(DataPtrType)`.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
template <int SubDim>
__host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::view() {
  DataPtrType start = data_;
  return view<SubDim>(start);
}
/// Enqueues an asynchronous byte-wise zero fill of the tensor's data on
/// `stream`, covering `numElements() * sizeof(T)` bytes.
///
/// The tensor must be contiguous (see `isContiguous`); otherwise `THError`
/// is raised. A failure reported by `cudaMemsetAsync` is also raised
/// through `THError` instead of being silently dropped.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
void
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::zero(cudaStream_t stream) {
#ifdef __CUDA_ARCH__
  assert(isContiguous());
#else
  if (!isContiguous()) {
    THError("fillAsync only works on contiguous data");
  }
#endif

  // The memset only enqueues work, but it can still report an error
  // (bad pointer/stream, or an earlier sticky error); surface it here
  // rather than letting a later, unrelated call fail mysteriously.
  cudaError_t status =
    cudaMemsetAsync(data(), 0, numElements() * sizeof(T), stream);
  if (status != cudaSuccess) {
    THError("cudaMemsetAsync failed: %s", cudaGetErrorString(status));
  }
}
#ifndef THC_DEVICE_TENSOR_INC
#define THC_DEVICE_TENSOR_INC
#include <cuda.h>
#include <cuda_runtime.h>
// Compile-time assertion that also works on CUDA 6.5 (drop it once CUDA 7.0
// is the minimum). Only the `true` specialization is ever defined, so
// constructing a temporary of `THCStaticAssert<false>` fails to compile.
template <bool Condition>
struct THCStaticAssert;

template <>
struct THCStaticAssert<true> {};

#define thc_static_assert(expr) (THCStaticAssert<(expr) != 0>())
/// Our tensor type
template <typename T,
int Dim,
typename IndexT,
template <typename U> class PtrTraits>
class THCDeviceTensor;
/// Type of a subspace of a tensor
namespace detail {
template <typename TensorType,
int SubDim,
template <typename U> class PtrTraits>
class THCDeviceSubTensor;
}
/// Pointer traits that expose `T*` qualified with `__restrict__`.
template <typename T>
struct RestrictPtrTraits {
  typedef T* __restrict__ PtrType;
};
/// Pointer traits that expose a plain, unqualified `T*`.
template <typename T>
struct DefaultPtrTraits {
  typedef T* PtrType;
};
/**
   Templated multi-dimensional array that supports strided access of
   elements. Main access is through `operator[]`; e.g.,
   `tensor[x][y][z]`.

   - `T` is the contained type (e.g., `float`)
   - `Dim` is the tensor rank
   - `IndexT` is the integer type used for size/stride arrays, and for
     all indexing math. Default is `int`, but for large tensors, `int64_t`
     can be used instead.
   - `PtrTraits` are traits applied to our data pointer (T*). By default,
     this is just T*, but RestrictPtrTraits can be used to apply T*
     __restrict__ for alias-free analysis.
*/
template <typename T,
          int Dim,
          typename IndexT = int,
          template <typename U> class PtrTraits = DefaultPtrTraits>
class THCDeviceTensor {
 public:
  enum { NumDim = Dim };
  typedef T DataType;
  typedef IndexT IndexType;
  typedef typename PtrTraits<T>::PtrType DataPtrType;
  typedef THCDeviceTensor<T, Dim, IndexT, PtrTraits> TensorType;

  /// Default constructor
  __host__ __device__ THCDeviceTensor();

  /// Constructor that calculates strides with no padding
  __host__ __device__ THCDeviceTensor(DataPtrType data,
#ifdef _MSC_VER
                                      const IndexT (&sizes)[Dim]);
#else
                                      const IndexT sizes[Dim]);
#endif

  /// Constructor that takes arbitrary size/stride arrays
  __host__ __device__ THCDeviceTensor(DataPtrType data,
#ifdef _MSC_VER
                                      const IndexT (&sizes)[Dim],
                                      const IndexT (&strides)[Dim]);
#else
                                      const IndexT sizes[Dim],
                                      const IndexT strides[Dim]);
#endif

  /// Returns true if the two tensors are of the same dimensionality,
  /// size and stride.
  template <int OtherDim>
  __host__ __device__ bool
  isSameSizeAndStride(
    const THCDeviceTensor<T, OtherDim, IndexT, PtrTraits>& rhs) const;

  /// Cast to a tensor of a different type of the same size and stride
  template <typename U>
  __host__ __device__ THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast();

  /// Const version of `cast`
  template <typename U>
  __host__ __device__
  const THCDeviceTensor<U, Dim, IndexT, PtrTraits> cast() const;

  /// Returns a raw pointer to the start of our data.
  __host__ __device__ __forceinline__ DataPtrType data() {
    return data_;
  }

  /// Returns a raw pointer to the start of our data (const).
  __host__ __device__ __forceinline__
  const DataPtrType data() const {
    return data_;
  }

  /// Cast to a different datatype
  template <typename U>
  __host__ __device__ __forceinline__
  typename PtrTraits<U>::PtrType dataAs() {
    return reinterpret_cast<typename PtrTraits<U>::PtrType>(data_);
  }

  /// Cast to a different datatype
  template <typename U>
  __host__ __device__ __forceinline__
  const typename PtrTraits<const U>::PtrType dataAs() const {
    return reinterpret_cast<typename PtrTraits<const U>::PtrType>(data_);
  }

  /// Returns a read/write view of a portion of our tensor.
  __host__ __device__ __forceinline__
  detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
    operator[](IndexT);

  /// Returns a read/write view of a portion of our tensor (const).
  __host__ __device__ __forceinline__
  const detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits>
    operator[](IndexT) const;

  /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds
  /// checking. Returned as `IndexT` (the storage type) so that sizes are
  /// not truncated when a 64-bit index type is used.
  __host__ __device__ __forceinline__ IndexT getSize(int i) const {
    return size_[i];
  }

  /// Returns the stride of a given dimension, `[0, Dim - 1]`. No bounds
  /// checking. Returned as `IndexT` (the storage type) so that strides are
  /// not truncated when a 64-bit index type is used.
  __host__ __device__ __forceinline__ IndexT getStride(int i) const {
    return stride_[i];
  }

  /// Returns the total number of elements contained within our data
  /// (product of `getSize(i)`)
  __host__ __device__ ptrdiff_t numElements() const;

  /// Returns the size array.
  __host__ __device__ __forceinline__ const IndexT* sizes() const {
    return size_;
  }

  /// Returns the stride array.
  __host__ __device__ __forceinline__ const IndexT* strides() const {
    return stride_;
  }

  /// Returns true if there is no padding within the tensor and no
  /// re-ordering of the dimensions.
  /// ~~~
  /// (stride(i) == size(i + 1) * stride(i + 1)) && stride(dim - 1) == 1
  /// ~~~
  /// (dimensions of size 1 are exempt from the stride requirement)
  __host__ __device__ bool isContiguous() const;

  /// Returns whether a given dimension has only increasing stride
  /// from the previous dimension. A tensor that was permuted by
  /// exchanging size and stride only will fail this check.
  /// If `i == 0` just check `size > 0`. Returns `false` if `stride` is `<= 0`.
  __host__ __device__ bool isConsistentlySized(int i) const;

  // Returns whether every dimension passes `isConsistentlySized(i)`.
  // If this is not the case then iterating once over the size space will
  // touch the same memory locations multiple times.
  __host__ __device__ bool isConsistentlySized() const;

  /// Returns true if the given dimension range [first, last) has no padding.
  __host__ __device__ bool isContiguousRange(int first, int last) const;

  /// Returns a tensor of the same dimension after transposing the two
  /// dimensions given. Does not actually move elements; transposition
  /// is made by permuting the size/stride arrays.
  /// If the dimensions are not valid, asserts.
  __host__ __device__ THCDeviceTensor<T, Dim, IndexT, PtrTraits>
  transpose(int dim1, int dim2) const;

  /// Upcast a tensor of dimension `D` to some tensor of dimension
  /// D' > D by padding the leading dimensions by 1
  /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[1][1][2][3]`
  template <int NewDim>
  __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
  upcastOuter();

  /// Upcast a tensor of dimension `D` to some tensor of dimension
  /// D' > D by padding the lowest/most varying dimensions by 1
  /// e.g., upcasting a 2-d tensor `[2][3]` to a 4-d tensor `[2][3][1][1]`
  template <int NewDim>
  __host__ __device__ THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
  upcastInner();

  /// Downcast a tensor of dimension `D` to some tensor of dimension
  /// D' < D by collapsing the leading dimensions. asserts if there is
  /// padding on the leading dimensions.
  template <int NewDim>
  __host__ __device__
  THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastOuter();

  /// Downcast a tensor of dimension `D` to some tensor of dimension
  /// D' < D by collapsing the trailing (innermost) dimensions. asserts if
  /// there is padding on the trailing dimensions.
  template <int NewDim>
  __host__ __device__
  THCDeviceTensor<T, NewDim, IndexT, PtrTraits> downcastInner();

  /// Returns a tensor that is a view of the `SubDim`-dimensional slice
  /// of this tensor, starting at `at`.
  template <int SubDim>
  __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
  view(DataPtrType at);

  /// Returns a tensor that is a view of the `SubDim`-dimensional slice
  /// of this tensor, starting where our data begins
  template <int SubDim>
  __host__ __device__ THCDeviceTensor<T, SubDim, IndexT, PtrTraits>
  view();

  /// Zeroes out the tensor asynchronously. Asserts if the contents
  /// in question are not contiguous.
  void zero(cudaStream_t stream = 0);

 private:
  /// Raw pointer to where the tensor data begins
  DataPtrType data_;

  /// Array of strides (in sizeof(T) terms) per each dimension
  IndexT stride_[Dim];

  /// Size per each dimension
  IndexT size_[Dim];
};
namespace detail {
/// Zero-dimensional slice: refers to exactly one element of the parent
/// tensor. Instances are created only by the rank-1 sub-tensor or by a
/// rank-1 `THCDeviceTensor` (both are friends).
template <typename TensorType, template <typename U> class PtrTraits>
class THCDeviceSubTensor<TensorType, 0, PtrTraits> {
 public:
  /// Writes `val` to the referenced element; returns a copy of this view.
  __host__ __device__ THCDeviceSubTensor<TensorType, 0, PtrTraits>
  operator=(typename TensorType::DataType val) {
    *data_ = val;
    return *this;
  }

  /// Implicit conversion to a mutable reference to the element.
  __host__ __device__ operator typename TensorType::DataType&() {
    return *data_;
  }

  /// Implicit conversion to a const reference to the element.
  __host__ __device__ operator const typename TensorType::DataType&() const {
    return *data_;
  }

  /// Address-of yields the element's raw pointer.
  __host__ __device__ typename TensorType::DataType* operator&() {
    return data_;
  }

  /// Address-of yields the element's raw pointer (const).
  __host__ __device__ const typename TensorType::DataType* operator&() const {
    return data_;
  }

  /// Raw pointer to the referenced element.
  __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
    return data_;
  }

  /// Raw pointer to the referenced element (const).
  __host__ __device__ __forceinline__
  const typename TensorType::DataPtrType data() const {
    return data_;
  }

  /// Reinterprets the element as a `T` and returns a reference to it.
  template <typename T>
  __host__ __device__ T& as() {
    return *(this->template dataAs<T>());
  }

  /// Reinterprets the element as a `const T` and returns a reference to it.
  template <typename T>
  __host__ __device__ const T& as() const {
    return *(this->template dataAs<T>());
  }

  /// Returns the element pointer reinterpreted as `PtrTraits<T>::PtrType`.
  template <typename T>
  __host__ __device__ __forceinline__
  typename PtrTraits<T>::PtrType dataAs() {
    return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
  }

  /// Returns the element pointer reinterpreted as
  /// `PtrTraits<const T>::PtrType`.
  template <typename T>
  __host__ __device__ __forceinline__
  typename PtrTraits<const T>::PtrType dataAs() const {
    return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
  }

  /// Loads the element through `__ldg` when compiling for
  /// `__CUDA_ARCH__ >= 350`; otherwise performs a plain dereference.
  __device__ __forceinline__ typename TensorType::DataType ldg() const {
#if !(__CUDA_ARCH__ >= 350)
    return *data_;
#else
    return __ldg(data_);
#endif
  }

  /// Same as `ldg`, but with the element reinterpreted as a `T`.
  template <typename T>
  __device__ __forceinline__ T ldgAs() const {
#if !(__CUDA_ARCH__ >= 350)
    return as<T>();
#else
    return __ldg(dataAs<T>());
#endif
  }

 private:
  /// The rank-1 sub-tensor may construct us
  friend class THCDeviceSubTensor<TensorType, 1, PtrTraits>;

  /// A rank-1 parent tensor may construct us
  friend class THCDeviceTensor<typename TensorType::DataType,
                               1,
                               typename TensorType::IndexType,
                               PtrTraits>;

  /// Binds the view to parent tensor `t` and element pointer `data`.
  __host__ __device__ __forceinline__ THCDeviceSubTensor(
    TensorType& t,
    typename TensorType::DataPtrType data)
      : tensor_(t),
        data_(data) {
  }

  /// The tensor we're referencing
  TensorType& tensor_;

  /// Where our value is located
  typename TensorType::DataPtrType const data_;
};
/// A `SubDim`-rank slice of a parent THCDeviceTensor. Indexing with
/// `operator[]` peels off one dimension at a time. Instances are created
/// only by the sub-tensor of rank `SubDim + 1` or by the parent tensor
/// type (both are friends).
template <typename TensorType,
          int SubDim,
          template <typename U> class PtrTraits>
class THCDeviceSubTensor {
 public:
  /// Returns the `SubDim - 1` rank slice at `index`. The offset is `index`
  /// times the parent's stride for dimension `NumDim - SubDim`.
  __host__ __device__ __forceinline__
  THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
    operator[](typename TensorType::IndexType index) {
    typename TensorType::DataPtrType sliceStart =
      data_ + index * tensor_.getStride(TensorType::NumDim - SubDim);
    return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
      tensor_, sliceStart);
  }

  /// Const overload of `operator[]`; same offset computation.
  __host__ __device__ __forceinline__
  const THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>
    operator[](typename TensorType::IndexType index) const {
    typename TensorType::DataPtrType sliceStart =
      data_ + index * tensor_.getStride(TensorType::NumDim - SubDim);
    return THCDeviceSubTensor<TensorType, SubDim - 1, PtrTraits>(
      tensor_, sliceStart);
  }

  /// Address-of yields the raw pointer to the start of the slice.
  __host__ __device__ typename TensorType::DataType* operator&() {
    return data_;
  }

  /// Address-of yields the raw pointer to the start of the slice (const).
  __host__ __device__ const typename TensorType::DataType* operator&() const {
    return data_;
  }

  /// Raw pointer to the start of the slice.
  __host__ __device__ __forceinline__ typename TensorType::DataPtrType data() {
    return data_;
  }

  /// Raw pointer to the start of the slice (const).
  __host__ __device__ __forceinline__
  const typename TensorType::DataPtrType data() const {
    return data_;
  }

  /// Reinterprets the first element of the slice as a `T`.
  template <typename T>
  __host__ __device__ T& as() {
    return *(this->template dataAs<T>());
  }

  /// Reinterprets the first element of the slice as a `const T`.
  template <typename T>
  __host__ __device__ const T& as() const {
    return *(this->template dataAs<T>());
  }

  /// Returns the slice pointer reinterpreted as `PtrTraits<T>::PtrType`.
  template <typename T>
  __host__ __device__ __forceinline__
  typename PtrTraits<T>::PtrType dataAs() {
    return reinterpret_cast<typename PtrTraits<T>::PtrType>(data_);
  }

  /// Returns the slice pointer reinterpreted as
  /// `PtrTraits<const T>::PtrType`.
  template <typename T>
  __host__ __device__ __forceinline__
  typename PtrTraits<const T>::PtrType dataAs() const {
    return reinterpret_cast<typename PtrTraits<const T>::PtrType>(data_);
  }

  /// Loads the first element through `__ldg` when compiling for
  /// `__CUDA_ARCH__ >= 350`; otherwise performs a plain dereference.
  __device__ __forceinline__ typename TensorType::DataType ldg() const {
#if !(__CUDA_ARCH__ >= 350)
    return *data_;
#else
    return __ldg(data_);
#endif
  }

  /// Same as `ldg`, but with the element reinterpreted as a `T`.
  template <typename T>
  __device__ __forceinline__ T ldgAs() const {
#if !(__CUDA_ARCH__ >= 350)
    return as<T>();
#else
    return __ldg(dataAs<T>());
#endif
  }

  /// Returns a `SubDim`-rank tensor viewing this slice, starting where our
  /// data begins (delegates to the parent's `view<SubDim>`).
  /// NOTE(review): unlike the other members this one carries no
  /// `__host__ __device__` qualifier -- confirm that is intended.
  THCDeviceTensor<typename TensorType::DataType,
                  SubDim,
                  typename TensorType::IndexType,
                  PtrTraits> view() {
    return tensor_.template view<SubDim>(data_);
  }

 private:
  /// The sub-tensor one rank above may construct us
  friend class THCDeviceSubTensor<TensorType, SubDim + 1, PtrTraits>;

  /// The parent tensor type may construct us
  friend class
  THCDeviceTensor<typename TensorType::DataType,
                  TensorType::NumDim,
                  typename TensorType::IndexType,
                  PtrTraits>;

  /// Binds the slice to parent tensor `t` and start pointer `data`.
  __host__ __device__ __forceinline__ THCDeviceSubTensor(
    TensorType& t,
    typename TensorType::DataPtrType data)
      : tensor_(t),
        data_(data) {
  }

  /// The tensor we're referencing
  TensorType& tensor_;

  /// The start of our sub-region
  typename TensorType::DataPtrType const data_;
};
} // namespace detail
/// Returns the `Dim - 1` rank slice at `index` along the outermost
/// dimension, obtained by indexing a full-rank sub-tensor over this tensor.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __forceinline__
detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
                           Dim - 1, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) {
  typedef detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits> WholeView;
  typedef detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits> SliceView;

  // Wrap the complete tensor, then peel off the outermost dimension.
  WholeView whole(*this, data_);
  return SliceView(whole[index]);
}
/// Const overload of `operator[]`: returns the `Dim - 1` rank slice at
/// `index`. Constness is cast away internally because the sub-tensor holds
/// a non-const reference to its parent.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
__host__ __device__ __forceinline__
const detail::THCDeviceSubTensor<THCDeviceTensor<T, Dim, IndexT, PtrTraits>,
                                 Dim - 1, PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>::operator[](IndexT index) const {
  typedef detail::THCDeviceSubTensor<TensorType, Dim, PtrTraits> WholeView;
  typedef detail::THCDeviceSubTensor<TensorType, Dim - 1, PtrTraits> SliceView;

  // Wrap the complete tensor, then peel off the outermost dimension.
  WholeView whole(const_cast<TensorType&>(*this), data_);
  return SliceView(whole[index]);
}
#include "THCDeviceTensor-inl.cuh"
#endif // THC_DEVICE_TENSOR_INC
namespace detail {
// Extra dispatch layer keyed on the boolean `B`, so that the static
// assertion inside `make` is only compiled for the `true` case.

/// Common base: declares (but does not define) `make`.
template <typename T, int Dim, typename IndexT,
          template <typename U> class PtrTraits,
          int NewDim, bool B>
struct UpcastTHCRoot {
  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
  make(THCState* state, THCudaTensor* t);
};

/// Primary template; adds nothing to the root.
template <typename T, int Dim, typename IndexT,
          template <typename U> class PtrTraits,
          int NewDim, bool B>
struct UpcastTHC :
      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
};

// `false` case: never instantiated, exists for dispatch purposes only.
template <typename T, int Dim, typename IndexT,
          template <typename U> class PtrTraits,
          int NewDim>
struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
};

/// `true` case: converts `t` to a `Dim`-rank device tensor and upcasts it
/// to `NewDim` by adding leading dimensions (`upcastOuter`).
template <typename T, int Dim, typename IndexT,
          template <typename U> class PtrTraits,
          int NewDim>
struct UpcastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
      UpcastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true> {
  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
  make(THCState* state, THCudaTensor* t) {
    thc_static_assert(NewDim > Dim);

    THCDeviceTensor<T, Dim, IndexT, PtrTraits> source =
      toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t);
    return source.template upcastOuter<NewDim>();
  }
};
// Extra dispatch layer keyed on the boolean `B`, so that the static
// assertion inside `make` is only compiled for the `true` case.

/// Common base: declares (but does not define) `make`.
template <typename T, int Dim, typename IndexT,
          template <typename U> class PtrTraits,
          int NewDim, bool B>
struct DowncastTHCRoot {
  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
  make(THCState* state, THCudaTensor* t);
};

/// Primary template; adds nothing to the root.
template <typename T, int Dim, typename IndexT,
          template <typename U> class PtrTraits,
          int NewDim, bool B>
struct DowncastTHC :
      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, B> {
};

// `false` case: never instantiated, exists for dispatch purposes only.
template <typename T, int Dim, typename IndexT,
          template <typename U> class PtrTraits,
          int NewDim>
struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, false> :
      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, false> {
};

/// `true` case: converts `t` to a `Dim`-rank device tensor and downcasts
/// it to `NewDim` by collapsing leading dimensions (`downcastOuter`).
template <typename T, int Dim, typename IndexT,
          template <typename U> class PtrTraits,
          int NewDim>
struct DowncastTHC<T, Dim, IndexT, PtrTraits, NewDim, true> :
      DowncastTHCRoot<T, Dim, IndexT, PtrTraits, NewDim, true> {
  static THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
  make(THCState* state, THCudaTensor* t) {
    thc_static_assert(NewDim < Dim);

    THCDeviceTensor<T, Dim, IndexT, PtrTraits> source =
      toDeviceTensor<T, Dim, IndexT, PtrTraits>(state, t);
    return source.template downcastOuter<NewDim>();
  }
};
} // namespace detail
// Expands to one `case i:` of the switch in toDeviceTensorCast. Depending
// on how the runtime rank `i` compares with the requested `NewDim`, the
// tensor is upcast, converted as-is, or downcast. Every branch returns, so
// no `break` is needed.
#define SWITCH_UNROLL_CUDA_CAST_FACTORY(i)                              \
  case i:                                                               \
  if (NewDim > i) {                                                     \
    return detail::UpcastTHC<T, i, IndexT,                              \
                             PtrTraits, NewDim, (NewDim > i)>::         \
      make(state, t);                                                   \
  } else if (NewDim == i) {                                             \
    return toDeviceTensor<T, NewDim, IndexT, PtrTraits>(state, t);      \
  } else {                                                              \
    return detail::DowncastTHC<T, i, IndexT,                            \
                               PtrTraits, NewDim, (NewDim < i)>::       \
      make(state, t);                                                   \
  }                                                                     \
  /* break; */

/// Builds a `NewDim`-rank device tensor from `t`, whatever rank `t` has at
/// run time (ranks 1 through 10 are handled): lower ranks are upcast with
/// leading dimensions, higher ranks are downcast by collapsing leading
/// dimensions, equal rank is converted directly.
///
/// Any other rank is reported through `THError`.
template <typename T, int NewDim,
          typename IndexT, template <typename U> class PtrTraits>
THCDeviceTensor<T, NewDim, IndexT, PtrTraits>
toDeviceTensorCast(THCState* state, THCudaTensor* t) {
  switch (THCudaTensor_nDimension(state, t)) {
    SWITCH_UNROLL_CUDA_CAST_FACTORY(1);
    SWITCH_UNROLL_CUDA_CAST_FACTORY(2);
    SWITCH_UNROLL_CUDA_CAST_FACTORY(3);
    SWITCH_UNROLL_CUDA_CAST_FACTORY(4);
    SWITCH_UNROLL_CUDA_CAST_FACTORY(5);
    SWITCH_UNROLL_CUDA_CAST_FACTORY(6);
    SWITCH_UNROLL_CUDA_CAST_FACTORY(7);
    SWITCH_UNROLL_CUDA_CAST_FACTORY(8);
    SWITCH_UNROLL_CUDA_CAST_FACTORY(9);
    SWITCH_UNROLL_CUDA_CAST_FACTORY(10);
  default:
    ;
  }

  // Not implemented
  THError("THCDeviceTensor dimension size not supported");

  // Only here to satisfy the compiler's return-path analysis. `NULL` is
  // not convertible to a THCDeviceTensor (there is no single-argument
  // constructor), so return a default-constructed tensor instead; this
  // keeps the template instantiable.
  return THCDeviceTensor<T, NewDim, IndexT, PtrTraits>();
}

#undef SWITCH_UNROLL_CUDA_CAST_FACTORY
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/THCDeviceTensorUtils.cu"
#else
/// Constructs a THCDeviceTensor initialized from a THCudaTensor. Will
/// error if the dimensionality does not match exactly.
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>
toDeviceTensor(THCState* state, THCTensor* t);
/// Convenience overload: same as the four-parameter `toDeviceTensor`, with
/// `DefaultPtrTraits` as the pointer traits.
template <typename T, int Dim, typename IndexT>
THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
toDeviceTensor(THCState* state, THCTensor* t) {
  THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits> converted =
    toDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>(state, t);
  return converted;
}
/// Convenience overload: same as the four-parameter `toDeviceTensor`, with
/// `int` indexing and `DefaultPtrTraits`.
template <typename T, int Dim>
THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
toDeviceTensor(THCState* state, THCTensor* t) {
  THCDeviceTensor<T, Dim, int, DefaultPtrTraits> converted =
    toDeviceTensor<T, Dim, int, DefaultPtrTraits>(state, t);
  return converted;
}
/// Builds a `Dim`-rank THCDeviceTensor over the storage of `t`, copying
/// its sizes and strides (converted to `IndexT`).
///
/// Raises `THError` when the rank of `t` is not `Dim`, or when the largest
/// reachable element offset exceeds `std::numeric_limits<IndexT>::max()`.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>
toDeviceTensor(THCState* state, THCTensor* t) {
  if (THCTensor_(nDimension)(state, t) != Dim) {
    THError("THCudaTensor dimension mismatch");
  }

  IndexT dimSizes[Dim];
  IndexT dimStrides[Dim];

  // Largest element offset reachable in the tensor; it must be
  // representable in `IndexT` for the indexing math to be valid.
  ptrdiff_t largestOffset = 0;

  int d = 0;
  while (d < Dim) {
    const int64_t extent = THCTensor_(size)(state, t, d);
    const int64_t step = THCTensor_(stride)(state, t, d);

    largestOffset += (extent - 1) * step;

    dimSizes[d] = (IndexT) extent;
    dimStrides[d] = (IndexT) step;
    ++d;
  }

  if (largestOffset > std::numeric_limits<IndexT>::max()) {
    THError("THCudaTensor sizes too large for THCDeviceTensor conversion");
  }

  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(
    THCTensor_(data)(state, t), dimSizes, dimStrides);
}
#endif
#ifndef THC_DEVICE_TENSOR_UTILS_INC
#define THC_DEVICE_TENSOR_UTILS_INC
#include "THCDeviceTensor.cuh"
#include "THCTensor.h"
#include <limits>
/// Constructs a DeviceTensor initialized from a THCudaTensor by
/// upcasting or downcasting the tensor to that of a different
/// dimension.
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>
toDeviceTensorCast(THCState* state, THCudaTensor* t);
/// Convenience overload: fixes the pointer traits to `DefaultPtrTraits` and
/// forwards to the fully-parameterized `toDeviceTensorCast`.
template <typename T, int Dim, typename IndexT>
THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
toDeviceTensorCast(THCState* state, THCudaTensor* t) {
  typedef THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits> ResultTensor;
  ResultTensor converted =
      toDeviceTensorCast<T, Dim, IndexT, DefaultPtrTraits>(state, t);
  return converted;
}
/// Convenience overload: uses `int` indexing and `DefaultPtrTraits`, and
/// forwards to the fully-parameterized `toDeviceTensorCast`.
template <typename T, int Dim>
THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
toDeviceTensorCast(THCState* state, THCudaTensor* t) {
  typedef THCDeviceTensor<T, Dim, int, DefaultPtrTraits> ResultTensor;
  ResultTensor converted =
      toDeviceTensorCast<T, Dim, int, DefaultPtrTraits>(state, t);
  return converted;
}
#include "generic/THCDeviceTensorUtils.cu"
#include "THCGenerateAllTypes.h"
#include "THCDeviceTensorUtils-inl.cuh"
#endif // THC_DEVICE_TENSOR_UTILS_INC
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/THCDeviceTensorUtils.cu"
#else
/// Constructs a THCDeviceTensor initialized from a THCudaTensor. Will
/// error if the dimensionality does not match exactly.
template <typename T, int Dim,
typename IndexT, template <typename U> class PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>
toDeviceTensor(THCState* state, THCTensor* t);
/// Convenience overload: fixes the pointer traits to `DefaultPtrTraits` and
/// forwards to the fully-parameterized `toDeviceTensor`.
template <typename T, int Dim, typename IndexT>
THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>
toDeviceTensor(THCState* state, THCTensor* t) {
  typedef THCDeviceTensor<T, Dim, IndexT, DefaultPtrTraits> ResultTensor;
  ResultTensor converted =
      toDeviceTensor<T, Dim, IndexT, DefaultPtrTraits>(state, t);
  return converted;
}
/// Convenience overload: uses `int` indexing and `DefaultPtrTraits`, and
/// forwards to the fully-parameterized `toDeviceTensor`.
template <typename T, int Dim>
THCDeviceTensor<T, Dim, int, DefaultPtrTraits>
toDeviceTensor(THCState* state, THCTensor* t) {
  typedef THCDeviceTensor<T, Dim, int, DefaultPtrTraits> ResultTensor;
  ResultTensor converted =
      toDeviceTensor<T, Dim, int, DefaultPtrTraits>(state, t);
  return converted;
}
/// Wraps the storage of `t` in a THCDeviceTensor with `Dim` dimensions.
///
/// Raises a TH error when the tensor's dimensionality differs from `Dim`,
/// when any single extent does not fit in `IndexT`, or when the largest
/// reachable element offset does not fit in `IndexT`.
template <typename T, int Dim,
          typename IndexT, template <typename U> class PtrTraits>
THCDeviceTensor<T, Dim, IndexT, PtrTraits>
toDeviceTensor(THCState* state, THCTensor* t) {
  if (Dim != THCTensor_(nDimension)(state, t)) {
    THError("THCudaTensor dimension mismatch");
  }

  // Determine the maximum offset into the tensor achievable; it must be
  // representable by `IndexT` for the conversion to be valid.
  ptrdiff_t maxOffset = 0;
  IndexT sizes[Dim];
  IndexT strides[Dim];

  for (int i = 0; i < Dim; ++i) {
    int64_t size = THCTensor_(size)(state, t, i);
    int64_t stride = THCTensor_(stride)(state, t, i);

    // The offset check below does not bound the extent itself: a dimension
    // with stride 0 adds nothing to maxOffset regardless of its size, so
    // reject extents that the cast below would silently truncate.
    if (size > std::numeric_limits<IndexT>::max()) {
      THError("THCudaTensor sizes too large for THCDeviceTensor conversion");
    }

    maxOffset += (size - 1) * stride;

    sizes[i] = (IndexT) size;
    strides[i] = (IndexT) stride;
  }

  if (maxOffset > std::numeric_limits<IndexT>::max()) {
    THError("THCudaTensor sizes too large for THCDeviceTensor conversion");
  }

  return THCDeviceTensor<T, Dim, IndexT, PtrTraits>(
    THCTensor_(data)(state, t), sizes, strides);
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#include "thc_encoding.h"
#include "common.h"
#include "generic/device_tensor.h"
#include "THC/THCGenerateFloatType.h"
#include "generic/device_tensor.h"
#include "THC/THCGenerateDoubleType.h"
#ifdef __cplusplus
extern "C" {
#endif
// float
#include "generic/encoding_utils.c"
#include "THC/THCGenerateFloatType.h"
#include "generic/encoding_kernel.c"
#include "THC/THCGenerateFloatType.h"
#include "generic/syncbn_kernel.c"
#include "THC/THCGenerateFloatType.h"
#include "generic/pooling_kernel.c"
#include "THC/THCGenerateFloatType.h"
// double
#include "generic/encoding_utils.c"
#include "THC/THCGenerateDoubleType.h"
#include "generic/encoding_kernel.c"
#include "THC/THCGenerateDoubleType.h"
#include "generic/syncbn_kernel.c"
#include "THC/THCGenerateDoubleType.h"
#include "generic/pooling_kernel.c"
#include "THC/THCGenerateDoubleType.h"
#ifdef __cplusplus
}
#endif
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
* Created by: Hang Zhang
* ECE Department, Rutgers University
* Email: zhang.hang@rutgers.edu
* Copyright (c) 2017
*
* This source code is licensed under the MIT-style license found in the
* LICENSE file in the root directory of this source tree
*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
*/
#include <THC.h>
#include "THCDeviceTensor.cuh"
#include "THCDeviceTensorUtils.cuh"
// this symbol will be resolved automatically from PyTorch libs
extern THCState *state;
#define Encoding_(NAME) TH_CONCAT_4(Encoding_, Real, _, NAME)
#define THCTensor TH_CONCAT_3(TH,CReal,Tensor)
#define THCTensor_(NAME) TH_CONCAT_4(TH,CReal,Tensor_,NAME)
#ifdef __cplusplus
extern "C" {
#endif
// float
#include "generic/encoding_kernel.h"
#include "THC/THCGenerateFloatType.h"
#include "generic/syncbn_kernel.h"
#include "THC/THCGenerateFloatType.h"
#include "generic/pooling_kernel.h"
#include "THC/THCGenerateFloatType.h"
// double
#include "generic/encoding_kernel.h"
#include "THC/THCGenerateDoubleType.h"
#include "generic/syncbn_kernel.h"
#include "THC/THCGenerateDoubleType.h"
#include "generic/pooling_kernel.h"
#include "THC/THCGenerateDoubleType.h"
#ifdef __cplusplus
}
#endif
#!/usr/bin/env bash
# Configure and build the native library inside encoding/lib.
# Stop at the first failing command so cmake/make never run in the wrong
# directory after a failed mkdir or cd.
set -e
mkdir -p encoding/lib
cd encoding/lib
# compile and install
cmake ..
make
from .model_zoo import get_model
from .base import *
from .fcn import *
from .encnet import *
def get_segmentation_model(name, **kwargs):
    """Build a segmentation model selected by ``name``.

    ``name`` is matched case-insensitively against the registered factories
    ('fcn', 'encnet'); all keyword arguments are forwarded to the chosen
    factory. An unregistered name raises ``KeyError``.
    """
    from .fcn import get_fcn
    factories = dict(fcn=get_fcn, encnet=get_encnet)
    build = factories[name.lower()]
    return build(**kwargs)
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.functional import upsample
from torch.nn.parallel.data_parallel import DataParallel
from torch.nn.parallel.parallel_apply import parallel_apply
from torch.nn.parallel.scatter_gather import scatter
from .. import dilated as resnet
from ..utils import batch_pix_accuracy, batch_intersection_union
up_kwargs = {'mode': 'bilinear', 'align_corners': True}
__all__ = ['BaseNet', 'EvalModule', 'MultiEvalModule']
class BaseNet(nn.Module):
    """Common base for the segmentation networks in this package.

    Holds a pretrained dilated ResNet backbone (``self.pretrained``), the
    normalization statistics used for padding at evaluation time, and the
    bilinear upsampling options shared by the heads.

    Parameters
    ----------
    nclass : int
        Number of classes, used by :meth:`evaluate`.
    backbone : str
        One of 'resnet50', 'resnet101', 'resnet152'; anything else raises
        ``RuntimeError``.
    aux, se_loss :
        Stored as attributes for use by subclasses.
    dilated, norm_layer :
        Forwarded to the backbone constructor.
    mean, std : sequence of float
        Per-channel normalization statistics, stored as lists.
    """
    def __init__(self, nclass, backbone, aux, se_loss, dilated=True, norm_layer=None,
                 mean=[.485, .456, .406], std=[.229, .224, .225]):
        super(BaseNet, self).__init__()
        self.nclass = nclass
        self.aux = aux
        self.se_loss = se_loss
        # Copy the sequences: the defaults are module-level list objects, and
        # storing them directly would let one instance's in-place edit leak
        # into every other instance.
        self.mean = list(mean)
        self.std = list(std)
        # copying modules from pretrained models
        if backbone == 'resnet50':
            self.pretrained = resnet.resnet50(pretrained=True, dilated=dilated, norm_layer=norm_layer)
        elif backbone == 'resnet101':
            self.pretrained = resnet.resnet101(pretrained=True, dilated=dilated, norm_layer=norm_layer)
        elif backbone == 'resnet152':
            self.pretrained = resnet.resnet152(pretrained=True, dilated=dilated, norm_layer=norm_layer)
        else:
            raise RuntimeError('unknown backbone: {}'.format(backbone))
        # bilinear upsample options
        self._up_kwargs = up_kwargs

    def base_forward(self, x):
        """Run the backbone stem and return the outputs of layer1..layer4."""
        x = self.pretrained.conv1(x)
        x = self.pretrained.bn1(x)
        x = self.pretrained.relu(x)
        x = self.pretrained.maxpool(x)
        c1 = self.pretrained.layer1(x)
        c2 = self.pretrained.layer2(c1)
        c3 = self.pretrained.layer3(c2)
        c4 = self.pretrained.layer4(c3)
        return c1, c2, c3, c4

    def evaluate(self, x, target=None):
        """Run ``forward`` and optionally score the prediction.

        When ``forward`` returns a tuple or list, only its first element is
        used. Without ``target`` the prediction is returned; with ``target``
        the pixel-accuracy and intersection/union statistics are returned as
        ``(correct, labeled, inter, union)``.
        """
        pred = self.forward(x)
        if isinstance(pred, (tuple, list)):
            pred = pred[0]
        if target is None:
            return pred
        correct, labeled = batch_pix_accuracy(pred.data, target.data)
        inter, union = batch_intersection_union(pred.data, target.data, self.nclass)
        return correct, labeled, inter, union
class EvalModule(nn.Module):
    """Segmentation evaluation wrapper.

    Calling the wrapper invokes the wrapped model's ``evaluate`` method with
    the same positional and keyword arguments.
    """
    def __init__(self, module):
        super(EvalModule, self).__init__()
        self.module = module

    def forward(self, *inputs, **kwargs):
        evaluate = self.module.evaluate
        return evaluate(*inputs, **kwargs)
class MultiEvalModule(DataParallel):
    """Multi-size Segmentation Evaluator

    Wraps a segmentation model and accumulates its class scores over several
    image scales (optionally including a horizontally flipped pass).

    Parameters
    ----------
    module : nn.Module
        Model exposing ``evaluate``, ``mean`` and ``std``.
    nclass : int
        Number of output classes.
    device_ids : list, optional
        Forwarded to ``DataParallel``.
    base_size : int
        Long-side length that each scale factor multiplies.
    crop_size : int
        Window size used for padding and sliding-window evaluation.
    flip : bool
        Whether to add the prediction of the horizontally flipped input.
    scales : sequence of float
        Scale factors applied to ``base_size``.
    """
    def __init__(self, module, nclass, device_ids=None,
                 base_size=520, crop_size=480, flip=True,
                 scales=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75]):
        super(MultiEvalModule, self).__init__(module, device_ids)
        self.nclass = nclass
        self.base_size = base_size
        self.crop_size = crop_size
        # Copy so instances never share the mutable default list.
        self.scales = list(scales)
        self.flip = flip

    def parallel_forward(self, inputs, **kwargs):
        """Multi-GPU Multi-size Evaluation

        Args:
            inputs: list of Tensors
        """
        inputs = [(input.unsqueeze(0).cuda(device),)
                  for input, device in zip(inputs, self.device_ids)]
        # One replica per input; the same device list is the scatter target.
        # (Previously ``target_gpus`` and ``dim`` were undefined names here.)
        target_gpus = self.device_ids[:len(inputs)]
        replicas = self.replicate(self, target_gpus)
        kwargs = scatter(kwargs, target_gpus, self.dim) if kwargs else []
        # Pad the shorter of inputs/kwargs so both line up per replica.
        if len(inputs) < len(kwargs):
            inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
        elif len(kwargs) < len(inputs):
            kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
        outputs = self.parallel_apply(replicas, inputs, kwargs)
        return outputs

    def forward(self, image):
        """Multi-size Evaluation

        Accepts a single image (batch size 1) and returns the sum over all
        scales of the per-class scores, resized back to the input size.
        """
        # only single image is supported for evaluation
        batch, _, h, w = image.size()
        assert(batch == 1)
        stride_rate = 2.0/3.0
        crop_size = self.crop_size
        stride = int(crop_size * stride_rate)
        with torch.cuda.device_of(image):
            scores = image.new().resize_(batch,self.nclass,h,w).zero_().cuda()

        for scale in self.scales:
            # Scale the long side to base_size * scale, keeping aspect ratio.
            long_size = int(math.ceil(self.base_size * scale))
            if h > w:
                height = long_size
                width = int(1.0 * w * long_size / h + 0.5)
                short_size = width
            else:
                width = long_size
                height = int(1.0 * h * long_size / w + 0.5)
                short_size = height
            # resize image to current size
            cur_img = resize_image(image, height, width)
            if scale <= 1.25 or long_size <= crop_size:# #
                # Whole-image inference, padded up to crop_size if smaller.
                pad_img = pad_image(cur_img, self.module.mean,
                                    self.module.std, crop_size)
                outputs = self.module_inference(pad_img)
                outputs = crop_image(outputs, 0, height, 0, width)
            else:
                if short_size < crop_size:
                    # pad if needed
                    pad_img = pad_image(cur_img, self.module.mean,
                                        self.module.std, crop_size)
                else:
                    pad_img = cur_img
                _,_,ph,pw = pad_img.size()
                assert(ph >= height and pw >= width)
                # grid forward and normalize
                h_grids = int(math.ceil(1.0*(ph-crop_size)/stride)) + 1
                w_grids = int(math.ceil(1.0*(pw-crop_size)/stride)) + 1
                with torch.cuda.device_of(image):
                    outputs = image.new().resize_(batch,self.nclass,ph,pw).zero_().cuda()
                    count_norm = image.new().resize_(batch,1,ph,pw).zero_().cuda()
                # grid evaluation
                for idh in range(h_grids):
                    for idw in range(w_grids):
                        h0 = idh * stride
                        w0 = idw * stride
                        h1 = min(h0 + crop_size, ph)
                        w1 = min(w0 + crop_size, pw)
                        crop_img = crop_image(pad_img, h0, h1, w0, w1)
                        # pad if needed
                        pad_crop_img = pad_image(crop_img, self.module.mean,
                                                 self.module.std, crop_size)
                        output = self.module_inference(pad_crop_img)
                        outputs[:,:,h0:h1,w0:w1] += crop_image(output,
                            0, h1-h0, 0, w1-w0)
                        count_norm[:,:,h0:h1,w0:w1] += 1
                # Every pixel must be covered by at least one window.
                assert((count_norm==0).sum()==0)
                outputs = outputs / count_norm
                outputs = outputs[:,:,:height,:width]

            score = resize_image(outputs, h, w)
            scores += score

        return scores

    def module_inference(self, image):
        """Evaluate ``image`` (plus its horizontal flip when ``self.flip``)
        and return the exponential of the summed output."""
        output = self.module.evaluate(image)
        if self.flip:
            fimg = flip_image(image)
            foutput = self.module.evaluate(fimg)
            output += flip_image(foutput)
        return output.exp()
def resize_image(img, h, w, mode='bilinear'):
    """Resize ``img`` to spatial size ``(h, w)``.

    The module-level ``up_kwargs`` supply the defaults; ``mode`` overrides
    the interpolation mode (it was previously accepted but ignored). With the
    default ``mode`` the call is identical to the original behaviour.
    """
    kwargs = dict(up_kwargs)
    kwargs['mode'] = mode
    if mode not in ('linear', 'bilinear', 'bicubic', 'trilinear'):
        # align_corners is only accepted for the interpolating modes.
        kwargs.pop('align_corners', None)
    return F.upsample(img, (h, w), **kwargs)
def pad_image(img, mean, std, crop_size):
    """Pad a 3-channel image batch on the bottom/right up to ``crop_size``.

    Each channel is filled with ``-mean[c] / std[c]``. Dimensions that are
    already at least ``crop_size`` are left unchanged. Returns a new tensor.
    """
    batch, channels, height, width = img.size()
    assert(channels==3)
    extra_h = max(crop_size - height, 0)
    extra_w = max(crop_size - width, 0)
    fill_values = -np.array(mean) / np.array(std)
    padded = img.new().resize_(batch, channels, height + extra_h, width + extra_w)
    for ch in range(channels):
        # note that pytorch pad params is in reversed orders
        channel_plane = img[:, ch]
        padded[:, ch] = F.pad(channel_plane, (0, extra_w, 0, extra_h),
                              value=fill_values[ch])
    assert(padded.size(2)>=crop_size and padded.size(3)>=crop_size)
    return padded
def crop_image(img, h0, h1, w0, w1):
    """Slice rows ``h0:h1`` and columns ``w0:w1`` from the last two axes of a
    4-D ``img``, keeping the first two axes whole."""
    rows = slice(h0, h1)
    cols = slice(w0, w1)
    return img[:, :, rows, cols]
def flip_image(img):
    """Return ``img`` (a 4-D tensor) mirrored along its last axis.

    The reversed column index is created directly as an int64 tensor on the
    image's device. Routing it through the image dtype, as before, corrupts
    the indices for dtypes that cannot represent every column number
    (e.g. uint8 or half with wide images).
    """
    assert(img.dim()==4)
    reversed_cols = torch.arange(img.size(3) - 1, -1, -1,
                                 dtype=torch.long, device=img.device)
    return img.index_select(3, reversed_cols)
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import torch
from torch.autograd import Variable
import torch.nn as nn
from torch.nn.functional import upsample
import encoding
from .base import BaseNet
from .fcn import FCNHead
__all__ = ['EncNet', 'EncModule', 'get_encnet', 'get_encnet_resnet50_pcontext']
class EncNet(BaseNet):
    """Segmentation network combining the ``BaseNet`` backbone with an
    ``EncHead`` on the last backbone stage and, when ``aux`` is set, an
    auxiliary ``FCNHead`` on the third stage.

    ``forward`` returns a tuple: the head outputs (first one upsampled to the
    input size) followed by the upsampled auxiliary output when enabled.
    """
    def __init__(self, nclass, backbone, aux=True, se_loss=True,
                 norm_layer=nn.BatchNorm2d, **kwargs):
        super(EncNet, self).__init__(nclass, backbone, aux, se_loss, norm_layer=norm_layer)
        self.head = EncHead(self.nclass, in_channels=2048, se_loss=se_loss,
                            norm_layer=norm_layer, up_kwargs=self._up_kwargs)
        if aux:
            self.auxlayer = FCNHead(1024, nclass, norm_layer=norm_layer)

    def forward(self, x):
        target_size = x.size()[2:]
        _, _, stage3, stage4 = self.base_forward(x)

        results = list(self.head(stage4))
        # Only the dense prediction is brought back to input resolution.
        results[0] = upsample(results[0], target_size, **self._up_kwargs)
        if self.aux:
            aux_pred = upsample(self.auxlayer(stage3), target_size, **self._up_kwargs)
            results.append(aux_pred)
        return tuple(results)
class EncModule(nn.Module):
    """Encoding block: encodes the feature map, uses the encoding to
    re-weight the input channels, and optionally predicts a per-class vector
    from the same encoding.

    Parameters
    ----------
    in_channels : int
        Channels of the input feature map.
    nclass : int
        Size of the ``se_loss`` prediction.
    ncodes : int
        Number of codewords of the encoding layer.
    se_loss : bool
        Whether to add the linear layer producing the second output.
    norm_layer : class, optional
        2-D normalization layer class used by the surrounding network; it
        selects which 1-D normalization is used after the encoding layer.
    """
    def __init__(self, in_channels, nclass, ncodes=32, se_loss=True, norm_layer=None):
        super(EncModule, self).__init__()
        # ``norm_layer`` is a layer class, not an instance, so it has to be
        # compared with issubclass(); the former isinstance() test was always
        # False and the synchronized BatchNorm1d was never picked.
        wants_sync_bn = isinstance(norm_layer, type) and \
            issubclass(norm_layer, encoding.nn.BatchNorm2d)
        if wants_sync_bn:
            norm_layer = encoding.nn.BatchNorm1d
        else:
            norm_layer = nn.BatchNorm1d
        self.se_loss = se_loss
        self.encoding = nn.Sequential(
            encoding.nn.Encoding(D=in_channels, K=ncodes),
            norm_layer(ncodes),
            nn.ReLU(inplace=True),
            encoding.nn.Sum(dim=1))
        self.fc = nn.Sequential(
            nn.Linear(in_channels, in_channels),
            nn.Sigmoid())
        if self.se_loss:
            self.selayer = nn.Linear(in_channels, nclass)

    def forward(self, x):
        """Return ``(x + x * gate,)`` plus the ``selayer`` prediction when
        ``se_loss`` is enabled; ``gate`` is the sigmoid output of ``fc``
        reshaped to ``(b, c, 1, 1)``."""
        en = self.encoding(x)
        b, c, _, _ = x.size()
        gamma = self.fc(en)
        y = gamma.view(b, c, 1, 1)
        # residual ?
        outputs = [x + x * y]
        if self.se_loss:
            outputs.append(self.selayer(en))
        return tuple(outputs)
class EncHead(nn.Module):
    """Prediction head: a 3x3 conv block reducing to 512 channels, an
    ``EncModule``, then dropout and a 1x1 classifier applied to the module's
    first output. ``forward`` returns the ``EncModule`` outputs as a tuple
    with the first entry replaced by the classifier result.

    ``up_kwargs`` is accepted but not used by this class.
    """
    def __init__(self, out_channels, in_channels, se_loss=True,
                 norm_layer=None, up_kwargs=None):
        super(EncHead, self).__init__()
        reduce_layers = [
            nn.Conv2d(in_channels, 512, 3, padding=1, bias=False),
            norm_layer(512),
            nn.ReLU(True),
        ]
        self.conv5 = nn.Sequential(*reduce_layers)
        self.encmodule = EncModule(512, out_channels, ncodes=32,
                                   se_loss=se_loss, norm_layer=norm_layer)
        self.dropout = nn.Dropout2d(0.1, False)
        self.conv6 = nn.Conv2d(512, out_channels, 1)
        self.se_loss = se_loss

    def forward(self, x):
        reduced = self.conv5(x)
        results = list(self.encmodule(reduced))
        results[0] = self.conv6(self.dropout(results[0]))
        return tuple(results)
def get_encnet(dataset='pascal_voc', backbone='resnet50', pretrained=False,
               root='~/.encoding/models', **kwargs):
    r"""EncNet model from the paper `"Context Encoding for Semantic Segmentation"
    <https://arxiv.org/pdf/1803.08904.pdf>`_

    Parameters
    ----------
    dataset : str, default pascal_voc
        The dataset that model pretrained on. (pascal_voc, ade20k, pcontext).
        Matched case-insensitively.
    backbone : str, default resnet50
        The backbone network. (resnet50, 101, 152)
    pretrained : bool, default False
        Whether to load the pretrained weights for model.
    root : str, default '~/.encoding/models'
        Location for keeping the model parameters.
    **kwargs :
        Forwarded to the ``EncNet`` constructor.

    Examples
    --------
    >>> model = get_encnet(dataset='pascal_voc', backbone='resnet50', pretrained=False)
    >>> print(model)
    """
    acronyms = {
        'pascal_voc': 'voc',
        'ade20k': 'ade',
        'pcontext': 'pcontext',
    }
    # Normalize once so the class-count lookup and the checkpoint-name lookup
    # agree; previously only the former was case-insensitive.
    dataset = dataset.lower()
    # infer number of classes
    from ..datasets import datasets
    model = EncNet(datasets[dataset].NUM_CLASS, backbone=backbone, **kwargs)
    if pretrained:
        from .model_store import get_model_file
        model.load_state_dict(torch.load(
            get_model_file('encnet_%s_%s'%(backbone, acronyms[dataset]), root=root)))
    return model
def get_encnet_resnet50_pcontext(pretrained=False, root='~/.encoding/models', **kwargs):
    r"""EncNet model (ResNet-50 backbone, 'pcontext' dataset) from the paper
    `"Context Encoding for Semantic Segmentation"
    <https://arxiv.org/pdf/1803.08904.pdf>`_

    Parameters
    ----------
    pretrained : bool, default False
        Whether to load the pretrained weights for model.
    root : str, default '~/.encoding/models'
        Location for keeping the model parameters.
    **kwargs :
        Forwarded to :func:`get_encnet`.

    Examples
    --------
    >>> model = get_encnet_resnet50_pcontext(pretrained=True)
    >>> print(model)
    """
    # Forward root and kwargs as well; they used to be silently dropped.
    return get_encnet('pcontext', 'resnet50', pretrained, root=root, **kwargs)
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
from __future__ import division
import os
import numpy as np
import torch
import torch.nn as nn
from torch.nn.functional import upsample
from .base import BaseNet
__all__ = ['FCN', 'get_fcn', 'get_fcn_resnet50_pcontext', 'get_fcn_resnet50_ade']
class FCN(BaseNet):
    r"""Fully Convolutional Networks for Semantic Segmentation

    Parameters
    ----------
    nclass : int
        Number of categories for the training dataset.
    backbone : string
        Pre-trained dilated backbone network type ('resnet50', 'resnet101'
        or 'resnet152').
    aux : bool
        Whether to add an auxiliary ``FCNHead`` on the third backbone stage.
    norm_layer : object
        Normalization layer used in the backbone and heads
        (default: :class:`torch.nn.BatchNorm2d`).

    Reference:
        Long, Jonathan, Evan Shelhamer, and Trevor Darrell. "Fully convolutional networks
        for semantic segmentation." *CVPR*, 2015

    Examples
    --------
    >>> model = FCN(nclass=21, backbone='resnet50')
    >>> print(model)
    """
    def __init__(self, nclass, backbone, aux=True, se_loss=False, norm_layer=nn.BatchNorm2d, **kwargs):
        super(FCN, self).__init__(nclass, backbone, aux, se_loss, norm_layer=norm_layer)
        self.head = FCNHead(2048, nclass, norm_layer)
        if aux:
            self.auxlayer = FCNHead(1024, nclass, norm_layer)

    def forward(self, x):
        """Return a tuple with the upsampled main prediction, followed by the
        upsampled auxiliary prediction when ``aux`` is enabled."""
        target_size = x.size()[2:]
        _, _, stage3, stage4 = self.base_forward(x)

        main_pred = upsample(self.head(stage4), target_size, **self._up_kwargs)
        if not self.aux:
            return (main_pred,)
        aux_pred = upsample(self.auxlayer(stage3), target_size, **self._up_kwargs)
        return (main_pred, aux_pred)
class FCNHead(nn.Module):
    """FCN prediction head: 3x3 conv to ``in_channels // 4`` channels,
    normalization, ReLU, 2-D dropout (p=0.1) and a 1x1 conv to
    ``out_channels``, all held in ``self.conv5``."""
    def __init__(self, in_channels, out_channels, norm_layer):
        super(FCNHead, self).__init__()
        inter_channels = in_channels // 4
        layers = [
            nn.Conv2d(in_channels, inter_channels, 3, padding=1),
            norm_layer(inter_channels),
            nn.ReLU(),
            nn.Dropout2d(0.1, False),
            nn.Conv2d(inter_channels, out_channels, 1),
        ]
        self.conv5 = nn.Sequential(*layers)

    def forward(self, x):
        prediction = self.conv5(x)
        return prediction
def get_fcn(dataset='pascal_voc', backbone='resnet50', pretrained=False,
            root='~/.encoding/models', **kwargs):
    r"""FCN model from the paper `"Fully Convolutional Network for semantic segmentation"
    <https://people.eecs.berkeley.edu/~jonlong/long_shelhamer_fcn.pdf>`_

    Parameters
    ----------
    dataset : str, default pascal_voc
        The dataset that model pretrained on. (pascal_voc, pascal_aug,
        pcontext, ade20k). Matched case-insensitively.
    backbone : str, default resnet50
        The backbone network passed to ``FCN``.
    pretrained : bool, default False
        Whether to load the pretrained weights for model.
    root : str, default '~/.encoding/models'
        Location for keeping the model parameters.
    **kwargs :
        Forwarded to the ``FCN`` constructor.

    Examples
    --------
    >>> model = get_fcn(dataset='pascal_voc', backbone='resnet50', pretrained=False)
    >>> print(model)
    """
    acronyms = {
        'pascal_voc': 'voc',
        'pascal_aug': 'voc',
        'pcontext': 'pcontext',
        'ade20k': 'ade',
    }
    # Normalize once so the class-count lookup and the checkpoint-name lookup
    # agree; previously only the former was case-insensitive.
    dataset = dataset.lower()
    # infer number of classes
    from ..datasets import datasets
    model = FCN(datasets[dataset].NUM_CLASS, backbone=backbone, **kwargs)
    if pretrained:
        from .model_store import get_model_file
        model.load_state_dict(torch.load(
            get_model_file('fcn_%s_%s'%(backbone, acronyms[dataset]), root=root)),
            strict=False)
    return model
def get_fcn_resnet50_pcontext(pretrained=False, root='~/.encoding/models', **kwargs):
    r"""FCN model with a ResNet-50 backbone for the 'pcontext' dataset.

    Parameters
    ----------
    pretrained : bool, default False
        Whether to load the pretrained weights for model.
    root : str, default '~/.encoding/models'
        Location for keeping the model parameters.
    **kwargs :
        Forwarded to :func:`get_fcn`.

    Examples
    --------
    >>> model = get_fcn_resnet50_pcontext(pretrained=True)
    >>> print(model)
    """
    # Forward root and kwargs as well; they used to be silently dropped.
    return get_fcn('pcontext', 'resnet50', pretrained, root=root, **kwargs)
def get_fcn_resnet50_ade(pretrained=False, root='~/.encoding/models', **kwargs):
    r"""FCN model with a ResNet-50 backbone for the 'ade20k' dataset.

    Parameters
    ----------
    pretrained : bool, default False
        Whether to load the pretrained weights for model.
    root : str, default '~/.encoding/models'
        Location for keeping the model parameters.
    **kwargs :
        Forwarded to :func:`get_fcn`.

    Examples
    --------
    >>> model = get_fcn_resnet50_ade(pretrained=True)
    >>> print(model)
    """
    # Forward root and kwargs as well; they used to be silently dropped.
    return get_fcn('ade20k', 'resnet50', pretrained, root=root, **kwargs)
"""Model store which provides pretrained models."""
from __future__ import print_function
__all__ = ['get_model_file', 'purge']
import os
import zipfile
from ..utils import download, check_sha1
_model_sha1 = {name: checksum for checksum, name in [
('eeed8e582f0fdccdba8579e7490570adc6d85c7c', 'fcn_resnet50_pcontext'),
('969062a5aad2d1d983bae2f9e412578b62610114', 'encnet_resnet50_pcontext'),
('fc8c0b795abf0133700c2d4265d2f9edab7eb6cc', 'fcn_resnet50_ade'),
]}
encoding_repo_url = 'https://hangzh.s3.amazonaws.com/'
_url_format = '{repo_url}encoding/models/{file_name}.zip'
def short_hash(name):
    """Return the first 8 characters of the registered SHA-1 checksum of
    model ``name``; raise ``ValueError`` when the model is not registered."""
    checksum = _model_sha1.get(name)
    if checksum is None:
        raise ValueError('Pretrained model for {name} is not available.'.format(name=name))
    return checksum[:8]
def get_model_file(name, root=os.path.join('~', '.encoding', 'models')):
    r"""Return the local path of a pretrained model file, downloading it first
    when it is missing or its checksum does not match.

    The root directory is created if it doesn't exist. The model is fetched
    as a zip archive (base URL taken from the ``ENCODING_REPO`` environment
    variable, falling back to ``encoding_repo_url``), extracted into ``root``
    and the archive is removed afterwards.

    Parameters
    ----------
    name : str
        Name of the model.
    root : str, default '~/.encoding/models'
        Location for keeping the model parameters.

    Returns
    -------
    file_path
        Path to the requested pretrained model file.

    Raises
    ------
    ValueError
        If the model name is unknown or the downloaded file fails the
        checksum.
    """
    file_name = '{name}-{short_hash}'.format(name=name, short_hash=short_hash(name))
    root = os.path.expanduser(root)
    file_path = os.path.join(root, file_name+'.pth')
    sha1_hash = _model_sha1[name]

    if not os.path.exists(file_path):
        print('Model file is not found. Downloading.')
    elif check_sha1(file_path, sha1_hash):
        # A valid cached copy exists; nothing to download.
        return file_path
    else:
        print('Mismatch in the content of model file detected. Downloading again.')

    if not os.path.exists(root):
        os.makedirs(root)

    archive_path = os.path.join(root, file_name+'.zip')
    base_url = os.environ.get('ENCODING_REPO', encoding_repo_url)
    if base_url[-1] != '/':
        base_url += '/'
    archive_url = _url_format.format(repo_url=base_url, file_name=file_name)
    download(archive_url, path=archive_path, overwrite=True)
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(root)
    os.remove(archive_path)

    if not check_sha1(file_path, sha1_hash):
        raise ValueError('Downloaded file has different hash. Please try again.')
    return file_path
def purge(root=os.path.join('~', '.encoding', 'models')):
    r"""Purge all pretrained model files in local file store.

    Only files ending in ``.pth`` directly inside ``root`` are removed. A
    missing ``root`` directory is treated as already empty instead of raising.

    Parameters
    ----------
    root : str, default '~/.encoding/models'
        Location for keeping the model parameters.
    """
    root = os.path.expanduser(root)
    if not os.path.isdir(root):
        # Nothing has been downloaded yet, so there is nothing to purge.
        return
    for entry in os.listdir(root):
        if entry.endswith(".pth"):
            os.remove(os.path.join(root, entry))
def pretrained_model_list():
    """Return the names of all models that have a registered checksum."""
    return [model_name for model_name in _model_sha1]
# pylint: disable=wildcard-import, unused-wildcard-import
from .fcn import *
from .encnet import *
__all__ = ['get_model']
def get_model(name, **kwargs):
    """Returns a pre-defined model by name

    Parameters
    ----------
    name : str
        Name of the model (matched case-insensitively).
    pretrained : bool
        Whether to load the pretrained weights for model.
    root : str, default '~/.encoding/models'
        Location for keeping the model parameters.

    Returns
    -------
    Module:
        The model.

    Raises
    ------
    ValueError
        If ``name`` is not a known model; the message lists the valid names.
    """
    models = {
        'fcn_resnet50_pcontext': get_fcn_resnet50_pcontext,
        'encnet_resnet50_pcontext': get_encnet_resnet50_pcontext,
        'fcn_resnet50_ade': get_fcn_resnet50_ade,
    }
    name = name.lower()
    if name not in models:
        # The previous message referenced an undefined variable ``e`` and so
        # raised NameError instead of this ValueError.
        raise ValueError('"%s" is not among the following model list:\n\t%s'
                         % (name, '\n\t'.join(sorted(models.keys()))))
    net = models[name](**kwargs)
    return net
......@@ -11,13 +11,15 @@
"""Encoding Custermized NN Module"""
import torch
from torch.nn import Module, Sequential, Conv2d, ReLU, AdaptiveAvgPool2d, \
NLLLoss, BCELoss, CrossEntropyLoss
NLLLoss, BCELoss, CrossEntropyLoss, AvgPool2d, MaxPool2d, Parameter
from torch.nn import functional as F
from torch.autograd import Variable
from .syncbn import BatchNorm2d
torch_ver = torch.__version__[:3]
__all__ = ['GramMatrix', 'SegmentationLosses', 'View', 'Sum', 'Mean',
'Normalize', 'PyramidPooling']
'Normalize']
class GramMatrix(Module):
......@@ -39,39 +41,51 @@ def softmax_crossentropy(input, target, weight, size_average, ignore_index, redu
class SegmentationLosses(CrossEntropyLoss):
"""2D Cross Entropy Loss with Auxilary Loss"""
def __init__(self, aux, aux_weight=0.2, weight=None, size_average=True, ignore_index=-1):
def __init__(self, se_loss=False, se_weight=0.1, nclass=-1,
aux=False, aux_weight=0.2, weight=None,
size_average=True, ignore_index=-1):
super(SegmentationLosses, self).__init__(weight, size_average, ignore_index)
self.se_loss = se_loss
self.aux = aux
self.nclass = nclass
self.se_weight = se_weight
self.aux_weight = aux_weight
self.bceloss = BCELoss(weight, size_average)
def forward(self, *inputs):
if not self.aux:
if not self.se_loss and not self.aux:
return super(SegmentationLosses, self).forward(*inputs)
pred1, pred2, target = tuple(inputs)
loss1 = super(SegmentationLosses, self).forward(pred1, target)
loss2 = super(SegmentationLosses, self).forward(pred2, target)
return loss1 + self.aux_weight * loss2
"""
class SegmentationLosses(Module):
def __init__(self, aux, aux_weight=0.2, weight=None, size_average=True, ignore_index=-1):
super(SegmentationLosses, self).__init__()
self.aux = aux
self.aux_weight = aux_weight
# Somehow the size averge is not handled correctly on multi-gpu, so we average by ourself.
self.nll_loss = NLLLoss(weight, ignore_index=ignore_index, reduce=True)
def _forward_each(self, inputs, targets):
return self.nll_loss(F.log_softmax(inputs, dim=1), targets)
def forward(self, *inputs):
if not self.aux:
return self._forward_each(*inputs)
pred1, pred2, target = tuple(inputs)
loss1 = self._forward_each(pred1, target)
loss2 = self._forward_each(pred2, target)
return loss1 + self.aux_weight * loss2
"""
elif not self.se_loss:
pred1, pred2, target = tuple(inputs)
loss1 = super(SegmentationLosses, self).forward(pred1, target)
loss2 = super(SegmentationLosses, self).forward(pred2, target)
return loss1 + self.aux_weight * loss2
elif not self.aux:
pred, se_pred, target = tuple(inputs)
se_target = self._get_batch_label_vector(target, nclass=self.nclass).type_as(pred)
loss1 = super(SegmentationLosses, self).forward(pred, target)
loss2 = self.bceloss(F.sigmoid(se_pred), se_target)
return loss1 + self.se_weight * loss2
else:
pred1, se_pred, pred2, target = tuple(inputs)
se_target = self._get_batch_label_vector(target, nclass=self.nclass).type_as(pred1)
loss1 = super(SegmentationLosses, self).forward(pred1, target)
loss2 = super(SegmentationLosses, self).forward(pred2, target)
loss3 = self.bceloss(F.sigmoid(se_pred), se_target)
return loss1 + self.aux_weight * loss2 + self.se_weight * loss3
@staticmethod
def _get_batch_label_vector(target, nclass):
# target is a 3D Variable BxHxW, output is 2D BxnClass
batch = target.size(0)
tvect = Variable(torch.zeros(batch, nclass))
for i in range(batch):
hist = torch.histc(target[i].cpu().data.float(),
bins=nclass, min=0,
max=nclass-1)
vect = hist>0
tvect[i] = vect
return tvect
class View(Module):
......@@ -135,45 +149,3 @@ class Normalize(Module):
def forward(self, x):
return F.normalize(x, self.p, self.dim, eps=1e-10)
class PyramidPooling(Module):
    """
    Pyramid pooling block: the input is adaptively average-pooled to 1, 2, 3
    and 6 bins, each pooled map goes through a 1x1 conv / BatchNorm / ReLU
    branch with ``in_channels / 4`` output channels, is upsampled back to the
    input size, and all four are concatenated with the input along dim 1.

    Reference:
        Zhao, Hengshuang, et al. *"Pyramid scene parsing network."*
    """
    def __init__(self, in_channels):
        super(PyramidPooling, self).__init__()
        # Registered as pool1..pool4, in this order.
        for idx, bins in enumerate((1, 2, 3, 6), 1):
            setattr(self, 'pool%d' % idx, AdaptiveAvgPool2d(bins))

        out_channels = int(in_channels/4)
        # Registered as conv1..conv4, in this order.
        for idx in range(1, 5):
            branch = Sequential(Conv2d(in_channels, out_channels, 1),
                                BatchNorm2d(out_channels),
                                ReLU(True))
            setattr(self, 'conv%d' % idx, branch)

    def _cat_each(self, x, feat1, feat2, feat3, feat4):
        """Concatenate, element by element, the entries of five equally
        indexed sequences along dim 1 and return the list of results."""
        assert(len(x) == len(feat1))
        return [torch.cat((x[i], feat1[i], feat2[i], feat3[i], feat4[i]), 1)
                for i in range(len(x))]

    def forward(self, x):
        _, _, h, w = x.size()
        pieces = [x]
        for idx in range(1, 5):
            pool = getattr(self, 'pool%d' % idx)
            conv = getattr(self, 'conv%d' % idx)
            pieces.append(F.upsample(conv(pool(x)), (h, w), mode='bilinear'))
        return torch.cat(tuple(pieces), 1)
......@@ -15,9 +15,9 @@ import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.modules.utils import _pair
from ..functions import scaledL2, aggregate, dilatedavgpool2d
from ..functions import scaledL2, aggregate
__all__ = ['Encoding', 'EncodingDrop', 'Inspiration', 'DilatedAvgPool2d', 'UpsampleConv2d']
__all__ = ['Encoding', 'EncodingDrop', 'Inspiration', 'UpsampleConv2d']
class Encoding(Module):
r"""
......@@ -203,82 +203,6 @@ class Inspiration(Module):
+ 'N x ' + str(self.C) + ')'
class DilatedAvgPool2d(Module):
    r"""We provide Dilated Average Pooling for the dilation of Densenet as
    in :class:`encoding.dilated.DenseNet`.

    Reference:
        Hang Zhang, Kristin Dana, Jianping Shi, Zhongyue Zhang, Xiaogang Wang, Ambrish Tyagi,
        Amit Agrawal. "Context Encoding for Semantic Segmentation."
        *The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2018*

    Applies a 2D average pooling over an input signal composed of several input planes.

    In the simplest case, the output value of the layer with input size :math:`(N, C, H, W)`,
    output :math:`(B, C, H_{out}, W_{out})`, :attr:`kernel_size` :math:`(k_H,k_W)`,
    :attr:`stride` :math:`(s_H,s_W)` :attr:`dilation` :math:`(d_H,d_W)`
    can be precisely described as:

    .. math::

        \begin{array}{ll}
        out(b, c, h, w) = 1 / (k_H \cdot k_W) \cdot
        \sum_{{m}=0}^{k_H-1} \sum_{{n}=0}^{k_W-1}
        input(b, c, s_H \cdot h + d_H \cdot m, s_W \cdot w + d_W \cdot n)
        \end{array}

    | If :attr:`padding` is non-zero, then the input is implicitly zero-padded on both sides
      for :attr:`padding` number of points

    | The parameters :attr:`kernel_size`, :attr:`stride`, :attr:`padding`,
      :attr:`dilation` can either be:

        - a single ``int`` -- in which case the same value is used for the height
          and width dimension
        - a ``tuple`` of two ints -- in which case, the first `int` is used for
          the height dimension, and the second `int` for the width dimension

    Args:
        kernel_size: the size of the window
        stride: the stride of the window. Default value is :attr:`kernel_size`
        padding: implicit zero padding to be added on both sides
        dilation: the dilation parameter similar to Conv2d

    Shape:
        - Input: :math:`(B, C, H_{in}, W_{in})`
        - Output: :math:`(B, C, H_{out}, W_{out})` where
          :math:`H_{out} = floor((H_{in} + 2 * padding[0] - kernel\_size[0]) / stride[0] + 1)`
          :math:`W_{out} = floor((W_{in} + 2 * padding[1] - kernel\_size[1]) / stride[1] + 1)`
          For :attr:`stride=1`, the output featuremap preserves the same size as input.

    Examples::

        >>> # pool of square window of size=3, stride=2, dilation=2
        >>> m = nn.DilatedAvgPool2d(3, stride=2, dilation=2)
        >>> input = autograd.Variable(torch.randn(20, 16, 50, 32))
        >>> output = m(input)
    """
    def __init__(self, kernel_size, stride=None, padding=0, dilation=1):
        super(DilatedAvgPool2d, self).__init__()
        self.kernel_size = kernel_size
        # A falsy stride (None or 0) falls back to the kernel size.
        self.stride = stride if stride else kernel_size
        self.padding = padding
        self.dilation = dilation

    def forward(self, input):
        pooled = dilatedavgpool2d(input, self.kernel_size, self.stride,
                                  self.padding, self.dilation)
        return pooled

    def __repr__(self):
        settings = 'size={}, stride={}, padding={}, dilation={}'.format(
            self.kernel_size, self.stride, self.padding, self.dilation)
        return self.__class__.__name__ + ' (' + settings + ')'
class UpsampleConv2d(Module):
r"""
To avoid the checkerboard artifacts of standard Fractionally-strided Convolution,
......
......@@ -23,34 +23,28 @@ from ..functions import *
from ..parallel import allreduce
from .comm import SyncMaster
__all__ = ['BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'Module', 'Sequential', 'Conv1d',
'Conv2d', 'ConvTranspose2d', 'ReLU', 'Sigmoid', 'MaxPool2d', 'AvgPool2d',
'AdaptiveAvgPool2d', 'Dropout2d', 'Linear']
# Adapt from https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
_ChildMessage = collections.namedtuple('Message', ['sum', 'ssum', 'sum_size'])
_MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std'])
class _SyncBatchNorm(_BatchNorm):
def __init__(self, num_features, eps=1e-5, momentum=0.001, affine=True):
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True):
super(_SyncBatchNorm, self).__init__(num_features, eps=eps, momentum=momentum, affine=affine)
self._sync_master = SyncMaster(self._data_parallel_master)
self._is_parallel = False
self._parallel_id = None
self._slave_pipe = None
def forward(self, input):
# If it is not parallel computation or is in evaluation mode, use PyTorch's implementation.
if not (self._is_parallel and self.training):
if not self.training:
return batch_norm(
input, self.running_mean, self.running_var, self.weight, self.bias,
self.training, self.momentum, self.eps)
# Resize the input to (B, C, -1).
input_shape = input.size()
input = input.view(input.size(0), self.num_features, -1)
input = input.view(input_shape[0], self.num_features, -1)
# sum(x) and sum(x^2)
N = input.size(0) * input.size(2)
......@@ -62,11 +56,9 @@ class _SyncBatchNorm(_BatchNorm):
else:
mean, inv_std = self._slave_pipe.run_slave(_ChildMessage(xsum, xsqsum, N))
# forward
return batchnormtrain(input, self.weight, self.bias, mean, 1.0/inv_std).view(input_shape)
return batchnormtrain(input, mean, 1.0/inv_std, self.weight, self.bias).view(input_shape)
def __data_parallel_replicate__(self, ctx, copy_id):
self._is_parallel = True
self._parallel_id = copy_id
# parallel_id == 0 means master device.
......@@ -110,7 +102,12 @@ class _SyncBatchNorm(_BatchNorm):
self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * mean.data
self.running_var = (1 - self.momentum) * self.running_var + self.momentum * unbias_var.data
return mean, bias_var.clamp(self.eps) ** -0.5
return mean, (bias_var + self.eps) ** -0.5
# API adapted from https://github.com/vacancy/Synchronized-BatchNorm-PyTorch
_ChildMessage = collections.namedtuple('Message', ['sum', 'ssum', 'sum_size'])
_MasterMessage = collections.namedtuple('_MasterMessage', ['sum', 'inv_std'])
class BatchNorm1d(_SyncBatchNorm):
......@@ -193,12 +190,11 @@ class BatchNorm3d(_SyncBatchNorm):
class SharedTensor(object):
"""Shared Tensor for cross GPU all reduce operation"""
def __init__(self, nGPUs, op):
def __init__(self, nGPUs):
self.mutex = threading.Lock()
self.all_tasks_done = threading.Condition(self.mutex)
self.nGPUs = nGPUs
self._clear()
self.op = op
def _clear(self):
self.N = 0
......@@ -206,9 +202,7 @@ class SharedTensor(object):
self.push_tasks = self.nGPUs
self.reduce_tasks = self.nGPUs
def __call__(self, *inputs):
if self.nGPUs <= 1:
return tuple(inputs)
def push(self, *inputs):
# push from device
with self.mutex:
if self.push_tasks == 0:
......@@ -223,13 +217,15 @@ class SharedTensor(object):
self.all_tasks_done.notify_all()
while self.push_tasks:
self.all_tasks_done.wait()
def pull(self, igpu):
# pull from device
with self.mutex:
if igpu == 0:
assert(len(self.dict) == self.nGPUs)
# flatten the tensors
self.list = [t for i in range(len(self.dict)) for t in self.dict[i]]
self.outlist = self.op(2, *self.list)
self.outlist = allreduce(2, *self.list)
self.reduce_tasks -= 1
else:
self.reduce_tasks -= 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment