Unverified commit ed5456d3 authored by Hang Zhang, committed by GitHub

add lib (#61)

parent d8abf505
@@ -6,7 +6,5 @@ build/
data/
docs/src/
docs/html/
encoding/lib/
encoding/_ext/
encoding.egg-info/
experiments/segmentation/
import os
import torch
from torch.utils.cpp_extension import load
cwd = os.path.dirname(os.path.realpath(__file__))
cpu_path = os.path.join(cwd, 'cpu')
gpu_path = os.path.join(cwd, 'gpu')
cpu = load('enclib_cpu', [
os.path.join(cpu_path, 'roi_align.cpp'),
os.path.join(cpu_path, 'roi_align_cpu.cpp'),
], build_directory=cpu_path, verbose=False)
if torch.cuda.is_available():
gpu = load('enclib_gpu', [
os.path.join(gpu_path, 'operator.cpp'),
os.path.join(gpu_path, 'encoding_kernel.cu'),
os.path.join(gpu_path, 'syncbn_kernel.cu'),
os.path.join(gpu_path, 'roi_align_kernel.cu'),
], build_directory=gpu_path, verbose=False)
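For reference, a minimal sketch of calling the JIT-built CPU module returned by load() above; the feature-map shape, ROI values, and spatial_scale are illustrative assumptions, not part of this commit.
import torch
feat = torch.randn(1, 256, 32, 32)                 # N x C x H x W feature map
rois = torch.tensor([[0., 8., 8., 120., 120.]])    # [batch_index, x1, y1, x2, y2]
out = cpu.roi_align_forward(feat, rois, 7, 7, 1.0 / 16, 2)
# out has shape (num_rois, C, pooled_h, pooled_w) == (1, 256, 7, 7)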
#include <torch/torch.h>
// CPU declarations
at::Tensor ROIAlignForwardCPU(
const at::Tensor& input,
const at::Tensor& bottom_rois,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio);
at::Tensor ROIAlignBackwardCPU(
const at::Tensor& bottom_rois,
const at::Tensor& grad_output, // gradient of the output of the layer
int64_t b_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("roi_align_forward", &ROIAlignForwardCPU, "ROI Align forward (CPU)");
m.def("roi_align_backward", &ROIAlignBackwardCPU, "ROI Align backward (CPU)");
}
#include <ATen/ATen.h>
//#include <omp.h>
template <typename T>
struct PreCalc {
int pos1;
int pos2;
int pos3;
int pos4;
T w1;
T w2;
T w3;
T w4;
};
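// Precompute, for every sampling point of every output bin of one ROI, the four
// neighbouring feature-map offsets (pos1..pos4) and bilinear weights (w1..w4);
// the per-channel loop in ROIAlignForwardCompute below reuses them unchanged.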
template <typename T>
void pre_calc_for_bilinear_interpolate(
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int iy_upper,
const int ix_upper,
T roi_start_h,
T roi_start_w,
T bin_size_h,
T bin_size_w,
int roi_bin_grid_h,
int roi_bin_grid_w,
std::vector<PreCalc<T>>* pre_calc) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
for (int iy = 0; iy < iy_upper; iy++) {
const T yy = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < ix_upper; ix++) {
const T xx = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T x = xx;
T y = yy;
// deal with cases where the sampling point falls outside the feature map
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
PreCalc<T> pc;
pc.pos1 = 0;
pc.pos2 = 0;
pc.pos3 = 0;
pc.pos4 = 0;
pc.w1 = 0;
pc.w2 = 0;
pc.w3 = 0;
pc.w4 = 0;
pre_calc->at(pre_calc_index) = pc;
pre_calc_index += 1;
continue;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = static_cast<int>(y);
int x_low = static_cast<int>(x);
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
// save weights and indices
PreCalc<T> pc;
pc.pos1 = y_low * width + x_low;
pc.pos2 = y_low * width + x_high;
pc.pos3 = y_high * width + x_low;
pc.pos4 = y_high * width + x_high;
pc.w1 = w1;
pc.w2 = w2;
pc.w3 = w3;
pc.w4 = w4;
pre_calc->at(pre_calc_index) = pc;
pre_calc_index += 1;
}
}
}
}
}
template <typename T>
void ROIAlignForwardCompute(
const int nthreads,
const T* bottom_data,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* bottom_rois,
int roi_cols,
T* top_data) {
int n_rois = nthreads / channels / pooled_width / pooled_height;
// (n, c, ph, pw) is an element in the pooled output
// can be parallelized using omp
for (int n = 0; n < n_rois; n++) {
int index_n = n * channels * pooled_width * pooled_height;
// roi could have 4 or 5 columns
const T* offset_bottom_rois = bottom_rois + n * roi_cols;
int roi_batch_ind = 0;
if (roi_cols == 5) {
roi_batch_ind = offset_bottom_rois[0];
offset_bottom_rois++;
}
// Do not use rounding; this implementation detail is critical
T roi_start_w = offset_bottom_rois[0] * spatial_scale;
T roi_start_h = offset_bottom_rois[1] * spatial_scale;
T roi_end_w = offset_bottom_rois[2] * spatial_scale;
T roi_end_h = offset_bottom_rois[3] * spatial_scale;
// Force malformed ROIs to be 1x1
T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
// we want to precalculate indices and weights shared by all channels;
// this is the key point of the optimization
std::vector<PreCalc<T>> pre_calc(
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
height,
width,
pooled_height,
pooled_width,
roi_bin_grid_h,
roi_bin_grid_w,
roi_start_h,
roi_start_w,
bin_size_h,
bin_size_w,
roi_bin_grid_h,
roi_bin_grid_w,
&pre_calc);
int c;
// Parallelize over channels when built with OpenMP; the original
// num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) clause
// is dropped because that helper is not available in this extension.
#pragma omp parallel for private(c)
for (c = 0; c < channels; c++) {
int index_n_c = index_n + c * pooled_width * pooled_height;
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
int index = index_n_c + ph * pooled_width + pw;
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc<T> pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_bottom_data[pc.pos1] +
pc.w2 * offset_bottom_data[pc.pos2] +
pc.w3 * offset_bottom_data[pc.pos3] +
pc.w4 * offset_bottom_data[pc.pos4];
pre_calc_index += 1;
}
}
output_val /= count;
top_data[index] = output_val;
} // for pw
} // for ph
} // for c
} // for n
}
template <typename T>
void bilinear_interpolate_gradient(
const int height,
const int width,
T y,
T x,
T* w1,
T* w2,
T* w3,
T* w4,
int* x_low,
int* x_high,
int* y_low,
int* y_high,
const int /*index*/ /* index for debug only*/) {
// deal with cases where the sampling point falls outside the feature map
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
*w1 = *w2 = *w3 = *w4 = 0.;
*x_low = *x_high = *y_low = *y_high = -1;
return;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
*y_low = static_cast<int>(y);
*x_low = static_cast<int>(x);
if (*y_low >= height - 1) {
*y_high = *y_low = height - 1;
y = (T)*y_low;
} else {
*y_high = *y_low + 1;
}
if (*x_low >= width - 1) {
*x_high = *x_low = width - 1;
x = (T)*x_low;
} else {
*x_high = *x_low + 1;
}
T ly = y - *y_low;
T lx = x - *x_low;
T hy = 1. - ly, hx = 1. - lx;
*w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
return;
}
template <class T>
inline void add(const T& val, T* address) {
*address += val;
}
template <typename T>
void ROIAlignBackwardCompute(
const int nthreads,
const T* top_diff,
const int /*num_rois*/,
const T& spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
T* bottom_diff,
const T* bottom_rois,
int rois_cols) {
for (int index = 0; index < nthreads; index++) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * rois_cols;
int roi_batch_ind = 0;
if (rois_cols == 5) {
roi_batch_ind = offset_bottom_rois[0];
offset_bottom_rois++;
}
// Do not use rounding; this implementation detail is critical
T roi_start_w = offset_bottom_rois[0] * spatial_scale;
T roi_start_h = offset_bottom_rois[1] * spatial_scale;
T roi_end_w = offset_bottom_rois[2] * spatial_scale;
T roi_end_h = offset_bottom_rois[3] * spatial_scale;
// Force malformed ROIs to be 1x1
T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height,
width,
y,
x,
&w1,
&w2,
&w3,
&w4,
&x_low,
&x_high,
&y_low,
&y_high,
index);
T g1 = top_diff_this_bin * w1 / count;
T g2 = top_diff_this_bin * w2 / count;
T g3 = top_diff_this_bin * w3 / count;
T g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
// atomic add is not needed for now since it is single threaded
add(static_cast<T>(g1), offset_bottom_diff + y_low * width + x_low);
add(static_cast<T>(g2), offset_bottom_diff + y_low * width + x_high);
add(static_cast<T>(g3), offset_bottom_diff + y_high * width + x_low);
add(static_cast<T>(g4), offset_bottom_diff + y_high * width + x_high);
} // if
} // ix
} // iy
} // for
} // ROIAlignBackward
at::Tensor ROIAlignForwardCPU(
const at::Tensor& input,
const at::Tensor& bottom_rois,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio) {
AT_ASSERT(input.is_contiguous());
AT_ASSERT(bottom_rois.is_contiguous());
AT_ASSERT(input.ndimension() == 4);
AT_ASSERT(bottom_rois.ndimension() == 2);
AT_ASSERT(bottom_rois.size(1) == 5);
// ROIs is the set of region proposals to process. It is a 2D at::Tensor where the first
// dim is the # of proposals, and the second dim is the proposal itself in the form
// [batch_index startW startH endW endH]
auto num_rois = bottom_rois.size(0);
auto roi_cols = bottom_rois.size(1);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
AT_ASSERT(roi_cols == 4 || roi_cols == 5);
// Output at::Tensor is (num_rois, C, pooled_height, pooled_width)
auto output = input.type().tensor({num_rois, channels, pooled_height, pooled_width});
AT_ASSERT(input.is_contiguous());
AT_ASSERT(bottom_rois.is_contiguous());
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlignForwardCPU", ([&] {
ROIAlignForwardCompute<scalar_t>(
output.numel(),
input.data<scalar_t>(),
static_cast<scalar_t>(spatial_scale),
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
bottom_rois.data<scalar_t>(),
roi_cols,
output.data<scalar_t>());
}));
return output;
}
at::Tensor ROIAlignBackwardCPU(
const at::Tensor& bottom_rois,
const at::Tensor& grad_output, // gradient of the output of the layer
int64_t b_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio)
{
AT_ASSERT(bottom_rois.is_contiguous());
AT_ASSERT(bottom_rois.ndimension() == 2);
AT_ASSERT(bottom_rois.size(1) == 5);
auto num_rois = bottom_rois.size(0);
auto roi_cols = bottom_rois.size(1);
AT_ASSERT(roi_cols == 4 || roi_cols == 5);
// Gradient w.r.t. the input features is (b_size, C, height, width)
auto grad_in = bottom_rois.type().tensor({b_size, channels, height, width}).zero_();
AT_ASSERT(bottom_rois.is_contiguous());
AT_DISPATCH_FLOATING_TYPES(bottom_rois.type(), "ROIAlignBackwardCPU", ([&] {
ROIAlignBackwardCompute<scalar_t>(
grad_output.numel(),
grad_output.data<scalar_t>(),
num_rois,
static_cast<scalar_t>(spatial_scale),
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_in.data<scalar_t>(),
bottom_rois.data<scalar_t>(),
roi_cols);
}));
return grad_in;
}
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CppExtension
setup(
name='enclib_cpu',
ext_modules=[
CppExtension('enclib_cpu', [
'roi_align.cpp',
'roi_align_cpu.cpp',
]),
],
cmdclass={
'build_ext': BuildExtension
})
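If the extension is instead built ahead of time with the setup.py above, the resulting module can be imported directly; a minimal sketch (the install step is an assumption):
# e.g. `python setup.py install` run inside encoding/lib/cpu
import torch        # import torch first so its C++ symbols are loaded
import enclib_cpu   # exposes roi_align_forward / roi_align_backward as bound above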
#include <cuda.h>
#include <cuda_runtime.h>
static const unsigned WARP_SIZE = 32;
// The maximum number of threads in a block
static const unsigned MAX_BLOCK_SIZE = 512U;
template<typename In, typename Out>
struct ScalarConvert {
static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; }
};
// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
static int getNumThreads(int nElem) {
int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
for (int i = 0; i != 5; ++i) {
if (nElem <= threadSizes[i]) {
return threadSizes[i];
}
}
return MAX_BLOCK_SIZE;
}
// Returns the index of the most significant 1 bit in `val`.
__device__ __forceinline__ int getMSB(int val) {
return 31 - __clz(val);
}
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_xor_sync(mask, value, laneMask, width);
#else
return __shfl_xor(value, laneMask, width);
#endif
}
// Sum across all threads within a warp
template <typename T>
static __device__ __forceinline__ T warpSum(T val) {
#if __CUDA_ARCH__ >= 300
for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
val += WARP_SHFL_XOR(val, 1 << i, WARP_SIZE);
}
#else
__shared__ T values[MAX_BLOCK_SIZE];
values[threadIdx.x] = val;
__threadfence_block();
const int base = (threadIdx.x / WARP_SIZE) * WARP_SIZE;
for (int i = 1; i < WARP_SIZE; i++) {
val += values[base + ((i + threadIdx.x) % WARP_SIZE)];
}
#endif
return val;
}
template <typename DType, typename Acctype>
struct Float2 {
Acctype v1, v2;
__device__ Float2() {}
__device__ Float2(DType v1, DType v2) : v1(ScalarConvert<DType, Acctype>::to(v1)), v2(ScalarConvert<DType, Acctype>::to(v2)) {}
__device__ Float2(DType v) : v1(ScalarConvert<DType, Acctype>::to(v)), v2(ScalarConvert<DType, Acctype>::to(v)) {}
__device__ Float2(int v) : v1(ScalarConvert<int, Acctype>::to(v)), v2(ScalarConvert<int, Acctype>::to(v)) {}
__device__ Float2& operator+=(const Float2& a) {
v1 += a.v1;
v2 += a.v2;
return *this;
}
};
template <typename DType, typename Acctype>
static __device__ __forceinline__ Float2<DType, Acctype> warpSum(Float2<DType, Acctype> value) {
value.v1 = warpSum(value.v1);
value.v2 = warpSum(value.v2);
return value;
}
#include <ATen/ATen.h>
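// A lightweight, size-only tensor view for kernels: no strides are stored, so
// select()/operator[] compute flat offsets and the wrapped at::Tensor is
// assumed to be contiguous.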
template<typename DType, int Dim>
struct DeviceTensor {
public:
inline __device__ __host__ DeviceTensor(DType *p, const int *size)
: dptr_(p) {
for (int i = 0; i < Dim; ++i) {
size_[i] = size ? size[i] : 0;
}
}
inline __device__ __host__ unsigned getSize(const int i) const {
assert(i < Dim);
return size_[i];
}
inline __device__ __host__ int numElements() const {
int n = 1;
for (int i = 0; i < Dim; ++i) {
n *= size_[i];
}
return n;
}
inline __device__ __host__ DeviceTensor<DType, Dim-1> select(const size_t x) const {
assert(Dim > 1);
int offset = x;
for (int i = 1; i < Dim; ++i) {
offset *= size_[i];
}
DeviceTensor<DType, Dim-1> tensor(dptr_ + offset, nullptr);
for (int i = 0; i < Dim - 1; ++i) {
tensor.size_[i] = this->size_[i+1];
}
return tensor;
}
inline __device__ __host__ DeviceTensor<DType, Dim-1> operator[](const size_t x) const {
assert(Dim > 1);
int offset = x;
for (int i = 1; i < Dim; ++i) {
offset *= size_[i];
}
DeviceTensor<DType, Dim-1> tensor(dptr_ + offset, nullptr);
for (int i = 0; i < Dim - 1; ++i) {
tensor.size_[i] = this->size_[i+1];
}
return tensor;
}
inline __device__ __host__ size_t InnerSize() const {
assert(Dim >= 3);
size_t sz = 1;
for (size_t i = 2; i < Dim; ++i) {
sz *= size_[i];
}
return sz;
}
inline __device__ __host__ size_t ChannelCount() const {
assert(Dim >= 3);
return size_[1];
}
inline __device__ __host__ DType* data_ptr() const {
return dptr_;
}
DType *dptr_;
int size_[Dim];
};
template<typename DType>
struct DeviceTensor<DType, 1> {
inline __device__ __host__ DeviceTensor(DType *p, const int *size)
: dptr_(p) {
size_[0] = size ? size[0] : 0;
}
inline __device__ __host__ unsigned getSize(const int i) const {
assert(i == 0);
return size_[0];
}
inline __device__ __host__ int numElements() const {
return size_[0];
}
inline __device__ __host__ DType &operator[](const size_t x) const {
return *(dptr_ + x);
}
inline __device__ __host__ DType* data_ptr() const {
return dptr_;
}
DType *dptr_;
int size_[1];
};
template<typename DType, int Dim>
static DeviceTensor<DType, Dim> devicetensor(const at::Tensor &blob) {
DType *data = blob.data<DType>();
DeviceTensor<DType, Dim> tensor(data, nullptr);
for (int i = 0; i < Dim; ++i) {
tensor.size_[i] = blob.size(i);
}
return tensor;
}
#include <ATen/ATen.h>
#include <vector>
#include "common.h"
#include "device_tensor.h"
namespace {
template<typename DType, typename Acctype>
struct AggOp {
__device__ AggOp(DeviceTensor<DType, 3> a,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c) : A(a), X(x), C(c) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(A[b][i][k] * (X[b][i][d] - C[k][d]));
}
DeviceTensor<DType, 3> A;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
};
template<typename DType, typename Acctype>
struct AggBackOp {
__device__ AggBackOp(DeviceTensor<DType, 3> g,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c) : G(g), X(x), C(c) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(G[b][k][d] * (X[b][i][d] - C[k][d]));
}
DeviceTensor<DType, 3> G;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
};
template<typename DType, typename Acctype>
struct SL2Op {
__device__ SL2Op(DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c) : X(x), C(c) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d)
{
DType r = X[b][i][d] - C[k][d];
return ScalarConvert<DType, Acctype>::to(r * r);
}
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
};
template<typename DType, typename Acctype>
struct SL2GradXOp {
__device__ SL2GradXOp(
DeviceTensor<DType, 3> gsl,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 1> s
) : GSL(gsl), X(x), C(c), S(s) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d)
{
return ScalarConvert<DType, Acctype>::to(
2 * S[k] * GSL[b][i][k] * (X[b][i][d]-C[k][d]));
}
DeviceTensor<DType, 3> GSL;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 1> S;
};
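// Block-wide reduction helpers used by the kernels below: each thread first
// accumulates a strided partial sum, warpSum combines lanes within a warp, and
// a 32-slot shared array merges the per-warp results; shared[0] is returned to
// every thread in the block.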
template<typename T, typename Op>
__device__ T reduceN(
Op op, int b, int k, int d, int N) {
T sum = 0;
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(b,x,k,d);
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
template<typename T, typename Op>
__device__ T reduceD(
Op op, int b, int i, int k, int D) {
T sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
template<typename T, typename Op>
__device__ T reduceK(
Op op, int b, int i, int d, int K) {
T sum = 0;
for (int x = threadIdx.x; x < K; x += blockDim.x) {
sum += op(b,i,x,d);
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
template<typename T, typename Op>
__device__ T reduceBN(
Op op,
int k, int d, int B, int N) {
T sum = 0;
for (int batch = 0; batch < B; ++batch) {
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(batch,x,k,d);
}
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
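// Aggregate_Forward_kernel computes E[b][k][d] = sum_i A[b][i][k] * (X[b][i][d] - C[k][d]),
// launched with one block per (d, k, b) triple.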
template<typename DType, typename Acctype>
__global__ void Aggregate_Forward_kernel (
DeviceTensor<DType, 3> E,
DeviceTensor<DType, 3> A,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C) {
/* declarations of the variables */
int b, k, d, N;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
AggOp<DType, Acctype> g(A,X,C);
E[b][k][d] = reduceN<Acctype>(g, b, k, d, N);
}
template<typename DType, typename Acctype>
__global__ void Aggregate_Backward_kernel (
DeviceTensor<DType, 3> GA,
DeviceTensor<DType, 3> GE,
DeviceTensor<DType, 3> A,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C) {
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
AggBackOp<DType, Acctype> g(GE,X,C);
GA[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}
template<typename DType, typename Acctype>
__global__ void ScaledL2_Forward_kernel (
DeviceTensor<DType, 3> SL,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 1> S) {
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x;
i = blockIdx.y;
D = X.getSize(2);
/* main operation */
SL2Op<DType, Acctype> g(X,C);
SL[b][i][k] = S[k] * reduceD<Acctype>(g,b,i,k,D);
}
template<typename DType, typename Acctype>
__global__ void ScaledL2_GradX_kernel (
DeviceTensor<DType, 3> GSL,
DeviceTensor<DType, 3> GX,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 1> S) {
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
i = blockIdx.y;
K = C.getSize(0);
/* main operation */
SL2GradXOp<DType, Acctype> g(GSL,X,C,S);
GX[b][i][d] = reduceK<Acctype>(g,b,i,d,K);
}
template<typename DType, typename Acctype>
__global__ void ScaledL2_GradC_kernel (
DeviceTensor<DType, 3> GSL,
DeviceTensor<DType, 2> GC,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 1> S) {
/* declarations of the variables */
int k, d, B, N;
/* Get the index and channels */
d = blockIdx.x;
k = blockIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* main operation */
SL2GradXOp<DType, Acctype> g(GSL,X,C,S);
GC[k][d] = - reduceBN<Acctype>(g, k, d, B, N);
}
}// namespace
at::Tensor Aggregate_Forward_CUDA(
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_) {
/* Device tensors */
auto E_ = A_.type().tensor({A_.size(0), C_.size(0), C_.size(1)}).zero_();
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(1), C_.size(0), X_.size(0));
dim3 threads(getNumThreads(X_.size(1)));
AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Forward_CUDA", ([&] {
DeviceTensor<scalar_t, 3> E = devicetensor<scalar_t, 3>(E_);
DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
/* kernel function */
Aggregate_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>>(E, A, X, C);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return E_;
}
std::vector<at::Tensor> Aggregate_Backward_CUDA(
const at::Tensor GE_,
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_) {
auto gradA_ = at::zeros_like(A_);
auto gradX_ = at::bmm(A_, GE_);
auto gradC_ = (-GE_ * A_.sum(1).unsqueeze(2)).sum(0);
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GA = devicetensor<scalar_t, 3>(gradA_);
DeviceTensor<scalar_t, 3> GE = devicetensor<scalar_t, 3>(GE_);
DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
Aggregate_Backward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (GA, GE, A, X, C);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {gradA_, gradX_, gradC_};
}
at::Tensor ScaledL2_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_) {
auto SL_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
AT_DISPATCH_FLOATING_TYPES(X_.type(), "ScaledL2_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> SL = devicetensor<scalar_t, 3>(SL_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 1> S = devicetensor<scalar_t, 1>(S_);
/* kernel function */
ScaledL2_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (SL, X, C, S);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return SL_;
}
std::vector<at::Tensor> ScaledL2_Backward_CUDA(
const at::Tensor GSL_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_,
const at::Tensor SL_) {
auto GX_ = at::zeros_like(X_);
auto GC_ = at::zeros_like(C_);
/* kernel function */
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
dim3 threads1(getNumThreads(C_.size(0)));
dim3 blocks2(C_.size(1), C_.size(0));
dim3 threads2(getNumThreads(X_.size(1)));
//std::vector<int> size{ 1, 1, K};
//auto GS_ = GSL_ * (SL_ / at::_unsafe_view(S_, size))
auto GS_ = (GSL_ * (SL_ / S_.view({1, 1, C_.size(0)}))).sum(0).sum(0);
AT_DISPATCH_FLOATING_TYPES(X_.type(), "ScaledL2_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GSL = devicetensor<scalar_t, 3>(GSL_);
DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
DeviceTensor<scalar_t, 2> GC = devicetensor<scalar_t, 2>(GC_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 1> S = devicetensor<scalar_t, 1>(S_);
ScaledL2_GradX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GSL, GX, X, C, S);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
ScaledL2_GradC_kernel<scalar_t, scalar_t>
<<<blocks2, threads2, 0, stream>>> (GSL, GC, X, C, S);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}));
return {GX_, GC_, GS_};
}
#include "operator.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("roi_align_forward", &ROIAlignForwardCUDA, "ROI Align forward (CUDA)");
m.def("roi_align_backward", &ROIAlignBackwardCUDA, "ROI Align backward (CUDA)");
m.def("aggregate_forward", &Aggregate_Forward_CUDA, "Aggregate forward (CUDA)");
m.def("aggregate_backward", &Aggregate_Backward_CUDA, "Aggregate backward (CUDA)");
m.def("scaled_l2_forward", &ScaledL2_Forward_CUDA, "ScaledL2 forward (CUDA)");
m.def("scaled_l2_backward", &ScaledL2_Backward_CUDA, "ScaledL2 backward (CUDA)");
m.def("batchnorm_forward", &BatchNorm_Forward_CUDA, "BatchNorm forward (CUDA)");
m.def("batchnorm_backward", &BatchNorm_Backward_CUDA, "BatchNorm backward (CUDA)");
m.def("sumsquare_forward", &Sum_Square_Forward_CUDA, "SumSqu forward (CUDA)");
m.def("sumsquare_backward", &Sum_Square_Backward_CUDA, "SumSqu backward (CUDA)");
}
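A minimal sketch of driving the encoding ops bound above from Python; the tensor shapes, the softmax step, and the variable names are illustrative assumptions, and `gpu` is the JIT-loaded module from encoding/lib/__init__.py above.
import torch
B, N, K, D = 2, 64, 32, 128
X = torch.randn(B, N, D, device='cuda')   # features
C = torch.randn(K, D, device='cuda')      # codewords
S = torch.randn(K, device='cuda')         # per-codeword scales
SL = gpu.scaled_l2_forward(X, C, S)       # (B, N, K): S[k] * ||X[b,i] - C[k]||^2
A = torch.softmax(SL, dim=2)              # soft assignments
E = gpu.aggregate_forward(A, X, C)        # (B, K, D): sum_i A[b,i,k] * (X[b,i] - C[k])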
#include <torch/torch.h>
#include <vector>
at::Tensor ROIAlignForwardCUDA(
const at::Tensor input,
const at::Tensor rois,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sample_ratio);
at::Tensor ROIAlignBackwardCUDA(
const at::Tensor rois,
const at::Tensor grad_output,
int64_t b_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio);
at::Tensor Aggregate_Forward_CUDA(
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_);
std::vector<at::Tensor> Aggregate_Backward_CUDA(
const at::Tensor GE_,
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_);
at::Tensor ScaledL2_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_);
std::vector<at::Tensor> ScaledL2_Backward_CUDA(
const at::Tensor GSL_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_,
const at::Tensor SL_);
at::Tensor BatchNorm_Forward_CUDA(
const at::Tensor input_,
const at::Tensor mean_,
const at::Tensor std_,
const at::Tensor gamma_,
const at::Tensor beta_);
std::vector<at::Tensor> BatchNorm_Backward_CUDA(
const at::Tensor gradoutput_,
const at::Tensor input_,
const at::Tensor mean_,
const at::Tensor std_,
const at::Tensor gamma_,
const at::Tensor beta_,
bool train);
std::vector<at::Tensor> Sum_Square_Forward_CUDA(
const at::Tensor input_);
at::Tensor Sum_Square_Backward_CUDA(
const at::Tensor input_,
const at::Tensor gradSum_,
const at::Tensor gradSquare_);
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
namespace {
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
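// Grid-stride loop: each thread processes indices i, i + blockDim.x * gridDim.x, ...
// so any nthreads can be covered by a bounded launch configuration.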
// The number of cuda threads to use. 512 is used for backward compatibility
constexpr int ROI_CUDA_NUM_THREADS = 512;
// The maximum number of blocks to use in the default kernel call.
constexpr int ROI_MAXIMUM_NUM_BLOCKS = 4096;
/**
* @brief Compute the number of blocks needed to run N threads.
*/
inline int ROI_GET_BLOCKS(const int N) {
return std::max(
std::min(
(N + ROI_CUDA_NUM_THREADS - 1) / ROI_CUDA_NUM_THREADS,
ROI_MAXIMUM_NUM_BLOCKS),
// Use at least 1 block, since CUDA does not allow empty block
1);
}
template <typename T>
__device__ T bilinear_interpolate(
const T* bottom_data,
const int height,
const int width,
T y,
T x,
const int index /* index for debug only*/) {
// deal with cases where the sampling point falls outside the feature map
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
return 0;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = static_cast<int>(y);
int x_low = static_cast<int>(x);
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = bottom_data[y_low * width + x_low];
T v2 = bottom_data[y_low * width + x_high];
T v3 = bottom_data[y_high * width + x_low];
T v4 = bottom_data[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
__global__ void RoIAlignForwardKernel(
const int nthreads,
const T* bottom_data,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const T* bottom_rois,
T* top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
// Do not use rounding; this implementation detail is critical
T roi_start_w = offset_bottom_rois[1] * spatial_scale;
T roi_start_h = offset_bottom_rois[2] * spatial_scale;
T roi_end_w = offset_bottom_rois[3] * spatial_scale;
T roi_end_h = offset_bottom_rois[4] * spatial_scale;
// Force malformed ROIs to be 1x1
T roi_width = max(roi_end_w - roi_start_w, (T)1.);
T roi_height = max(roi_end_h - roi_start_h, (T)1.);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T val = bilinear_interpolate(
offset_bottom_data, height, width, y, x, index);
output_val += val;
}
}
output_val /= count;
top_data[index] = output_val;
}
}
template <typename T>
__device__ void bilinear_interpolate_gradient(
const int height,
const int width,
T y,
T x,
T* w1,
T* w2,
T* w3,
T* w4,
int* x_low,
int* x_high,
int* y_low,
int* y_high,
const int /*index*/ /* index for debug only*/) {
// deal with cases where the sampling point falls outside the feature map
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
*w1 = *w2 = *w3 = *w4 = 0.;
*x_low = *x_high = *y_low = *y_high = -1;
return;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
*y_low = static_cast<int>(y);
*x_low = static_cast<int>(x);
if (*y_low >= height - 1) {
*y_high = *y_low = height - 1;
y = (T)*y_low;
} else {
*y_high = *y_low + 1;
}
if (*x_low >= width - 1) {
*x_high = *x_low = width - 1;
x = (T)*x_low;
} else {
*x_high = *x_low + 1;
}
T ly = y - *y_low;
T lx = x - *x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
*w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx;
return;
}
template <typename T>
inline __device__ T gpu_atomic_add(const T val, T* address);
template <>
inline __device__ float gpu_atomic_add(const float val, float* address) {
return atomicAdd(address, val);
}
template <>
inline __device__ double gpu_atomic_add(const double val, double* address) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull;
unsigned long long int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val +
__longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return val;
}
template <typename T>
__global__ void RoIAlignBackwardKernel(
const int nthreads,
const T* top_diff,
const int num_rois,
const T spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
T* bottom_diff,
const T* bottom_rois) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
// Do not use rounding; this implementation detail is critical
T roi_start_w = offset_bottom_rois[1] * spatial_scale;
T roi_start_h = offset_bottom_rois[2] * spatial_scale;
T roi_end_w = offset_bottom_rois[3] * spatial_scale;
T roi_end_h = offset_bottom_rois[4] * spatial_scale;
// Force malformed ROIs to be 1x1
T roi_width = max(roi_end_w - roi_start_w, (T)1.);
T roi_height = max(roi_end_h - roi_start_h, (T)1.);
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(
height,
width,
y,
x,
&w1,
&w2,
&w3,
&w4,
&x_low,
&x_high,
&y_low,
&y_high,
index);
T g1 = top_diff_this_bin * w1 / count;
T g2 = top_diff_this_bin * w2 / count;
T g3 = top_diff_this_bin * w3 / count;
T g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
/*
atomicAdd(
offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
atomicAdd(
offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
atomicAdd(
offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
atomicAdd(
offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
*/
gpu_atomic_add(
static_cast<T>(g1), offset_bottom_diff + y_low * width + x_low);
gpu_atomic_add(
static_cast<T>(g2), offset_bottom_diff + y_low * width + x_high);
gpu_atomic_add(
static_cast<T>(g3), offset_bottom_diff + y_high * width + x_low);
gpu_atomic_add(
static_cast<T>(g4), offset_bottom_diff + y_high * width + x_high);
} // if
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // RoIAlignBackward
} // namespace
at::Tensor ROIAlignForwardCUDA(
const at::Tensor input,
const at::Tensor rois,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio) {
AT_ASSERT(input.is_contiguous());
AT_ASSERT(rois.is_contiguous());
AT_ASSERT(input.ndimension() == 4);
AT_ASSERT(rois.ndimension() == 2);
AT_ASSERT(rois.size(1) == 5);
auto proposals = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
// Output Tensor is (num_rois, C, pooled_height, pooled_width)
auto output = input.type().tensor({proposals, channels, pooled_height, pooled_width});
auto count = output.numel();
AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlignForwardCUDA", ([&] {
RoIAlignForwardKernel<scalar_t>
<<<ROI_GET_BLOCKS(count),
ROI_CUDA_NUM_THREADS,
0,
at::globalContext().getCurrentCUDAStream()>>>(
count,
input.data<scalar_t>(),
static_cast<scalar_t>(spatial_scale),
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
rois.data<scalar_t>(),
output.data<scalar_t>());
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return output;
}
at::Tensor ROIAlignBackwardCUDA(
const at::Tensor rois,
const at::Tensor grad_output,
int64_t b_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio) {
AT_ASSERT(rois.is_contiguous());
AT_ASSERT(rois.ndimension() == 2);
AT_ASSERT(rois.size(1) == 5);
auto roi_cols = rois.size(1);
AT_ASSERT(roi_cols == 4 || roi_cols == 5);
// Gradient w.r.t. the input features is (b_size, C, height, width)
auto grad_in = rois.type().tensor({b_size, channels, height, width}).zero_();
auto num_rois = rois.size(0);
auto count = grad_output.numel();
AT_DISPATCH_FLOATING_TYPES(rois.type(), "ROIAlignBackwardCUDA", ([&] {
RoIAlignBackwardKernel<scalar_t>
<<<ROI_GET_BLOCKS(count),
ROI_CUDA_NUM_THREADS,
0,
at::globalContext().getCurrentCUDAStream()>>>(
count,
grad_output.data<scalar_t>(),
num_rois,
static_cast<scalar_t>(spatial_scale),
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
grad_in.data<scalar_t>(),
rois.data<scalar_t>());
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return grad_in;
}
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='enclib_gpu',
ext_modules=[
CUDAExtension('enclib_gpu', [
'operator.cpp',
'encoding_kernel.cu',
'syncbn_kernel.cu',
'roi_align_kernel.cu',
]),
],
cmdclass={
'build_ext': BuildExtension
})
#include <ATen/ATen.h>
#include <vector>
#include "common.h"
#include "device_tensor.h"
namespace {
template <typename DType, typename Acctype, typename DeviceTensor3>
struct GradOp {
__device__ GradOp(Acctype m, const DeviceTensor3 i, const DeviceTensor3 g)
: mean(m), input(i), gradOutput(g) {}
__device__ __forceinline__ Float2<DType, Acctype> operator()(int batch, int plane, int n) {
DType g = gradOutput[batch][plane][n];
DType c = ScalarConvert<Acctype, DType>::to(input[batch][plane][n] - mean);
return Float2<DType, Acctype>(g, g * c);
}
const Acctype mean;
const DeviceTensor3 input;
const DeviceTensor3 gradOutput;
};
template <typename DType, typename Acctype>
struct SumOp {
__device__ SumOp(DeviceTensor<DType, 3> i) : input(i){}
__device__ __forceinline__ Float2<DType, Acctype> operator()(int batch, int plane, int n) {
DType g = input[batch][plane][n];
return Float2<DType, Acctype>(g, g * g);
}
DType mean;
DeviceTensor<DType, 3> input;
};
// Sum across (batch, x/y/z) applying Op() pointwise
template<typename T, typename Op, typename DeviceTensor3>
__device__ T reduce(Op op, DeviceTensor3 tensor, int plane) {
T sum = (T)0;
for (int batch = 0; batch < tensor.getSize(0); ++batch) {
for (int x = threadIdx.x; x < tensor.getSize(2); x += blockDim.x) {
sum += op(batch, plane, x);
}
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T)0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
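// BatchNorm_Forward_kernel: y = gamma * (x - mean) / std + beta, one block per
// channel.  The backward kernel below writes only the direct input gradient
// (grad_out * gamma / std); the contributions through mean and std are returned
// separately as gradMean / gradStd.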
template <typename DType>
__global__ void BatchNorm_Forward_kernel (
DeviceTensor<DType, 3> output,
DeviceTensor<DType, 3> input,
DeviceTensor<DType, 1> mean,
DeviceTensor<DType, 1> std,
DeviceTensor<DType, 1> gamma,
DeviceTensor<DType, 1> beta) {
int c = blockIdx.x;
/* main operation */
for (int b = 0; b < input.getSize(0); ++b) {
for (int x = threadIdx.x; x < input.getSize(2); x += blockDim.x) {
DType inp = input[b][c][x];
output[b][c][x] = gamma[c] * (inp - mean[c]) /
std[c] + beta[c];
}
}
}
template <typename DType>
__global__ void BatchNorm_Backward_kernel (
DeviceTensor<DType, 3> gradoutput,
DeviceTensor<DType, 3> input,
DeviceTensor<DType, 3> gradinput,
DeviceTensor<DType, 1> gradgamma,
DeviceTensor<DType, 1> gradbeta,
DeviceTensor<DType, 1> mean,
DeviceTensor<DType, 1> std,
DeviceTensor<DType, 1> gamma,
DeviceTensor<DType, 1> beta,
DeviceTensor<DType, 1> gradMean,
DeviceTensor<DType, 1> gradStd,
bool train) {
/* declarations of the variables */
/* Get the index and channels */
int c = blockIdx.x;
/* main operation */
GradOp<DType, DType, DeviceTensor<DType, 3>> g(mean[c], input, gradoutput);
Float2<DType, DType> res = reduce<Float2<DType, DType>,
GradOp<DType, DType, DeviceTensor<DType, 3>>,
DeviceTensor<DType, 3>>(g, gradoutput, c);
DType gradOutputSum = res.v1;
DType dotP = res.v2;
DType invstd = DType(1.0) / std[c];
DType gradScale = invstd * gamma[c];
if (train && threadIdx.x == 0) {
gradMean[c] = - gradOutputSum * gamma[c] * invstd;
gradStd[c] = - dotP * gamma[c] * invstd * invstd;
}
if (gradinput.numElements() > 0) {
for (int batch = 0; batch < gradoutput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradoutput.getSize(2); x += blockDim.x) {
gradinput[batch][c][x] = gradoutput[batch][c][x] * gradScale;
}
}
}
if (gradgamma.numElements() > 0) {
if (threadIdx.x == 0) {
gradgamma[c] += dotP * invstd;
}
}
if (gradbeta.numElements() > 0) {
if (threadIdx.x == 0) {
gradbeta[c] += gradOutputSum;
}
}
}
template <typename DType>
__global__ void Sum_Square_Forward_kernel (
DeviceTensor<DType, 3> input,
DeviceTensor<DType, 1> sum,
DeviceTensor<DType, 1> square) {
int c = blockIdx.x;
/* main operation */
SumOp<DType, DType> g(input);
Float2<DType, DType> res = reduce<Float2<DType, DType>,
SumOp<DType, DType>, DeviceTensor<DType, 3>>(g, input, c);
DType xsum = res.v1;
DType xsquare = res.v2;
if (threadIdx.x == 0) {
sum[c] = xsum;
square[c] = xsquare;
}
}
template <typename DType>
__global__ void Sum_Square_Backward_kernel (
DeviceTensor<DType, 3> gradInput,
DeviceTensor<DType, 3> input,
DeviceTensor<DType, 1> gradSum,
DeviceTensor<DType, 1> gradSquare) {
int c = blockIdx.x;
/* main operation */
for (int batch = 0; batch < gradInput.getSize(0); ++batch) {
for (int x = threadIdx.x; x < gradInput.getSize(2); x += blockDim.x)
{
gradInput[batch][c][x] = gradSum[c] + 2 * gradSquare[c] *
input[batch][c][x];
}
}
}
} // namespace
at::Tensor BatchNorm_Forward_CUDA(
const at::Tensor input_,
const at::Tensor mean_,
const at::Tensor std_,
const at::Tensor gamma_,
const at::Tensor beta_) {
auto output_ = at::zeros_like(input_);
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
dim3 blocks(input_.size(1));
dim3 threads(getNumThreads(input_.size(2)));
AT_DISPATCH_FLOATING_TYPES(input_.type(), "BatchNorm_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> output = devicetensor<scalar_t, 3>(output_);
DeviceTensor<scalar_t, 3> input = devicetensor<scalar_t, 3>(input_);
DeviceTensor<scalar_t, 1> mean = devicetensor<scalar_t, 1>(mean_);
DeviceTensor<scalar_t, 1> std = devicetensor<scalar_t, 1>(std_);
DeviceTensor<scalar_t, 1> gamma = devicetensor<scalar_t, 1>(gamma_);
DeviceTensor<scalar_t, 1> beta = devicetensor<scalar_t, 1>(beta_);
/* kernel function */
BatchNorm_Forward_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
output, input, mean, std, gamma, beta);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return output_;
}
std::vector<at::Tensor> BatchNorm_Backward_CUDA(
const at::Tensor gradoutput_,
const at::Tensor input_,
const at::Tensor mean_,
const at::Tensor std_,
const at::Tensor gamma_,
const at::Tensor beta_,
bool train) {
/* outputs*/
at::Tensor gradinput_ = at::zeros_like(input_);
at::Tensor gradgamma_ = at::zeros_like(gamma_);
at::Tensor gradbeta_ = at::zeros_like(beta_);
at::Tensor gradMean_ = at::zeros_like(mean_);
at::Tensor gradStd_ = at::zeros_like(std_);
/* cuda utils*/
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
dim3 blocks(input_.size(1));
dim3 threads(getNumThreads(input_.size(2)));
AT_DISPATCH_FLOATING_TYPES(input_.type(), "BatchNorm_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> gradoutput = devicetensor<scalar_t, 3>(gradoutput_);
DeviceTensor<scalar_t, 3> input = devicetensor<scalar_t, 3>(input_);
DeviceTensor<scalar_t, 3> gradinput = devicetensor<scalar_t, 3>(gradinput_);
DeviceTensor<scalar_t, 1> gradgamma = devicetensor<scalar_t, 1>(gradgamma_);
DeviceTensor<scalar_t, 1> gradbeta = devicetensor<scalar_t, 1>(gradbeta_);
DeviceTensor<scalar_t, 1> mean = devicetensor<scalar_t, 1>(mean_);
DeviceTensor<scalar_t, 1> std = devicetensor<scalar_t, 1>(std_);
DeviceTensor<scalar_t, 1> gamma = devicetensor<scalar_t, 1>(gamma_);
DeviceTensor<scalar_t, 1> beta = devicetensor<scalar_t, 1>(beta_);
DeviceTensor<scalar_t, 1> gradMean = devicetensor<scalar_t, 1>(gradMean_);
DeviceTensor<scalar_t, 1> gradStd = devicetensor<scalar_t, 1>(gradStd_);
/* kernel function */
BatchNorm_Backward_kernel<scalar_t>
<<<blocks, threads, 0, stream>>>(
gradoutput, input, gradinput, gradgamma, gradbeta, mean, std,
gamma, beta, gradMean, gradStd, train);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {gradinput_, gradMean_, gradStd_, gradgamma_, gradbeta_};
}
std::vector<at::Tensor> Sum_Square_Forward_CUDA(
const at::Tensor input_) {
/* outputs */
at::Tensor sum_ = input_.type().tensor({input_.size(1)}).zero_();
at::Tensor square_ = input_.type().tensor({input_.size(1)}).zero_();
/* cuda utils*/
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
dim3 blocks(input_.size(1));
dim3 threads(getNumThreads(input_.size(2)));
AT_DISPATCH_FLOATING_TYPES(input_.type(), "Sum_Square_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> input = devicetensor<scalar_t, 3>(input_);
DeviceTensor<scalar_t, 1> sum = devicetensor<scalar_t, 1>(sum_);
DeviceTensor<scalar_t, 1> square = devicetensor<scalar_t, 1>(square_);
/* kernel function */
Sum_Square_Forward_kernel<scalar_t>
<<<blocks, threads, 0, stream>>>(input, sum, square);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {sum_, square_};
}
at::Tensor Sum_Square_Backward_CUDA(
const at::Tensor input_,
const at::Tensor gradSum_,
const at::Tensor gradSquare_) {
/* outputs */
at::Tensor gradInput_ = at::zeros_like(input_);
/* cuda utils*/
cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
dim3 blocks(input_.size(1));
dim3 threads(getNumThreads(input_.size(2)));
AT_DISPATCH_FLOATING_TYPES(input_.type(), "Sum_Square_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> gradInput = devicetensor<scalar_t, 3>(gradInput_);
DeviceTensor<scalar_t, 3> input = devicetensor<scalar_t, 3>(input_);
DeviceTensor<scalar_t, 1> gradSum = devicetensor<scalar_t, 1>(gradSum_);
DeviceTensor<scalar_t, 1> gradSquare =devicetensor<scalar_t, 1>(gradSquare_);
/* kernel function */
Sum_Square_Backward_kernel<scalar_t>
<<<blocks, threads, 0, stream>>>(gradInput, input, gradSum, gradSquare);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return gradInput_;
}
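For reference, a sketch of how the per-channel sum / sum-of-squares reductions above can be turned into the mean and std consumed by BatchNorm_Forward_CUDA; the eps value and any cross-device reduction are assumptions, not shown in this file.
import torch

def stats_from_sum_square(xsum, xsquare, count, eps=1e-5):
    # E[x] = sum / count, Var[x] = E[x^2] - E[x]^2
    mean = xsum / count
    var = xsquare / count - mean * mean
    return mean, (var + eps).sqrt()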
import torch
import encoding
# Get the model
model = encoding.models.get_model('fcn_resnet50_ade', pretrained=True).cuda()
model.eval()
# Prepare the image
url = 'https://github.com/zhanghang1989/image-data/blob/master/' + \
'encoding/segmentation/ade20k/ADE_val_00001142.jpg?raw=true'
filename = 'example.jpg'
img = encoding.utils.load_image(
encoding.utils.download(url, filename)).cuda().unsqueeze(0)
# Make prediction
output = model.evaluate(img)
predict = torch.max(output, 1)[1].cpu().numpy() + 1
# Get color palette for visualization
mask = encoding.utils.get_mask_pallete(predict, 'ade20k')
mask.save('output.png')
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import argparse
import torch
class Options():
def __init__(self):
parser = argparse.ArgumentParser(description='PyTorch Segmentation')
# model and dataset
parser.add_argument('--model', type=str, default='encnet',
help='model name (default: encnet)')
parser.add_argument('--backbone', type=str, default='resnet50',
help='backbone name (default: resnet50)')
parser.add_argument('--dataset', type=str, default='ade20k',
help='dataset name (default: ade20k)')
parser.add_argument('--data-folder', type=str,
default=os.path.join(os.environ['HOME'], 'data'),
help='training dataset folder (default: $HOME/data)')
parser.add_argument('--workers', type=int, default=4,
metavar='N', help='dataloader threads')
# training hyper params
parser.add_argument('--aux', action='store_true', default=False,
help='Auxiliary loss')
parser.add_argument('--se-loss', action='store_true', default=False,
help='Semantic Encoding Loss (SE-loss)')
parser.add_argument('--epochs', type=int, default=None, metavar='N',
help='number of epochs to train (default: auto)')
parser.add_argument('--start_epoch', type=int, default=0,
metavar='N', help='start epochs (default:0)')
parser.add_argument('--batch-size', type=int, default=None,
metavar='N', help='input batch size for training (default: auto)')
parser.add_argument('--test-batch-size', type=int, default=16,
metavar='N', help='input batch size for testing (default: 16)')
parser.add_argument('--lr', type=float, default=None, metavar='LR',
help='learning rate (default: auto)')
parser.add_argument('--lr-scheduler', type=str, default='poly',
help='learning rate scheduler (default: poly)')
parser.add_argument('--momentum', type=float, default=0.9,
metavar='M', help='momentum (default: 0.9)')
parser.add_argument('--weight-decay', type=float, default=1e-4,
metavar='M', help='w-decay (default: 1e-4)')
# cuda, seed and logging
parser.add_argument('--no-cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
# checking point
parser.add_argument('--resume', type=str, default=None,
help='put the path to resuming file if needed')
parser.add_argument('--checkname', type=str, default='default',
help='set the checkpoint name')
parser.add_argument('--model-zoo', type=str, default=None,
help='evaluating on model zoo model')
# finetuning pre-trained models
parser.add_argument('--ft', action='store_true', default=False,
help='finetuning on a different dataset')
parser.add_argument('--pre-class', type=int, default=None,
help='number of pre-trained classes (default: None)')
# evaluation option
parser.add_argument('--eval', action='store_true', default=False,
help='evaluating mIoU')
parser.add_argument('--no-val', action='store_true', default=False,
help='skip validation during training')
# test option
parser.add_argument('--test-folder', type=str, default=None,
help='path to test image folder')
# the parser
self.parser = parser
def parse(self):
args = self.parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
# default settings for epochs, batch_size and lr
if args.epochs is None:
            epochs = {
                'pascal_voc': 50,
                'pascal_aug': 50,
                'pcontext': 50,
                'ade20k': 120,
            }
            args.epochs = epochs[args.dataset.lower()]
if args.batch_size is None:
args.batch_size = 4 * torch.cuda.device_count()
if args.lr is None:
lrs = {
'pascal_voc': 0.0001,
'pascal_aug': 0.001,
'pcontext': 0.001,
'ade20k': 0.01,
}
args.lr = lrs[args.dataset.lower()] / 16 * args.batch_size
return args
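# Usage sketch (illustration only; the script name and the 4-GPU setup below are
# assumptions, not part of this file):
#
#   args = Options().parse()   # e.g. `python train.py --dataset ade20k` on 4 GPUs
#   # -> args.epochs == 120, args.batch_size == 4 * 4 == 16,
#   #    args.lr == 0.01 / 16 * 16 == 0.01  (the base lr is scaled by batch size)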
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import numpy as np
from tqdm import tqdm
import torch
from torch.utils import data
import torchvision.transforms as transform
from torch.nn.parallel.scatter_gather import gather
import encoding.utils as utils
from encoding.nn import SegmentationLosses, BatchNorm2d
from encoding.parallel import DataParallelModel, DataParallelCriterion
from encoding.datasets import get_segmentation_dataset, test_batchify_fn
from encoding.models import get_model, get_segmentation_model, MultiEvalModule
from option import Options
torch_ver = torch.__version__[:3]
if torch_ver == '0.3':
from torch.autograd import Variable
def test(args):
# output folder
outdir = 'outdir'
if not os.path.exists(outdir):
os.makedirs(outdir)
# data transforms
input_transform = transform.Compose([
transform.ToTensor(),
transform.Normalize([.485, .456, .406], [.229, .224, .225])])
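    # the mean/std above are the standard ImageNet statistics expected by the
    # ImageNet-pretrained backbone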
# dataset
if args.eval:
testset = get_segmentation_dataset(args.dataset, split='val', mode='testval',
transform=input_transform)
else:
testset = get_segmentation_dataset(args.dataset, split='test', mode='test',
transform=input_transform)
# dataloader
kwargs = {'num_workers': args.workers, 'pin_memory': True} \
if args.cuda else {}
    test_data = data.DataLoader(testset, batch_size=args.test_batch_size,
drop_last=False, shuffle=False,
collate_fn=test_batchify_fn, **kwargs)
    # model
    if args.model_zoo is not None:
        model = get_model(args.model_zoo, pretrained=True)
    else:
        model = get_segmentation_model(args.model, dataset=args.dataset,
                                       backbone=args.backbone, aux=args.aux,
                                       se_loss=args.se_loss, norm_layer=BatchNorm2d)
        # resuming checkpoint (only needed when not using a model-zoo model)
        if args.resume is None or not os.path.isfile(args.resume):
            raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        # strict=False, so that it is compatible with old pytorch saved models
        model.load_state_dict(checkpoint['state_dict'], strict=False)
        print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
print(model)
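    # MultiEvalModule (as used below) runs evaluation across the available GPUs;
    # parallel_forward takes a list of image tensors, one per GPU, as produced by
    # test_batchify_fn, and returns one output per input image.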
evaluator = MultiEvalModule(model, testset.num_class).cuda()
evaluator.eval()
tbar = tqdm(test_data)
def eval_batch(image, dst, evaluator, eval_mode):
if eval_mode:
# evaluation mode on validation set
targets = dst
outputs = evaluator.parallel_forward(image)
batch_inter, batch_union, batch_correct, batch_label = 0, 0, 0, 0
for output, target in zip(outputs, targets):
correct, labeled = utils.batch_pix_accuracy(output.data.cpu(), target)
inter, union = utils.batch_intersection_union(
output.data.cpu(), target, testset.num_class)
batch_correct += correct
batch_label += labeled
batch_inter += inter
batch_union += union
return batch_correct, batch_label, batch_inter, batch_union
else:
# test mode, dump the results
im_paths = dst
outputs = evaluator.parallel_forward(image)
predicts = [torch.max(output, 1)[1].cpu().numpy() + testset.pred_offset
for output in outputs]
for predict, impath in zip(predicts, im_paths):
mask = utils.get_mask_pallete(predict, args.dataset)
outname = os.path.splitext(impath)[0] + '.png'
mask.save(os.path.join(outdir, outname))
            # dummy outputs to stay compatible with eval mode
return 0, 0, 0, 0
total_inter, total_union, total_correct, total_label = \
np.int64(0), np.int64(0), np.int64(0), np.int64(0)
for i, (image, dst) in enumerate(tbar):
if torch_ver == "0.3":
image = Variable(image, volatile=True)
correct, labeled, inter, union = eval_batch(image, dst, evaluator, args.eval)
else:
with torch.no_grad():
correct, labeled, inter, union = eval_batch(image, dst, evaluator, args.eval)
if args.eval:
total_correct += correct
total_label += labeled
total_inter += inter.astype('int64')
total_union += union.astype('int64')
pixAcc = np.float64(1.0) * total_correct / (np.spacing(1, dtype=np.float64) + total_label)
IoU = np.float64(1.0) * total_inter / (np.spacing(1, dtype=np.float64) + total_union)
mIoU = IoU.mean()
tbar.set_description(
'pixAcc: %.4f, mIoU: %.4f' % (pixAcc, mIoU))
if __name__ == "__main__":
args = Options().parse()
torch.manual_seed(args.seed)
args.test_batch_size = torch.cuda.device_count()
test(args)
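# Example invocations (a sketch; the script name test.py and the checkpoint path
# are assumptions):
#
#   # report pixAcc/mIoU on the validation set
#   python test.py --dataset ade20k --model encnet --eval --resume <checkpoint.pth.tar>
#   # dump color-coded predictions for the test split into ./outdir
#   python test.py --dataset ade20k --model encnet --resume <checkpoint.pth.tar>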
import importlib
import torch
import encoding
from option import Options
from torch.autograd import Variable
if __name__ == "__main__":
args = Options().parse()
model = encoding.models.get_segmentation_model(args.model, dataset=args.dataset, aux=args.aux,
se_loss=args.se_loss, norm_layer=torch.nn.BatchNorm2d)
print('Creating the model:')
print(model)
model.cuda()
    x = Variable(torch.randn(4, 3, 480, 480)).cuda()  # random input (avoids uninitialized memory)
with torch.no_grad():
out = model(x)
for y in out:
print(y.size())
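# Expected behaviour (an assumption based on the flags): with --aux/--se-loss
# enabled the forward pass returns a tuple whose first element has shape
# (4, nclass, 480, 480); the remaining elements come from the auxiliary and
# SE-loss heads, so the loop above prints one size per head.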
###########################################################################
# Created by: Hang Zhang
# Email: zhang.hang@rutgers.edu
# Copyright (c) 2017
###########################################################################
import os
import numpy as np
from tqdm import tqdm
import torch
from torch.utils import data
import torchvision.transforms as transform
from torch.nn.parallel.scatter_gather import gather
import encoding.utils as utils
from encoding.nn import SegmentationLosses, BatchNorm2d
from encoding.parallel import DataParallelModel, DataParallelCriterion
from encoding.datasets import get_segmentation_dataset
from encoding.models import get_segmentation_model
from option import Options
torch_ver = torch.__version__[:3]
if torch_ver == '0.3':
from torch.autograd import Variable
class Trainer():
def __init__(self, args):
self.args = args
# data transforms
input_transform = transform.Compose([
transform.ToTensor(),
transform.Normalize([.485, .456, .406], [.229, .224, .225])])
# dataset
trainset = get_segmentation_dataset(args.dataset, split='train',
transform=input_transform)
testset = get_segmentation_dataset(args.dataset, split='val',
transform=input_transform)
# dataloader
kwargs = {'num_workers': args.workers, 'pin_memory': True} \
if args.cuda else {}
self.trainloader = data.DataLoader(trainset, batch_size=args.batch_size,
drop_last=True, shuffle=True, **kwargs)
self.valloader = data.DataLoader(testset, batch_size=args.batch_size,
drop_last=False, shuffle=False, **kwargs)
self.nclass = trainset.num_class
# model
model = get_segmentation_model(args.model, dataset=args.dataset,
                                       backbone=args.backbone, aux=args.aux,
                                       se_loss=args.se_loss, norm_layer=BatchNorm2d)
print(model)
# optimizer using different LR
params_list = [{'params': model.pretrained.parameters(), 'lr': args.lr},]
if hasattr(model, 'head'):
params_list.append({'params': model.head.parameters(), 'lr': args.lr*10})
if hasattr(model, 'auxlayer'):
params_list.append({'params': model.auxlayer.parameters(), 'lr': args.lr*10})
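        # the randomly initialized head and auxiliary layer are trained with
        # 10x the backbone learning rate, a common fine-tuning heuristic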
optimizer = torch.optim.SGD(params_list,
lr=args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)
# resuming checkpoint
if args.resume is not None:
if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
checkpoint = torch.load(args.resume)
args.start_epoch = checkpoint['epoch']
model.load_state_dict(checkpoint['state_dict'])
if not args.ft:
optimizer.load_state_dict(checkpoint['optimizer'])
best_pred = checkpoint['best_pred']
print("=> loaded checkpoint '{}' (epoch {})"
.format(args.resume, checkpoint['epoch']))
# clear start epoch if fine-tuning
if args.ft:
args.start_epoch = 0
# criterions
self.criterion = SegmentationLosses(se_loss=args.se_loss, aux=args.aux, nclass=self.nclass)
self.model, self.optimizer = model, optimizer
# using cuda
if args.cuda:
self.model = DataParallelModel(self.model).cuda()
self.criterion = DataParallelCriterion(self.criterion).cuda()
# lr scheduler
self.scheduler = utils.LR_Scheduler(args, len(self.trainloader))
self.best_pred = 0.0
def training(self, epoch):
train_loss = 0.0
self.model.train()
tbar = tqdm(self.trainloader)
for i, (image, target) in enumerate(tbar):
self.scheduler(self.optimizer, i, epoch, self.best_pred)
self.optimizer.zero_grad()
if torch_ver == "0.3":
image = Variable(image)
target = Variable(target)
outputs = self.model(image)
loss = self.criterion(outputs, target)
loss.backward()
self.optimizer.step()
train_loss += loss.item()
tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
# save checkpoint every epoch
is_best = False
utils.save_checkpoint({
'epoch': epoch + 1,
'state_dict': self.model.module.state_dict(),
'optimizer': self.optimizer.state_dict(),
'best_pred': self.best_pred,
}, self.args, is_best)
def validation(self, epoch):
# Fast test during the training
def eval_batch(image, target):
outputs = self.model(image)
outputs = gather(outputs, 0, dim=0)
pred = outputs[0]
target = target.cuda()
correct, labeled = utils.batch_pix_accuracy(pred.data, target)
inter, union = utils.batch_intersection_union(pred.data, target, self.nclass)
return correct, labeled, inter, union
is_best = False
self.model.eval()
total_inter, total_union, total_correct, total_label = 0, 0, 0, 0
tbar = tqdm(self.valloader, desc='\r')
for i, (image, target) in enumerate(tbar):
if torch_ver == "0.3":
image = Variable(image, volatile=True)
correct, labeled, inter, union = eval_batch(image, target)
else:
with torch.no_grad():
correct, labeled, inter, union = eval_batch(image, target)
total_correct += correct
total_label += labeled
total_inter += inter
total_union += union
pixAcc = 1.0 * total_correct / (np.spacing(1) + total_label)
IoU = 1.0 * total_inter / (np.spacing(1) + total_union)
mIoU = IoU.mean()
tbar.set_description(
'pixAcc: %.3f, mIoU: %.3f' % (pixAcc, mIoU))
new_pred = (pixAcc + mIoU)/2
if new_pred > self.best_pred:
is_best = True
self.best_pred = new_pred
utils.save_checkpoint({
'epoch': epoch + 1,
'state_dict': self.model.module.state_dict(),
'optimizer': self.optimizer.state_dict(),
'best_pred': self.best_pred,
}, self.args, is_best)
if __name__ == "__main__":
args = Options().parse()
torch.manual_seed(args.seed)
trainer = Trainer(args)
print('Starting Epoch:', args.start_epoch)
    print('Total Epochs:', args.epochs)
for epoch in range(args.start_epoch, args.epochs):
trainer.training(epoch)
if not args.no_val:
trainer.validation(epoch)
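# Example invocation (a sketch; the script name train.py and the flag choices are
# assumptions):
#
#   CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --dataset ade20k \
#       --model encnet --backbone resnet50 --aux --se-loss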