Unverified commit f8919197 authored by Hang Zhang, committed by GitHub
parent d4e19553
# ninja log v5
1 1662 1531167904 encoding_cpu.o 6992bc0e0fec6bba
1 1662 1531167904 syncbn_cpu.o f23a0b2f19307f1b
0 2948 1531167905 operator.o 44f004abdc4bd
ninja_required_version = 1.3
cxx = c++
cflags = -DTORCH_EXTENSION_NAME=enclib_cpu -I/anaconda3/lib/python3.6/site-packages/torch/lib/include -I/anaconda3/lib/python3.6/site-packages/torch/lib/include/TH -I/anaconda3/lib/python3.6/site-packages/torch/lib/include/THC -I/anaconda3/include/python3.6m -fPIC -std=c++11
ldflags = -shared -undefined dynamic_lookup
rule compile
command = $cxx -MMD -MF $out.d $cflags -c $in -o $out
depfile = $out.d
deps = gcc
rule link
command = $cxx $ldflags $in -o $out
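# Note on the two rules above: `compile` invokes $cxx with the cflags defined
# earlier and records header dependencies via -MMD/-MF (consumed through
# `deps = gcc`), while `link` combines the listed object files into the shared
# extension module using the ldflags above.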
build operator.o: compile /Users/hzaws/git/PyTorch-Encoding-Layer-/encoding/lib/cpu/operator.cpp
build encoding_cpu.o: compile /Users/hzaws/git/PyTorch-Encoding-Layer-/encoding/lib/cpu/encoding_cpu.cpp
build syncbn_cpu.o: compile /Users/hzaws/git/PyTorch-Encoding-Layer-/encoding/lib/cpu/syncbn_cpu.cpp
build roi_align_cpu.o: compile /Users/hzaws/git/PyTorch-Encoding-Layer-/encoding/lib/cpu/roi_align_cpu.cpp
build nms_cpu.o: compile /Users/hzaws/git/PyTorch-Encoding-Layer-/encoding/lib/cpu/nms_cpu.cpp
build enclib_cpu.so: link operator.o encoding_cpu.o syncbn_cpu.o roi_align_cpu.o nms_cpu.o
default enclib_cpu.so
#include <ATen/ATen.h>
#include <vector>
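// Shape conventions, inferred from the expand() calls below: X is B x N x D
// (descriptors), C is K x D (codewords), A is B x N x K (assignment weights),
// and S has K entries (scaling factors). Aggregate_Forward_CPU therefore
// returns a B x K x D tensor and ScaledL2_Forward_CPU a B x N x K tensor.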
at::Tensor Aggregate_Forward_CPU(
const at::Tensor A,
const at::Tensor X,
const at::Tensor C) {
auto E = (A.unsqueeze(3) * (X.unsqueeze(2).expand({X.size(0), X.size(1),
C.size(0), C.size(1)}) - C.unsqueeze(0).unsqueeze(0))).sum(1);
return E;
}
std::vector<at::Tensor> Aggregate_Backward_CPU(
const at::Tensor GE,
const at::Tensor A,
const at::Tensor X,
const at::Tensor C) {
auto gradA = (GE.unsqueeze(1) * (X.unsqueeze(2).expand({X.size(0), X.size(1),
C.size(0), C.size(1)}) - C.unsqueeze(0).unsqueeze(0))).sum(3);
auto gradX = at::bmm(A, GE);
auto gradC = (-GE * A.sum(1).unsqueeze(2)).sum(0);
return {gradA, gradX, gradC};
}
at::Tensor ScaledL2_Forward_CPU(
const at::Tensor X,
const at::Tensor C,
const at::Tensor S) {
auto SL = S.view({1, 1, C.size(0)}) * (X.unsqueeze(2).expand({X.size(0), X.size(1),
C.size(0), C.size(1)}) - C.unsqueeze(0).unsqueeze(0)).pow(2).sum(3);
return SL;
}
std::vector<at::Tensor> ScaledL2_Backward_CPU(
const at::Tensor GSL,
const at::Tensor X,
const at::Tensor C,
const at::Tensor S,
const at::Tensor SL) {
auto tmp = (2 * GSL * S.view({1, 1, C.size(0)})).unsqueeze(3) *
(X.unsqueeze(2).expand({X.size(0), X.size(1), C.size(0), C.size(1)}) -
C.unsqueeze(0).unsqueeze(0));
auto GX = tmp.sum(2);
auto GC = tmp.sum(0).sum(0);
auto GS = (GSL * (SL / S.view({1, 1, C.size(0)}))).sum(0).sum(0);
return {GX, GC, GS};
}
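// Illustrative sketch (not part of the commit): a standalone shape check for the
// two forward paths above, assuming B x N x K assignments, B x N x D descriptors,
// K x D codewords and K scaling factors. It only needs ATen plus the functions
// defined above; the dimension values are arbitrary.
#include <ATen/ATen.h>
#include <iostream>

at::Tensor Aggregate_Forward_CPU(const at::Tensor A, const at::Tensor X, const at::Tensor C);
at::Tensor ScaledL2_Forward_CPU(const at::Tensor X, const at::Tensor C, const at::Tensor S);

int main() {
  const int64_t B = 2, N = 8, K = 4, D = 16;
  auto A = at::randn({B, N, K});
  auto X = at::randn({B, N, D});
  auto C = at::randn({K, D});
  auto S = at::randn({K});
  std::cout << Aggregate_Forward_CPU(A, X, C).sizes() << std::endl; // expect [2, 4, 16]
  std::cout << ScaledL2_Forward_CPU(X, C, S).sizes() << std::endl;  // expect [2, 8, 4]
  return 0;
}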
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#ifdef _OPENMP
#include <omp.h>
#endif
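// Boxes are assumed to be stored as (x, y, w, h) quadruples: rawInput[idx*4] and
// rawInput[idx*4+1] are the top-left corner, rawInput[idx*4+2] and rawInput[idx*4+3]
// are the width and height, which is why the right/bottom edges below are x + w and y + h.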
template<typename scalar>
inline scalar IoU(scalar* rawInput, int idx_x, int idx_y) {
scalar lr = std::fmin(rawInput[idx_x*4] + rawInput[idx_x*4+2],
rawInput[idx_y*4] + rawInput[idx_y*4+2]);
scalar rl = std::fmax(rawInput[idx_x*4], rawInput[idx_y*4]);
scalar tb = std::fmin(rawInput[idx_x*4+1] + rawInput[idx_x*4+3],
rawInput[idx_y*4+1] + rawInput[idx_y*4+3]);
scalar bt = std::fmax(rawInput[idx_x*4+1], rawInput[idx_y*4+1]);
scalar inter = std::fmax(0, lr-rl)*std::fmax(0, tb-bt);
scalar uni = (rawInput[idx_x*4+2]*rawInput[idx_x*4+3]
+ rawInput[idx_y*4+2]*rawInput[idx_y*4+3] - inter);
return inter/uni;
}
std::vector<at::Tensor> Non_Max_Suppression_CPU(
const at::Tensor& input,
const at::Tensor& scores,
double thresh) {
AT_ASSERT(input.ndimension() == 3);
AT_ASSERT(scores.ndimension() == 2);
AT_ASSERT(input.size(0) == scores.size(0));
AT_ASSERT(input.size(1) == scores.size(1));
AT_ASSERT(input.size(2) == 4);
AT_ASSERT(input.is_contiguous());
AT_ASSERT(scores.is_contiguous());
AT_ASSERT(input.type().scalarType() == at::kFloat || input.type().scalarType() == at::kDouble);
AT_ASSERT(scores.type().scalarType() == at::kFloat || scores.type().scalarType() == at::kDouble);
at::Tensor sorted_inds = std::get<1>(scores.sort(-1, true));
//at::Tensor rawIdx = std::get<1>(scores.sort(-1, true));
auto num_boxes = input.size(1);
auto batch_size = input.size(0);
auto mask = input.type().toScalarType(at::kByte).tensor({batch_size, num_boxes});
mask.fill_(1);
auto *rawMask = mask.data<unsigned char>();
auto *rawIdx = sorted_inds.data<int64_t>();
if (input.type().scalarType() == at::kFloat)
{
auto *rawInput = input.data<float>();
for(int batch=0; batch<batch_size; ++batch)
{
int pos=batch*num_boxes;
while(pos < (1+batch)*num_boxes-1)
{
#pragma omp parallel for
for(int i=pos+1; i<num_boxes*(1+batch); ++i)
{
int idx_x = rawIdx[pos]+num_boxes*batch;
int idx_y = rawIdx[i]+num_boxes*batch;
if (IoU(rawInput, idx_x, idx_y) > thresh)
rawMask[i] = 0;
}
++pos;
while(pos < (1+batch)*num_boxes-1 && (rawMask[pos] == 0))
++pos;
}
}
}
else
{
auto *rawInput = input.data<double>();
for(int batch=0; batch<batch_size; ++batch)
{
int pos=batch*num_boxes;
while(pos < (1+batch)*num_boxes-1)
{
#pragma omp parallel for
for(int i=pos+1; i<num_boxes*(1+batch); ++i)
{
int idx_x = rawIdx[pos]+num_boxes*batch;
int idx_y = rawIdx[i]+num_boxes*batch;
if (IoU(rawInput, idx_x, idx_y) > thresh)
rawMask[i] = 0;
}
++pos;
while(pos < (1+batch)*num_boxes-1 && (rawMask[pos] == 0))
++pos;
}
}
}
//see ./cuda/NonMaxSuppression.cu for comment about return value.
return {mask, sorted_inds};
}
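// Illustrative sketch (not part of the commit) of consuming the {mask, sorted_inds}
// pair returned above for one batch element b: the mask lives in score-sorted order,
// so surviving boxes are gathered through sorted_inds at the positions still set to 1.
#include <ATen/ATen.h>

static at::Tensor keep_surviving_boxes(const at::Tensor& boxes,        // B x N x 4
                                       const at::Tensor& mask,         // B x N, uint8
                                       const at::Tensor& sorted_inds,  // B x N, int64
                                       int64_t b) {
  auto surviving_positions = mask[b].nonzero().squeeze(1);          // positions in sorted order
  auto keep = sorted_inds[b].index_select(0, surviving_positions);  // original box indices
  return boxes[b].index_select(0, keep);
}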
#include "operator.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("roi_align_forward", &ROIAlign_Forward_CPU, "ROI Align forward (CPU)");
m.def("roi_align_backward", &ROIAlign_Backward_CPU, "ROI Align backward (CPU)");
m.def("aggregate_forward", &Aggregate_Forward_CPU, "Aggregate forward (CPU)");
m.def("aggregate_backward", &Aggregate_Backward_CPU, "Aggregate backward (CPU)");
m.def("scaled_l2_forward", &ScaledL2_Forward_CPU, "ScaledL2 forward (CPU)");
m.def("scaled_l2_backward", &ScaledL2_Backward_CPU, "ScaledL2 backward (CPU)");
m.def("batchnorm_forward", &BatchNorm_Forward_CPU, "BatchNorm forward (CPU)");
m.def("batchnorm_backward", &BatchNorm_Backward_CPU, "BatchNorm backward (CPU)");
m.def("sumsquare_forward", &Sum_Square_Forward_CPU, "SumSqu forward (CPU)");
m.def("sumsquare_backward", &Sum_Square_Backward_CPU, "SumSqu backward (CPU)");
m.def("non_max_suppression", &Non_Max_Suppression_CPU, "NMS (CPU)");
}
#include <torch/torch.h>
#include <vector>
at::Tensor ROIAlign_Forward_CPU(
const at::Tensor& input,
const at::Tensor& bottom_rois,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio);
at::Tensor ROIAlign_Backward_CPU(
const at::Tensor& bottom_rois,
const at::Tensor& grad_output,
int64_t b_size,
int64_t channels,
int64_t height,
int64_t width,
int64_t pooled_height,
int64_t pooled_width,
double spatial_scale,
int64_t sampling_ratio);
at::Tensor Aggregate_Forward_CPU(
const at::Tensor A,
const at::Tensor X,
const at::Tensor C);
std::vector<at::Tensor> Aggregate_Backward_CPU(
const at::Tensor GE,
const at::Tensor A,
const at::Tensor X,
const at::Tensor C);
at::Tensor ScaledL2_Forward_CPU(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_);
std::vector<at::Tensor> ScaledL2_Backward_CPU(
const at::Tensor GSL_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor S_,
const at::Tensor SL_);
at::Tensor BatchNorm_Forward_CPU(
const at::Tensor input_,
const at::Tensor mean_,
const at::Tensor std_,
const at::Tensor gamma_,
const at::Tensor beta_);
std::vector<at::Tensor> BatchNorm_Backward_CPU(
const at::Tensor gradoutput_,
const at::Tensor input_,
const at::Tensor mean_,
const at::Tensor std_,
const at::Tensor gamma_,
const at::Tensor beta_,
bool train);
std::vector<at::Tensor> Sum_Square_Forward_CPU(
const at::Tensor input_);
at::Tensor Sum_Square_Backward_CPU(
const at::Tensor input_,
const at::Tensor gradSum_,
const at::Tensor gradSquare_);
std::vector<at::Tensor> Non_Max_Suppression_CPU(
const at::Tensor& input,
const at::Tensor& scores,
double thresh);
@@ -377,7 +377,7 @@ void ROIAlignBackwardCompute(
} // ROIAlignBackward
-at::Tensor ROIAlignForwardCPU(
+at::Tensor ROIAlign_Forward_CPU(
const at::Tensor& input,
const at::Tensor& bottom_rois,
int64_t pooled_height,
@@ -409,7 +409,7 @@ at::Tensor ROIAlignForwardCPU(
AT_ASSERT(input.is_contiguous());
AT_ASSERT(bottom_rois.is_contiguous());
-AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlignForwardCPU", ([&] {
+AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_Forward_CPU", ([&] {
ROIAlignForwardCompute<scalar_t>(
output.numel(),
input.data<scalar_t>(),
@@ -429,7 +429,7 @@ at::Tensor ROIAlignForwardCPU(
}
-at::Tensor ROIAlignBackwardCPU(
+at::Tensor ROIAlign_Backward_CPU(
const at::Tensor& bottom_rois,
const at::Tensor& grad_output, // gradient of the output of the layer
int64_t b_size,
@@ -455,7 +455,7 @@ at::Tensor ROIAlignBackwardCPU(
AT_ASSERT(bottom_rois.is_contiguous());
-AT_DISPATCH_FLOATING_TYPES(bottom_rois.type(), "ROIAlignBackwardCPU", ([&] {
+AT_DISPATCH_FLOATING_TYPES(bottom_rois.type(), "ROIAlign_Backward_CPU", ([&] {
ROIAlignBackwardCompute<scalar_t>(
grad_output.numel(),
grad_output.data<scalar_t>(),
...
@@ -5,8 +5,11 @@ setup(
name='enclib_cpu',
ext_modules=[
CppExtension('enclib_cpu', [
-'roi_align.cpp',
+'operator.cpp',
'roi_align_cpu.cpp',
+'encoding_cpu.cpp',
+'syncbn_cpu.cpp',
+'nms_cpu.cpp',
]),
],
cmdclass={
...
#include <ATen/ATen.h>
#include <vector>
at::Tensor broadcast_to(at::Tensor v, at::Tensor x) {
if (x.ndimension() == 2) {
return v;
} else {
std::vector<int64_t> broadcast_size = {1, -1};
for (int64_t i = 2; i < x.ndimension(); ++i)
broadcast_size.push_back(1);
return v.view(broadcast_size);
}
}
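// For example, with a 4-D NCHW input a per-channel vector of length C is viewed
// as {1, C, 1, 1}, so the elementwise expressions below broadcast it over the
// batch and spatial dimensions; 2-D (N x C) inputs already broadcast correctly.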
at::Tensor BatchNorm_Forward_CPU(
const at::Tensor input,
const at::Tensor mean,
const at::Tensor std,
const at::Tensor gamma,
const at::Tensor beta) {
auto output = (input - broadcast_to(mean, input)) / broadcast_to(std, input);
output = output * broadcast_to(gamma, input) + broadcast_to(beta, input);
return output;
}
// Not implementing CPU backward for now
std::vector<at::Tensor> BatchNorm_Backward_CPU(
const at::Tensor gradoutput,
const at::Tensor input,
const at::Tensor mean,
const at::Tensor std,
const at::Tensor gamma,
const at::Tensor beta,
bool train) {
/* outputs*/
at::Tensor gradinput = at::zeros_like(input);
at::Tensor gradgamma = at::zeros_like(gamma);
at::Tensor gradbeta = at::zeros_like(beta);
at::Tensor gradMean = at::zeros_like(mean);
at::Tensor gradStd = at::zeros_like(std);
return {gradinput, gradMean, gradStd, gradgamma, gradbeta};
}
std::vector<at::Tensor> Sum_Square_Forward_CPU(
const at::Tensor input) {
/* outputs */
at::Tensor sum = input.type().tensor({input.size(1)}).zero_();
at::Tensor square = input.type().tensor({input.size(1)}).zero_();
return {sum, square};
}
at::Tensor Sum_Square_Backward_CPU(
const at::Tensor input,
const at::Tensor gradSum,
const at::Tensor gradSquare) {
/* outputs */
at::Tensor gradInput = at::zeros_like(input);
return gradInput;
}
@@ -77,3 +77,148 @@ static __device__ __forceinline__ Float2<DType, Acctype> warpSum(Float2<DType, A
return value;
}
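// The four reduce* helpers below share one pattern: each thread strides over a
// single tensor dimension accumulating a partial sum, warpSum() reduces within a
// warp, the per-warp results are staged in shared memory, a single warp reduces
// those again, and the final value broadcast through shared[0] is returned to
// every thread in the block.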
template<typename T, typename Op>
__device__ T reduceD(
Op op, int b, int i, int k, int D) {
T sum = 0;
for (int x = threadIdx.x; x < D; x += blockDim.x) {
sum += op(b,i,k,x);
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
template<typename T, typename Op>
__device__ T reduceN(
Op op, int b, int k, int d, int N) {
T sum = 0;
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(b,x,k,d);
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
template<typename T, typename Op>
__device__ T reduceK(
Op op, int b, int i, int d, int K) {
T sum = 0;
for (int x = threadIdx.x; x < K; x += blockDim.x) {
sum += op(b,i,x,d);
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
template<typename T, typename Op>
__device__ T reduceBN(
Op op,
int k, int d, int B, int N) {
T sum = 0;
for (int batch = 0; batch < B; ++batch) {
for (int x = threadIdx.x; x < N; x += blockDim.x) {
sum += op(batch,x,k,d);
}
}
// sum over NumThreads within a warp
sum = warpSum(sum);
// 'transpose', and reduce within warp again
__shared__ T shared[32];
__syncthreads();
if (threadIdx.x % WARP_SIZE == 0) {
if (threadIdx.x / WARP_SIZE < 32) {
shared[threadIdx.x / WARP_SIZE] = sum;
}
}
if (threadIdx.x >= blockDim.x / WARP_SIZE && threadIdx.x < WARP_SIZE) {
// zero out the other entries in shared
shared[threadIdx.x] = (T) 0;
}
__syncthreads();
if (threadIdx.x / WARP_SIZE == 0) {
sum = warpSum(shared[threadIdx.x]);
if (threadIdx.x == 0) {
shared[0] = sum;
}
}
__syncthreads();
// Everyone picks it up, should be broadcast into the whole gradInput
return shared[0];
}
-#include <ATen/ATen.h>
#include <vector>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
#include "common.h"
#include "device_tensor.h"
@@ -64,153 +65,6 @@ struct SL2GradXOp {
DeviceTensor<DType, 1> S;
};
template<typename DType, typename Acctype>
__global__ void Aggregate_Forward_kernel (
DeviceTensor<DType, 3> E,
@@ -225,7 +79,7 @@ __global__ void Aggregate_Forward_kernel (
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
-AggOp<DType, Acctype> g(A,X,C);
+AggOp<DType, Acctype> g(A, X, C);
E[b][k][d] = reduceN<Acctype>(g, b, k, d, N);
}
@@ -244,7 +98,7 @@ __global__ void Aggregate_Backward_kernel (
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
-AggBackOp<DType, Acctype> g(GE,X,C);
+AggBackOp<DType, Acctype> g(GE, X, C);
GA[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}
@@ -312,7 +166,7 @@ at::Tensor Aggregate_Forward_CUDA(
const at::Tensor C_) {
/* Device tensors */
auto E_ = A_.type().tensor({A_.size(0), C_.size(0), C_.size(1)}).zero_();
-cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
+cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(1), C_.size(0), X_.size(0));
dim3 threads(getNumThreads(X_.size(1)));
@@ -338,7 +192,7 @@ std::vector<at::Tensor> Aggregate_Backward_CUDA(
auto gradA_ = at::zeros_like(A_);
auto gradX_ = at::bmm(A_, GE_);
auto gradC_ = (-GE_ * A_.sum(1).unsqueeze(2)).sum(0);
-cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
+cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
@@ -361,7 +215,7 @@ at::Tensor ScaledL2_Forward_CUDA(
const at::Tensor C_,
const at::Tensor S_) {
auto SL_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
-cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
+cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
@@ -388,13 +242,11 @@ std::vector<at::Tensor> ScaledL2_Backward_CUDA(
auto GX_ = at::zeros_like(X_);
auto GC_ = at::zeros_like(C_);
/* kernel function */
-cudaStream_t stream = at::globalContext().getCurrentCUDAStream();
+cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
dim3 threads1(getNumThreads(C_.size(0)));
dim3 blocks2(C_.size(1), C_.size(0));
dim3 threads2(getNumThreads(X_.size(1)));
-//std::vector<int> size{ 1, 1, K};
-//auto GS_ = GSL_ * (SL_ / at::_unsafe_view(S_, size))
auto GS_ = (GSL_ * (SL_ / S_.view({1, 1, C_.size(0)}))).sum(0).sum(0);
AT_DISPATCH_FLOATING_TYPES(X_.type(), "ScaledL2_Backward_CUDA", ([&] {
/* Device tensors */
...
#include <vector>
#include <ATen/ATen.h>
#include <ATen/Functions.h>
#include <ATen/cuda/CUDAContext.h>
#include "common.h"
#include "device_tensor.h"
namespace {
template<typename DType, typename Acctype>
struct KD2Op {
__device__ KD2Op(DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d)
{
DType r = (X[b][i][d] - C[k][d]) / STD[k][d];
return ScalarConvert<DType, Acctype>::to(r * r);
}
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void Encoding_Dist_Forward_kernel (
DeviceTensor<DType, 3> KD,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
k = blockIdx.x;
i = blockIdx.y;
D = X.getSize(2);
/* main operation */
KD2Op<DType, Acctype> g(X, C, STD);
KD[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}
template<typename DType, typename Acctype>
struct EncGradXOp {
__device__ EncGradXOp(
DeviceTensor<DType, 3> gkd,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : GKD(gkd), X(x), C(c), STD(std) {}
// DeviceTensor<DType, 1> s, S(s)
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(
2 * GKD[b][i][k] * (X[b][i][d] - C[k][d]) /
(STD[k][d] * STD[k][d]));
}
DeviceTensor<DType, 3> GKD;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
// DeviceTensor<DType, 1> S;
};
template<typename DType, typename Acctype>
__global__ void Encoding_GradX_kernel (
DeviceTensor<DType, 3> GKD,
DeviceTensor<DType, 3> GX,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
// DeviceTensor<DType, 1> S
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
d = blockIdx.x;
K = C.getSize(0);
/* main operation */
EncGradXOp<DType, Acctype> g(GKD, X, C, STD);
GX[b][i][d] = reduceK<Acctype>(g, b, i, d, K);
}
template<typename DType, typename Acctype>
struct EncGradSTDOp {
__device__ EncGradSTDOp(
DeviceTensor<DType, 3> gkd,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : GKD(gkd), X(x), C(c), STD(std) {}
// DeviceTensor<DType, 1> s, S(s)
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(
-2 * GKD[b][i][k] * (X[b][i][d] - C[k][d]) *
(X[b][i][d] - C[k][d]) / (STD[k][d] * STD[k][d] * STD[k][d]));
}
DeviceTensor<DType, 3> GKD;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
// DeviceTensor<DType, 1> S;
};
template<typename DType, typename Acctype>
__global__ void Encoding_GradCSTD_kernel (
DeviceTensor<DType, 3> GKD,
DeviceTensor<DType, 2> GC,
DeviceTensor<DType, 2> GSTD,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int k, d, B, N;
/* Get the index and channels */
d = blockIdx.x;
k = blockIdx.y;
B = X.getSize(0);
N = X.getSize(1);
/* main operation */
EncGradXOp<DType, Acctype> g1(GKD, X, C, STD);
EncGradSTDOp<DType, Acctype> g2(GKD, X, C, STD);
GC[k][d] = -reduceBN<Acctype>(g1, k, d, B, N);
GSTD[k][d] += reduceBN<Acctype>(g2, k, d, B, N);
}
template<typename DType, typename Acctype>
struct EncGradSTDXOp {
__device__ EncGradSTDXOp(
DeviceTensor<DType, 2> gstd,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : GSTD(gstd), X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(
GSTD[k][d] * (X[b][i][d] - C[k][d]) / STD[k][d]);
}
DeviceTensor<DType, 2> GSTD;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void Encoding_GradSTDX_kernel (
DeviceTensor<DType, 2> GSTD,
DeviceTensor<DType, 3> GX,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD,
int N) {
/* declarations of the variables */
int b, d, i, K;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
d = blockIdx.x;
K = C.getSize(0);
/* main operation */
EncGradSTDXOp<DType, Acctype> g(GSTD, X, C, STD);
GX[b][i][d] += reduceK<Acctype>(g, b, i, d, K) / N;
}
template<typename DType, typename Acctype>
struct AggOpV2 {
__device__ AggOpV2(DeviceTensor<DType, 3> a,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : A(a), X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(A[b][i][k] * (X[b][i][d] - C[k][d]) /
STD[k][d]);
}
DeviceTensor<DType, 3> A;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void AggregateV2_Forward_kernel (
DeviceTensor<DType, 3> E,
DeviceTensor<DType, 3> A,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int b, k, d, N;
/* Get the index and channels */
b = blockIdx.z;
d = blockIdx.x;
k = blockIdx.y;
N = X.getSize(1);
/* main operation */
AggOpV2<DType, Acctype> g(A, X, C, STD);
E[b][k][d] = reduceN<Acctype>(g, b, k, d, N);
}
template<typename DType, typename Acctype>
struct AggV2BackOp {
__device__ AggV2BackOp(DeviceTensor<DType, 3> g,
DeviceTensor<DType, 3> x,
DeviceTensor<DType, 2> c,
DeviceTensor<DType, 2> std) : G(g), X(x), C(c), STD(std) {}
__device__ __forceinline__ Acctype operator()(int b, int i, int k, int d) {
return ScalarConvert<DType, Acctype>::to(G[b][k][d] * (X[b][i][d] - C[k][d]) /
STD[k][d]);
}
DeviceTensor<DType, 3> G;
DeviceTensor<DType, 3> X;
DeviceTensor<DType, 2> C;
DeviceTensor<DType, 2> STD;
};
template<typename DType, typename Acctype>
__global__ void AggregateV2_Backward_kernel (
DeviceTensor<DType, 3> GA,
DeviceTensor<DType, 3> GE,
DeviceTensor<DType, 3> A,
DeviceTensor<DType, 3> X,
DeviceTensor<DType, 2> C,
DeviceTensor<DType, 2> STD) {
/* declarations of the variables */
int b, k, i, D;
/* Get the index and channels */
b = blockIdx.z;
i = blockIdx.y;
k = blockIdx.x;
D = GE.getSize(2);
/* main operation */
AggV2BackOp<DType, Acctype> g(GE, X, C, STD);
GA[b][i][k] = reduceD<Acctype>(g, b, i, k, D);
}
} // namespace
at::Tensor Encoding_Dist_Inference_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
// const at::Tensor S_,
// X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
auto KD_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
// E(x), E(x^2)
int N = X_.size(0) * X_.size(1);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
// calculate the kernel distance
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Inference_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> KD = devicetensor<scalar_t, 3>(KD_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
/* kernel function */
Encoding_Dist_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (KD, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return KD_;
}
std::vector<at::Tensor> Encoding_Dist_Inference_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
auto GX_ = at::zeros_like(X_);
auto GC_ = at::zeros_like(C_);
auto GSTD_ = at::zeros_like(STD_);
/* kernel function */
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
dim3 threads1(getNumThreads(C_.size(0)));
dim3 blocks2(C_.size(1), C_.size(0));
dim3 threads2(getNumThreads(X_.size(1)));
int N = X_.size(0) * X_.size(1);
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GKD = devicetensor<scalar_t, 3>(GKD_);
DeviceTensor<scalar_t, 2> GSTD = devicetensor<scalar_t, 2>(GSTD_);
DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
DeviceTensor<scalar_t, 2> GC = devicetensor<scalar_t, 2>(GC_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
Encoding_GradX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GKD, GX, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
Encoding_GradCSTD_kernel<scalar_t, scalar_t>
<<<blocks2, threads2, 0, stream>>> (GKD, GC, GSTD, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}));
return {GX_, GC_, GSTD_};
}
std::vector<at::Tensor> Encoding_Dist_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
double eps) {
// const at::Tensor S_,
// X \in R^{B, N, D}, C \in R^{K, D}, S \in R^K
auto KD_ = X_.type().tensor({X_.size(0), X_.size(1), C_.size(0)}).zero_();
// E(x), E(x^2)
int N = X_.size(0) * X_.size(1);
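// SVar_[k][d] accumulates sum over all B*N samples of (X[b][i][d] - C[k][d])^2,
// written in the expanded form sum(x^2) - 2*c*sum(x) + N*c^2; STD_ below is then
// the per-(k, d) standard deviation sqrt(SVar_/N + eps) fed to the distance kernel.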
auto SVar_ = (X_.pow(2).sum(0).sum(0).view({1, X_.size(2)}) -
2 * C_ * X_.sum(0).sum(0).view({1, X_.size(2)})).expand_as(C_) +
C_.pow(2) * N;
auto STD_ = at::sqrt(SVar_ / N + eps);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
// calculate the kernel distance
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Forward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> KD = devicetensor<scalar_t, 3>(KD_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
/* kernel function */
Encoding_Dist_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (KD, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {KD_, STD_, SVar_ / (N - 1)};
}
std::vector<at::Tensor> Encoding_Dist_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor GSTD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
auto GX_ = at::zeros_like(X_);
auto GC_ = at::zeros_like(C_);
auto GSTD2_ = GSTD_.clone();
/* kernel function */
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 blocks1(X_.size(2), X_.size(1), X_.size(0));
dim3 threads1(getNumThreads(C_.size(0)));
dim3 blocks2(C_.size(1), C_.size(0));
dim3 threads2(getNumThreads(X_.size(1)));
int N = X_.size(0) * X_.size(1);
AT_DISPATCH_FLOATING_TYPES(X_.type(), "Encoding_Dist_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GKD = devicetensor<scalar_t, 3>(GKD_);
DeviceTensor<scalar_t, 2> GSTD = devicetensor<scalar_t, 2>(GSTD2_);
DeviceTensor<scalar_t, 3> GX = devicetensor<scalar_t, 3>(GX_);
DeviceTensor<scalar_t, 2> GC = devicetensor<scalar_t, 2>(GC_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
Encoding_GradX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GKD, GX, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
Encoding_GradCSTD_kernel<scalar_t, scalar_t>
<<<blocks2, threads2, 0, stream>>> (GKD, GC, GSTD, X, C, STD);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
Encoding_GradSTDX_kernel<scalar_t, scalar_t>
<<<blocks1, threads1, 0, stream>>> (GSTD, GX, X, C, STD, N);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}));
// d_sigma/d_c
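// With std^2 = mean_{b,i}((x - c)^2) + eps, d(std)/d(c) = (c - mean(x)) / std,
// so the chain-rule term below subtracts GSTD2_ * (mean(x) - c) / std from GC_.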
GC_ = GC_ - GSTD2_ * (X_.mean(0).mean(0) - C_) / STD_;
return {GX_, GC_};
}
at::Tensor AggregateV2_Forward_CUDA(
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
/* Device tensors */
auto E_ = A_.type().tensor({A_.size(0), C_.size(0), C_.size(1)}).zero_();
// auto IS_ = 1.0f / (S_ + eps).sqrt();
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(1), C_.size(0), X_.size(0));
dim3 threads(getNumThreads(X_.size(1)));
AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Forward_CUDA", ([&] {
DeviceTensor<scalar_t, 3> E = devicetensor<scalar_t, 3>(E_);
DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
/* kernel function */
AggregateV2_Forward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>>(E, A, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return E_;
}
std::vector<at::Tensor> AggregateV2_Backward_CUDA(
const at::Tensor GE_,
const at::Tensor E_,
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_) {
auto gradA_ = at::zeros_like(A_);
auto gradX_ = at::bmm(A_ , (GE_ / STD_.unsqueeze(0)));
auto gradC_ = -(A_.sum(1).unsqueeze(2) * GE_ / STD_.unsqueeze(0)).sum(0);
auto gradSTD_ = -(GE_ * E_).sum(0) / STD_;
// auto gradS_ = -0.5 * (GE_ * E_).sum(2).sum(0) / S_;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// B, K, D
dim3 blocks(C_.size(0), X_.size(1), X_.size(0));
dim3 threads(getNumThreads(C_.size(1)));
AT_DISPATCH_FLOATING_TYPES(A_.type(), "Aggregate_Backward_CUDA", ([&] {
/* Device tensors */
DeviceTensor<scalar_t, 3> GA = devicetensor<scalar_t, 3>(gradA_);
DeviceTensor<scalar_t, 3> GE = devicetensor<scalar_t, 3>(GE_);
DeviceTensor<scalar_t, 3> A = devicetensor<scalar_t, 3>(A_);
DeviceTensor<scalar_t, 3> X = devicetensor<scalar_t, 3>(X_);
DeviceTensor<scalar_t, 2> C = devicetensor<scalar_t, 2>(C_);
DeviceTensor<scalar_t, 2> STD = devicetensor<scalar_t, 2>(STD_);
AggregateV2_Backward_kernel<scalar_t, scalar_t>
<<<blocks, threads, 0, stream>>> (GA, GE, A, X, C, STD);
}));
AT_ASSERT(cudaGetLastError() == cudaSuccess);
return {gradA_, gradX_, gradC_, gradSTD_};
}
#include <ATen/ATen.h>
#include "ATen/NativeFunctions.h"
#include <ATen/cuda/CUDAContext.h>
template <typename scalar>
__device__ __forceinline__ scalar fmin(scalar a, scalar b) {
return a > b ? b : a;
}
template <typename scalar>
__device__ __forceinline__ scalar fmax(scalar a, scalar b) {
return a > b ? a : b;
}
template <typename scalar>
__device__ __forceinline__ scalar IoU(const scalar* box_x, const scalar* box_y) {
// Calculate IoU between the boxes.
scalar rightmost_l = fmax(box_x[0], box_y[0]);
scalar leftmost_r = fmin(box_x[0] + box_x[2], box_y[0] + box_y[2]);
scalar delta_x = fmax((scalar)0., leftmost_r - rightmost_l);
scalar bottommost_tp = fmax(box_x[1], box_y[1]);
scalar topmost_b = fmin(box_x[1] + box_x[3], box_y[1] + box_y[3]);
scalar delta_y = fmax((scalar)0., topmost_b - bottommost_tp);
scalar uni = box_x[2] * box_x[3] + box_y[2] * box_y[3];
return delta_x * delta_y / (uni - delta_x * delta_y);
}
template <typename scalar>
__global__ void nms_kernel(unsigned char* mask,
const scalar* boxes,
const int64_t* inds,
const int64_t num_boxes,
double thresh) {
//A pretty straightforward implementation, analogous to the standard serial
//version but with the IoUs computed and mask updated in parallel. We access
//the box data through an array of sorted indices rather than physically
//sorting it: unless one has an inordinate number of boxes (O(10^5), whereas
//for example in the faster rcnn paper they feed 6000 per batch) the
//data will fit in L2 so sorting it won't actually reduce the number of
//messy reads from global.
int col = 0;
while(col < num_boxes-1)
{
for(int i = threadIdx.x; i < num_boxes-1; i+=blockDim.x)
if(i >= col)
{
scalar iou = IoU(&boxes[4*inds[i+1+num_boxes*blockIdx.x] + 4*num_boxes*blockIdx.x],
&boxes[4*inds[col+num_boxes*blockIdx.x] + 4*num_boxes*blockIdx.x]);
mask[i+1+blockIdx.x*num_boxes] *= (iou>thresh) ? 0 : 1;
}
__syncthreads();
++col;
while((col < num_boxes - 1) && (mask[col+blockIdx.x*num_boxes]==0))
++col;
}
}
std::vector<at::Tensor> Non_Max_Suppression_CUDA(
const at::Tensor& input,
const at::Tensor& scores,
double thresh) {
AT_ASSERT(input.ndimension() == 3);
AT_ASSERT(scores.ndimension() == 2);
AT_ASSERT(input.size(0) == scores.size(0));
AT_ASSERT(input.size(1) == scores.size(1));
AT_ASSERT(input.size(2) == 4);
AT_ASSERT(input.is_contiguous());
AT_ASSERT(scores.is_contiguous());
AT_ASSERT(input.type().scalarType() == at::kFloat || input.type().scalarType() == at::kDouble);
AT_ASSERT(scores.type().scalarType() == at::kFloat || scores.type().scalarType() == at::kDouble);
auto num_boxes = input.size(1);
auto batch_size = input.size(0);
auto mask = input.type().toScalarType(at::kByte).tensor({batch_size, num_boxes});
mask.fill_(1);
//need the indices of the boxes sorted by score.
at::Tensor sorted_inds = std::get<1>(scores.sort(-1, true));
dim3 mask_block(512); //would be nice to have 1024 here for gpus that support it,
//but not sure how to do this cleanly without calling
//cudaGetDeviceProperties in the function body...
dim3 mask_grid(batch_size);
if(input.type().scalarType() == at::kFloat)
{
nms_kernel<<<mask_grid, mask_block, 0, at::cuda::getCurrentCUDAStream()>>>(
mask.data<unsigned char>(),
input.data<float>(),
sorted_inds.data<int64_t>(),
num_boxes,
thresh);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}
else
{
nms_kernel<<<mask_grid, mask_block, 0, at::cuda::getCurrentCUDAStream()>>>(
mask.data<unsigned char>(),
input.data<double>(),
sorted_inds.data<int64_t>(),
num_boxes,
thresh);
AT_ASSERT(cudaGetLastError() == cudaSuccess);
}
//It's not entirely clear what the best thing to return is here. The algorithm will
//produce a different number of boxes for each batch, so there is no obvious way of
//returning the surviving boxes/indices as a tensor. Returning a mask on the
//sorted boxes together with the sorted indices seems reasonable; that way, the user
//can easily take the N highest-scoring surviving boxes to form a tensor if they wish.
return {mask, sorted_inds};
}
#include "operator.h" #include "operator.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("roi_align_forward", &ROIAlignForwardCUDA, "ROI Align forward (CUDA)"); m.def("roi_align_forward", &ROIAlign_Forward_CUDA, "ROI Align forward (CUDA)");
m.def("roi_align_backward", &ROIAlignBackwardCUDA, "ROI Align backward (CUDA)"); m.def("roi_align_backward", &ROIAlign_Backward_CUDA, "ROI Align backward (CUDA)");
m.def("non_max_suppression", &Non_Max_Suppression_CUDA, "NMS (CUDA)");
m.def("aggregate_forward", &Aggregate_Forward_CUDA, "Aggregate forward (CUDA)"); m.def("aggregate_forward", &Aggregate_Forward_CUDA, "Aggregate forward (CUDA)");
m.def("aggregate_backward", &Aggregate_Backward_CUDA, "Aggregate backward (CUDA)"); m.def("aggregate_backward", &Aggregate_Backward_CUDA, "Aggregate backward (CUDA)");
m.def("scaled_l2_forward", &ScaledL2_Forward_CUDA, "ScaledL2 forward (CUDA)"); m.def("scaled_l2_forward", &ScaledL2_Forward_CUDA, "ScaledL2 forward (CUDA)");
...@@ -11,4 +12,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ...@@ -11,4 +12,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("batchnorm_backward", &BatchNorm_Backward_CUDA, "BatchNorm backward (CUDA)"); m.def("batchnorm_backward", &BatchNorm_Backward_CUDA, "BatchNorm backward (CUDA)");
m.def("sumsquare_forward", &Sum_Square_Forward_CUDA, "SumSqu forward (CUDA)"); m.def("sumsquare_forward", &Sum_Square_Forward_CUDA, "SumSqu forward (CUDA)");
m.def("sumsquare_backward", &Sum_Square_Backward_CUDA, "SumSqu backward (CUDA)"); m.def("sumsquare_backward", &Sum_Square_Backward_CUDA, "SumSqu backward (CUDA)");
m.def("encoding_dist_forward", &Encoding_Dist_Forward_CUDA, "EncDist forward (CUDA)");
m.def("encoding_dist_backward", &Encoding_Dist_Backward_CUDA, "Assign backward (CUDA)");
m.def("encoding_dist_inference_forward", &Encoding_Dist_Inference_Forward_CUDA,
"EncDist Inference forward (CUDA)");
m.def("encoding_dist_inference_backward", &Encoding_Dist_Inference_Backward_CUDA,
"Assign Inference backward (CUDA)");
m.def("aggregatev2_forward", &AggregateV2_Forward_CUDA, "AggregateV2 forward (CUDA)");
m.def("aggregatev2_backward", &AggregateV2_Backward_CUDA, "AggregateV2 backward (CUDA)");
} }
#include <torch/torch.h> #include <torch/torch.h>
#include <vector> #include <vector>
at::Tensor ROIAlignForwardCUDA( at::Tensor ROIAlign_Forward_CUDA(
const at::Tensor input, const at::Tensor input,
const at::Tensor rois, const at::Tensor rois,
int64_t pooled_height, int64_t pooled_height,
...@@ -9,7 +9,7 @@ at::Tensor ROIAlignForwardCUDA( ...@@ -9,7 +9,7 @@ at::Tensor ROIAlignForwardCUDA(
double spatial_scale, double spatial_scale,
int64_t sample_ratio); int64_t sample_ratio);
at::Tensor ROIAlignBackwardCUDA( at::Tensor ROIAlign_Backward_CUDA(
const at::Tensor rois, const at::Tensor rois,
const at::Tensor grad_output, const at::Tensor grad_output,
int64_t b_size, int64_t b_size,
...@@ -21,6 +21,11 @@ at::Tensor ROIAlignBackwardCUDA( ...@@ -21,6 +21,11 @@ at::Tensor ROIAlignBackwardCUDA(
double spatial_scale, double spatial_scale,
int64_t sampling_ratio); int64_t sampling_ratio);
std::vector<at::Tensor> Non_Max_Suppression_CUDA(
const at::Tensor& input,
const at::Tensor& scores,
double thresh);
at::Tensor Aggregate_Forward_CUDA( at::Tensor Aggregate_Forward_CUDA(
const at::Tensor A_, const at::Tensor A_,
const at::Tensor X_, const at::Tensor X_,
...@@ -67,3 +72,42 @@ at::Tensor Sum_Square_Backward_CUDA( ...@@ -67,3 +72,42 @@ at::Tensor Sum_Square_Backward_CUDA(
const at::Tensor input_, const at::Tensor input_,
const at::Tensor gradSum_, const at::Tensor gradSum_,
const at::Tensor gradSquare_); const at::Tensor gradSquare_);
at::Tensor Encoding_Dist_Inference_Forward_CUDA(
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
std::vector<at::Tensor> Encoding_Dist_Inference_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
std::vector<at::Tensor> Encoding_Dist_Forward_CUDA(
const at::Tensor X,
const at::Tensor C,
double eps);
std::vector<at::Tensor> Encoding_Dist_Backward_CUDA(
const at::Tensor GKD_,
const at::Tensor GSTD_,
const at::Tensor KD_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
at::Tensor AggregateV2_Forward_CUDA(
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
std::vector<at::Tensor> AggregateV2_Backward_CUDA(
const at::Tensor GE_,
const at::Tensor E_,
const at::Tensor A_,
const at::Tensor X_,
const at::Tensor C_,
const at::Tensor STD_);
#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
@@ -346,7 +347,7 @@ __global__ void RoIAlignBackwardKernel(
} // namespace
-at::Tensor ROIAlignForwardCUDA(
+at::Tensor ROIAlign_Forward_CUDA(
const at::Tensor input,
const at::Tensor rois,
int64_t pooled_height,
@@ -370,12 +371,12 @@ at::Tensor ROIAlignForwardCUDA(
auto count = output.numel();
-AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlignForwardCUDA", ([&] {
+AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_Forward_CUDA", ([&] {
RoIAlignForwardKernel<scalar_t>
<<<ROI_GET_BLOCKS(count),
ROI_CUDA_NUM_THREADS,
0,
-at::globalContext().getCurrentCUDAStream()>>>(
+at::cuda::getCurrentCUDAStream()>>>(
count,
input.data<scalar_t>(),
static_cast<scalar_t>(spatial_scale),
@@ -392,7 +393,7 @@ at::Tensor ROIAlignForwardCUDA(
return output;
}
-at::Tensor ROIAlignBackwardCUDA(
+at::Tensor ROIAlign_Backward_CUDA(
const at::Tensor rois,
const at::Tensor grad_output,
int64_t b_size,
@@ -417,12 +418,12 @@ at::Tensor ROIAlignBackwardCUDA(
auto num_rois = rois.size(0);
auto count = grad_output.numel();
-AT_DISPATCH_FLOATING_TYPES(rois.type(), "ROIAlignBackwardCUDA", ([&] {
+AT_DISPATCH_FLOATING_TYPES(rois.type(), "ROIAlign_Backward_CUDA", ([&] {
RoIAlignBackwardKernel<scalar_t>
<<<ROI_GET_BLOCKS(count),
ROI_CUDA_NUM_THREADS,
0,
-at::globalContext().getCurrentCUDAStream()>>>(
+at::cuda::getCurrentCUDAStream()>>>(
count,
grad_output.data<scalar_t>(),
num_rois,
...