"tools/vscode:/vscode.git/clone" did not exist on "6bd8be71a2d7346fce048df9878eb10e37a3869b"
Commit 2c4ed608 authored by Benjamin Thomas Graham

Goodbye THNN. Hello ATen!

parent 6d4475db
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "BatchNormalization.h"
#define BN_F_MACRO(N) \
if (nPlanes % N == 0) { \
BatchNormalization_ForwardPass<T, N, 64>( \
input_features.data<T>(), output_features.data<T>(), nPlanes, \
input_stride, output_stride, nActive, saveMean.data<T>(), \
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(), \
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps, momentum, \
train, leakiness); \
}
template <typename T>
void cuda_BatchNormalization_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor saveMean,
/*cuda float*/ at::Tensor saveInvStd, /*cuda float*/ at::Tensor runningMean,
/*cuda float*/ at::Tensor runningVar,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias, T eps,
T momentum, bool train, T leakiness) {
output_features.resize_as_(input_features);
if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0);
BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
}
template <typename T>
void cuda_BatchNormalizationInTensor_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor saveMean,
/*cuda float*/ at::Tensor saveInvStd, /*cuda float*/ at::Tensor runningMean,
/*cuda float*/ at::Tensor runningVar,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias, T eps,
T momentum, bool train, T leakiness) {
if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0);
BN_F_MACRO(16)
else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
}
}
#undef BN_F_MACRO
#define BN_B_MACRO(N) \
if (nPlanes % N == 0) { \
BatchNormalization_BackwardPass<T, N, 64>( \
input_features.data<T>(), d_input_features.data<T>(), \
output_features.data<T>(), d_output_features.data<T>(), nPlanes, \
input_stride, output_stride, nActive, saveMean.data<T>(), \
saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(), \
OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), \
OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias), leakiness); \
}
template <typename T>
void cuda_BatchNormalization_backward(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
/*cuda float*/ at::Tensor runningMean,
/*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias,
/*cuda float*/ at::Tensor d_weight, /*cuda float*/ at::Tensor d_bias,
T leakiness) {
d_input_features.resize_as_(d_output_features);
if (input_features.ndimension() == 2) {
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
auto input_stride = input_features.stride(0);
auto output_stride = output_features.stride(0);
BN_B_MACRO(16)
else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
}
}
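// ---------------------------------------------------------------------------
// Editor's note (sketch, not part of the commit): the BN_F_MACRO / BN_B_MACRO
// chains above pick the largest tile width NTX that divides nPlanes, so the
// kernels never have to mask partial columns. The same dispatch pattern in
// isolation, with hypothetical names:
template <int NTX> void bn_launch_example(int /*nPlanes*/) {
  // would launch the <T, NTX, 64> kernel here
}
inline void bn_dispatch_example(int nPlanes) {
  if (nPlanes % 16 == 0)
    bn_launch_example<16>(nPlanes);
  else if (nPlanes % 12 == 0)
    bn_launch_example<12>(nPlanes);
  else if (nPlanes % 8 == 0)
    bn_launch_example<8>(nPlanes);
  else if (nPlanes % 4 == 0)
    bn_launch_example<4>(nPlanes);
  else
    bn_launch_example<1>(nPlanes);
}
// ---------------------------------------------------------------------------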
@@ -4,9 +4,9 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_BATCHNORMALIZATION_H
#define CUDA_BATCHNORMALIZATION_H
#include <cassert>
// input_stride and output_stride are normally the same as nPlanes; allow larger
@@ -14,22 +14,22 @@
// NTX ~ 16 - nPlanes must be a multiple of this
// NTY ~ 64 - at least 4
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_f_train(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum,
T leakiness) {
__shared__ T t[NTY][NTX];
__shared__ T t2[NTY][NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
t[threadIdx.y][threadIdx.x] = 0;
t2[threadIdx.y][threadIdx.x] = 0;
for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
row < nActive; row += NTY, c += input_stride * NTY) {
T i = input_features[c];
t[threadIdx.y][threadIdx.x] += i;
@@ -38,7 +38,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
__syncthreads();
T _saveMean = 0;
T _saveInvStd = 0;
for (Int row = 0; row < NTY; row++) {
_saveMean += t[row][threadIdx.x];
_saveInvStd += t2[row][threadIdx.x];
}
@@ -65,7 +65,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
T W = t[0][threadIdx.x];
T B = t[1][threadIdx.x];
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -75,16 +75,16 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_f_test(T *input_features, T *output_features, Int nPlanes,
Int input_stride, Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps, T momentum,
T leakiness) {
__shared__ T W[NTX];
__shared__ T B[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0) {
W[threadIdx.x] =
@@ -95,7 +95,7 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
__syncthreads();
float w = W[threadIdx.x], b = B[threadIdx.x];
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -106,40 +106,38 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
}
}
template <typename T, Int NTX, Int NTY>
void BatchNormalization_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias, T eps,
T momentum, bool train, T leakiness) {
if (train) {
BatchNormalization_f_train<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
eps, momentum, leakiness);
} else {
BatchNormalization_f_test<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
eps, momentum, leakiness);
}
}
template <typename T, Int NTX, Int NTY>
__global__ void
BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
T *d_output_features, Int nPlanes, Int input_stride,
Int output_stride, Int nActive, T *saveMean,
T *saveInvStd, T *runningMean, T *runningVar, T *weight,
T *bias, T *d_weight, T *d_bias, T leakiness) {
__shared__ T t[NTY][NTX];
__shared__ T t2[NTY][NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0) {
t[0][threadIdx.x] = saveMean[plane];
@@ -153,7 +151,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
__syncthreads();
t[threadIdx.y][threadIdx.x] = 0;
t2[threadIdx.y][threadIdx.x] = 0;
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -180,7 +178,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
T k = dotp * _saveInvStd * _saveInvStd / nActive;
for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
co = plane + threadIdx.y * output_stride;
row < nActive;
row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -192,17 +190,16 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
}
}
template <typename T, Int NTX, Int NTY>
void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int nActive,
T *saveMean, T *saveInvStd, T *runningMean,
T *runningVar, T *weight, T *bias,
T *d_weight, T *d_bias, T leakiness) {
BatchNormalization_b<
T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
input_features, d_input_features, output_features, d_output_features,
nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,
runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);
@@ -210,4 +207,4 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
#undef NTX
#undef NTY
#endif /* CUDA_BATCHNORMALIZATION_H */
@@ -4,32 +4,28 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "BatchwiseMultiplicativeDropout.h"
#define SPARSECONVNET_FOO(NTX, NTY) \
{ \
if (nPlanes % NTX == 0) { \
BatchwiseMultiplicativeDropout_fp< \
T, NTX, \
NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>( \
input_features.data<T>(), output_features.data<T>(), \
noise.data<T>(), nActive, nPlanes, nPlanes, nPlanes, alpha); \
return; \
} \
}
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
THCTensor *input_features, THCTensor *output_features, THCTensor *noise,
template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor noise,
float alpha) {
if (input_features != output_features)
THCTensor_(resizeAs)(state, output_features, input_features);
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
output_features.resize_as_(input_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
SPARSECONVNET_FOO(32, 32)
SPARSECONVNET_FOO(24, 32)
SPARSECONVNET_FOO(16, 64)
@@ -43,24 +39,24 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
#define SPARSECONVNET_FOO(NTX, NTY) \
{ \
if (nPlanes % NTX == 0) { \
BatchwiseMultiplicativeDropout_bp< \
T, NTX, \
NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>( \
input_features.data<T>(), d_input_features.data<T>(), \
d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes, \
nPlanes, nPlanes, alpha); \
return; \
} \
}
extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
THCTensor *input_features, THCTensor *d_input_features,
THCTensor *d_output_features, THCTensor *noise, float alpha) {
if (d_input_features != d_output_features)
THCTensor_(resizeAs)(state, d_input_features, d_output_features);
auto nActive = input_features->size[0];
auto nPlanes = input_features->size[1];
template <typename T>
void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor noise, float alpha) {
d_input_features.resize_as_(d_output_features);
auto nActive = input_features.size(0);
auto nPlanes = input_features.size(1);
SPARSECONVNET_FOO(32, 32)
SPARSECONVNET_FOO(24, 32)
@@ -71,5 +67,3 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
SPARSECONVNET_FOO(1, 64)
}
#undef SPARSECONVNET_FOO
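// ---------------------------------------------------------------------------
// Editor's note (CPU reference sketch, not part of the commit): what the
// BatchwiseMultiplicativeDropout_fp kernel computes. Each feature plane p is
// scaled by one shared noise value, and negative inputs are additionally
// scaled by alpha (leaky-ReLU-style gating). Contiguous layout assumed
// (input_stride == output_stride == nPlanes), as in the calls above.
template <typename T>
void dropout_fp_reference(const T *in, T *out, const T *noise, int nActive,
                          int nPlanes, T alpha) {
  for (int row = 0; row < nActive; ++row)
    for (int p = 0; p < nPlanes; ++p) {
      T x = in[row * nPlanes + p];
      out[row * nPlanes + p] = x * noise[p] * (x > 0 ? T(1) : alpha);
    }
}
// ---------------------------------------------------------------------------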
@@ -4,50 +4,50 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
#define CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
template <typename T, Int NTX, Int NTY>
__global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
T *output_features, T *noise,
Int nActive, Int nPlanes,
Int input_stride,
Int output_stride, T alpha) {
__shared__ T nz[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads();
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
output_features[o] = input_features[i] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads();
}
}
template <typename T, Int NTX, Int NTY>
__global__ void
BatchwiseMultiplicativeDropout_bp(T *input_features, T *d_input_features,
T *d_output_features, T *noise, Int nActive,
Int nPlanes, Int input_stride,
Int output_stride, T alpha) {
__shared__ T nz[NTX];
for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
plane += gridDim.x * NTX) {
if (threadIdx.y == 0)
nz[threadIdx.x] = noise[plane];
__syncthreads();
for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
row += gridDim.y * NTY) {
Int i = row * input_stride + plane;
Int o = row * output_stride + plane;
d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
((input_features[i] > 0) ? 1 : alpha);
}
__syncthreads();
}
}
#endif /* CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "RuleBookIterator.h"
template <typename T, Int Dimension>
double cuda_Convolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias) {
auto _rules =
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (not bias.numel())
output_features.zero_();
double flops = 0;
if (nActive) {
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int ip = input_features.size(1);
Int op = output_features.size(1);
auto w = weight.data<T>();
if (bias.numel()) {
auto b = bias.data<T>();
for (Int i = 0; i < op; i += 32) {
Int blockDim = min((Int)32, op - i);
Int gridDim = min((Int)4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
nActive);
}
}
Int c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
, w += c; flops += nHotB * c;)
}
return flops;
}
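// ---------------------------------------------------------------------------
// Editor's note (sketch, not part of the commit): how the returned flop count
// is built up. RULEBOOKITERATOR runs the forward GEMM once per filter offset;
// each offset contributes nHotB active input/output pairs at a cost of
// ip * op multiply-accumulates per pair, while w advances by ip * op so each
// offset uses its own weight slice. Equivalent accounting, assuming an array
// of per-offset pair counts:
inline double convolution_flops_example(const Int *nHotPerOffset, Int nOffsets,
                                        Int ip, Int op) {
  double flops = 0;
  for (Int k = 0; k < nOffsets; ++k)
    flops += static_cast<double>(nHotPerOffset[k]) * ip * op;
  return flops;
}
// ---------------------------------------------------------------------------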
template <typename T, Int Dimension>
void cuda_Convolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules =
m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive) {
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int ip = input_features.size(1);
Int op = d_output_features.size(1);
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
Int c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
nHotB, ip, ip, op, op);
, w += c; dw += c;)
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
}
}
}
template <typename T, Int Dimension>
double cuda_SubmanifoldConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias) {
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize);
output_features.resize_({nActive, weight.size(2)});
if (bias.numel() and nActive)
output_features.copy_(bias);
else
output_features.zero_();
double flops = 0;
if (nActive) {
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int ip = input_features.size(1);
Int op = output_features.size(1);
auto w = weight.data<T>();
// if (bias.numel()) {
// auto b = bias.data<T>();
// for (Int i = 0; i < op; i += 32) {
// Int blockDim = min((Int)32, op - i);
// Int gridDim = min((Int)4096, nActive);
// Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
// nActive);
// }
// }
Int c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
, w += c; flops += nHotB * c;)
}
return flops;
}
template <typename T, Int Dimension>
void cuda_SubmanifoldConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
Int nActive = m.getNActive(inputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive) {
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int ip = input_features.size(1);
Int op = d_output_features.size(1);
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
Int c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
nHotB, ip, ip, op, op);
, w += c; dw += c;)
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
}
}
}
template <typename T, Int Dimension>
double cuda_FullConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias) {
auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
filterSize, filterStride, mOut);
Int nActive = mOut.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (not bias.numel())
output_features.zero_();
double flops = 0;
if (nActive) {
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int ip = input_features.size(1);
Int op = output_features.size(1);
auto w = weight.data<T>();
if (bias.numel()) {
auto b = bias.data<T>();
for (Int i = 0; i < op; i += 32) {
Int blockDim = min((Int)32, op - i);
Int gridDim = min((Int)4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
nActive);
}
}
Int c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
, w += c; flops += nHotB * c;)
}
return flops;
}
template <typename T, Int Dimension>
void cuda_FullConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
Metadata<Dimension> &mOut,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
filterSize, filterStride, mOut);
Int nActive = mOut.getNActive(outputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive) {
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int ip = input_features.size(1);
Int op = d_output_features.size(1);
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
Int c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
nHotB, ip, ip, op, op);
, w += c; dw += c;)
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
}
}
}
template <typename T, Int Dimension>
double cuda_RandomizedStrideConvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (not bias.numel())
output_features.zero_();
double flops = 0;
if (nActive) {
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int ip = input_features.size(1);
Int op = output_features.size(1);
auto w = weight.data<T>();
if (bias.numel()) {
auto b = bias.data<T>();
for (Int i = 0; i < op; i += 32) {
Int blockDim = min((Int)32, op - i);
Int gridDim = min((Int)4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
nActive);
}
}
Int c = ip * op;
RULEBOOKITERATOR(
dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
, w += c; flops += nHotB * c;)
}
return flops;
}
template <typename T, Int Dimension>
void cuda_RandomizedStrideConvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
filterStride, true);
Int nActive = m.getNActive(outputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (nActive) {
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int ip = input_features.size(1);
Int op = d_output_features.size(1);
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
Int c = ip * op;
RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
nHotB, ip, ip, op, op);
, w += c; dw += c;)
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
}
}
}
@@ -4,47 +4,47 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_CONVOLUTION_H
#define CUDA_CONVOLUTION_H
template <typename T>
__global__ void Convolution_fp_bias(T *output_features, T *bias, Int nPlanes,
Int output_stride, Int nActive) {
__shared__ T b[32];
b[threadIdx.x] = bias[threadIdx.x];
for (Int row = blockIdx.x; row < nActive; row += 1 << 12) {
output_features[row * output_stride + threadIdx.x] = b[threadIdx.x];
}
}
template <typename T>
__global__ void dColumnSum(T *matrix, T *target, Int nRows, Int nColumns,
Int nCOLUMNS) {
Int i = blockIdx.x * 32 + threadIdx.x;
T t = 0;
for (Int j = blockIdx.y; j < nRows; j += 32)
t += matrix[j * nCOLUMNS + i];
atomicAdd(&target[i], t);
}
template <typename T>
void Convolution_bp_bias(T *matrix, T *target, Int nRows, Int nColumns,
Int nCOLUMNS) {
if (nColumns / 32 > 0)
dColumnSum<<<dim3(nColumns / 32, 32), 32>>>(
matrix, target, nRows, nColumns, nCOLUMNS);
if (nColumns % 32 > 0) {
Int o = nColumns / 32 * 32;
dColumnSum<<<dim3(1, 32), nColumns - o>>>(
matrix + o, target + o, nRows, nColumns, nCOLUMNS);
}
}
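// ---------------------------------------------------------------------------
// Editor's note (CPU reference sketch, not part of the commit): the bias
// gradient is a column-wise sum of the output-gradient matrix. dColumnSum
// above parallelises this over 32-column blocks plus a remainder launch; the
// sequential equivalent is:
template <typename T>
void bp_bias_reference(const T *matrix, T *target, Int nRows, Int nColumns,
                       Int rowStride /* = nCOLUMNS above */) {
  for (Int j = 0; j < nColumns; ++j)
    for (Int i = 0; i < nRows; ++i)
      target[j] += matrix[i * rowStride + j];
}
// ---------------------------------------------------------------------------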
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// nHot must be a multiple of K!!
// Input x Weight -> Output
@@ -53,17 +53,17 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -76,7 +76,7 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for (int v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R0[v] = rules[2 * (s + ty[v])];
@@ -110,28 +110,28 @@ dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
inFeatures += K;
}
}
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -144,7 +144,7 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for (int v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
@@ -187,27 +187,27 @@ dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules, o, input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_forwardB< \
T, K, \
V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
@@ -216,10 +216,9 @@ void dConvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
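// ---------------------------------------------------------------------------
// Editor's note (sketch, not part of the commit): the shape of the FOO launch
// above. Kernel A assumes its row count is a multiple of K and skips all
// bounds checks; kernel B mops up the nHot % K remainder with per-row guards,
// so the fast path stays branch-free.
inline void ab_split_example(Int nHot, Int K) {
  Int o = (nHot / K) * K; // rows covered by the unguarded kernel A
  if (o >= K) {
    // launch ..._forwardA on rows [0, o)
  }
  if (nHot > o) {
    // launch ..._forwardB on rows [o, nHot)
  }
}
// ---------------------------------------------------------------------------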
template <>
void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
@@ -230,15 +229,15 @@ void dConvolution_forward<double>(double *inFeatures, double *outFeatures,
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
@@ -249,8 +248,8 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -265,7 +264,7 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R0[v] = rules[2 * (s + ty[v])];
@@ -307,15 +306,15 @@ dConvolution_KMxKN_backward_dW_A(T *inFeatures, T *dInFeatures, T *dOutFeatures,
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
@@ -326,8 +325,8 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -342,7 +341,7 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
@@ -392,29 +391,29 @@ dConvolution_KMxKN_backward_dW_B(T *inFeatures, T *dInFeatures, T *dOutFeatures,
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_backward_dW_B< \
T, K, \
V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
@@ -422,11 +421,11 @@ void dConvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
}
#undef FOO
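// ---------------------------------------------------------------------------
// Editor's note (CPU reference sketch, not part of the commit): the two
// products the backward kernels fuse, for one filter offset. rules holds
// (input row, output row) pairs; per pair, dInput += dOutput * W^T and
// dW += Input^T * dOutput. ip/op double as strides here (contiguous case).
template <typename T>
void conv_backward_reference(const T *in, T *dIn, const T *dOut, const T *w,
                             T *dw, const Int *rules, Int nHot, Int ip,
                             Int op) {
  for (Int s = 0; s < nHot; ++s) {
    const T *x = in + rules[2 * s] * ip;      // active input row
    T *dx = dIn + rules[2 * s] * ip;
    const T *dy = dOut + rules[2 * s + 1] * op; // matching output row
    for (Int i = 0; i < ip; ++i)
      for (Int j = 0; j < op; ++j) {
        dx[i] += dy[j] * w[i * op + j];   // dInput = dOutput * W^T
        dw[i * op + j] += x[i] * dy[j];   // dW = Input^T * dOutput
      }
  }
}
// ---------------------------------------------------------------------------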
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
@@ -434,17 +433,17 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
Int M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int KO = min(K, output_nPlanes - K * n);
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -452,7 +451,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
Int KI = min(K, input_nPlanes - K * m);
// Read w
#pragma unroll
@@ -460,7 +459,7 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
#pragma unroll
for (int v = 0; v < V; v++) {
@@ -500,48 +499,47 @@ dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
template <typename T>
void dConvolution_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
dConvolution_KMxKN_forward2<
T, K, V><<<dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rules, nHot, input_nPlanes, input_stride,
output_nPlanes, output_stride);
return;
} else {
dConvolution_forward(inFeatures, outFeatures, w, rules, nHot, input_nPlanes,
input_stride, output_nPlanes, output_stride);
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void
dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m);
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -549,7 +547,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
Int KO = min(K, output_nPlanes - K * n);
// Read w, reset dW
#pragma unroll
@@ -559,7 +557,7 @@ dConvolution_KMxKN_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for (int v = 0; v < V; v++) {
@@ -613,23 +611,22 @@
template <typename T>
void dConvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
dConvolution_KMxKN_backward_dW2<T, K, V><<<
dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
input_nPlanes, input_stride, output_nPlanes, output_stride);
return;
} else {
dConvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
rules, nHot, input_nPlanes, input_stride,
output_nPlanes, output_stride);
}
}
#endif /* CUDA_CONVOLUTION_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include "Deconvolution.h"
template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
/*cuda float*/ at::Tensor bias) {
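// Editor's note: unlike cuda_Convolution_updateOutput, the rulebook here is
// built with outputSize and inputSize swapped; a deconvolution applies the
// transposed connectivity of the corresponding convolution.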
auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, weight.size(2)});
if (not bias.numel())
output_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int ip = input_features.size(1);
Int op = output_features.size(1);
auto w = weight.data<T>();
double flops = 0;
if (bias.numel()) {
auto b = bias.data<T>();
for (Int i = 0; i < op; i += 32) {
Int blockDim = min((Int)32, op - i);
Int gridDim = min((Int)4096, nActive);
Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
nActive);
}
}
Int c = ip * op;
RULEBOOKITERATOR(
dDeconvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
, w += c; flops += nHotB * c;)
return flops;
}
template <typename T, Int Dimension>
void cuda_Deconvolution_backward(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor filterSize,
/*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
/*cuda float*/ at::Tensor d_bias) {
auto _rules =
m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
Int nActive = m.getNActive(outputSize);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int ip = input_features.size(1);
Int op = d_output_features.size(1);
auto w = weight.data<T>();
auto dw = d_weight.data<T>();
Int c = ip * op;
RULEBOOKITERATOR(dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
nHotB, ip, ip, op, op);
, w += c; dw += c;)
if (d_bias.numel()) {
auto db = d_bias.data<T>();
Convolution_bp_bias(doF, db, op, op, nActive);
}
}
@@ -4,16 +4,16 @@
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_DECONVOLUTION_H
#define CUDA_DECONVOLUTION_H
#include "Convolution.h"
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// nHot must be a multiple of K!!
// Input x Weight -> Output
@@ -22,17 +22,17 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -45,7 +45,7 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for (int v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R1[v] = rules[2 * (s + ty[v])];
@@ -79,28 +79,28 @@ dDeconvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w, uInt *rules,
inFeatures += K;
}
}
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
Int M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -113,7 +113,7 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
for (int v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
@@ -156,27 +156,27 @@ dDeconvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w, uInt *rules,
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_forwardA< \
T, K, V><<<dim3(std::min(o / K, (Int)512), output_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules, o, input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_forwardB< \
T, K, \
V><<<dim3(1, output_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, outFeatures, w, rules + 2 * o, nHot - o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
return; \
} \
}
template <typename T>
void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
@@ -185,10 +185,9 @@ void dDeconvolution_forward(T *inFeatures, T *outFeatures, T *w, uInt *rules,
}
template <>
void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
double *w, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
@@ -199,14 +198,14 @@ void dDeconvolution_forward<double>(double *inFeatures, double *outFeatures,
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
@@ -217,8 +216,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -233,7 +232,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
R1[v] = rules[2 * (s + ty[v])];
@@ -275,14 +274,14 @@ __global__ void dDeconvolution_KMxKN_backward_dW_A(
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = output_nPlanes / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
@@ -293,8 +292,8 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
Int R0[V];
Int R1[V];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
@@ -309,7 +308,7 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
@@ -359,29 +358,29 @@ __global__ void dDeconvolution_KMxKN_backward_dW_B(
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
Int o = (nHot / K) * K; \
if (o >= K) \
dDeconvolution_KMxKN_backward_dW_A< \
T, K, V><<<dim3(std::min(o / K, (Int)512), input_nPlanes / K), \
dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dDeconvolution_KMxKN_backward_dW_B< \
T, K, \
V><<<dim3(1, input_nPlanes / K), dim3(K, K / V)>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rules + 2 * o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
@@ -389,11 +388,11 @@ void dDeconvolution_backward_dW(T *inFeatures, T *dInFeatures, T *dOutFeatures,
}
#undef FOO
template <typename T, Int K, Int V>
__global__ void
dDeconvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,1) Volkov-blocks
// K is a multiple of V,
// ...
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
Int M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K
Int n = blockIdx.y;
outFeatures += n * K;
w += n * K;
Int KO = min(K, output_nPlanes - K * n);
T O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
// ...
ty[v] = threadIdx.y + v * (K / V);
for (int m = 0; m < M; m++) {
Int KI = min(K, input_nPlanes - K * m);
// Read w
#pragma unroll
// ...
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
#pragma unroll
for (int v = 0; v < V; v++) {
// ...
}
template <typename T>
void dDeconvolution_forward2(T *inFeatures, T *outFeatures, T *w, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
dDeconvolution_KMxKN_forward2<T, K, V><<<
dim3(128, (output_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, outFeatures, w, rules, nHot, input_nPlanes, input_stride,
output_nPlanes, output_stride);
return;
} else {
dDeconvolution_forward(inFeatures, outFeatures, w, rules, nHot,
input_nPlanes, input_stride, output_nPlanes,
output_stride);
}
}
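// Editor's note: this wrapper chooses between two paths. When either plane
// count is not a multiple of 8, the padded forward2 kernel above (K=16, V=4)
// handles ragged tile edges via its min(K, ...) clamps; otherwise the tiled
// dDeconvolution_forward fast path applies.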
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,1)
template <typename T, Int K, Int V>
__global__ void dDeconvolution_KMxKN_backward_dW2(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw, Int *rules,
Int nHot, Int input_nPlanes, Int input_stride, Int output_nPlanes,
Int output_stride) {
// M = gridDim.y == input_nPlanes / K
Int N = (output_nPlanes + K - 1) / K;
Int m = blockIdx.y;
inFeatures += m * K;
dInFeatures += m * K;
w += m * K * output_nPlanes;
dw += m * K * output_nPlanes;
Int KI = min(K, input_nPlanes - K * m);
T dI[V];
T dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
__shared__ Int R[K * 2];
const int tx = threadIdx.x;
int ty[V];
#pragma unroll
// ...
ty[v] = threadIdx.y + v * (K / V);
for (int n = 0; n < N; n++) {
Int KO = min(K, output_nPlanes - K * n);
// Read w, reset dW
#pragma unroll
// ...
dW[v] = 0;
}
for (Int s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for (int v = 0; v < V; v++) {
// ...
template <typename T>
void dDeconvolution_backward_dW2(T *inFeatures, T *dInFeatures, T *dOutFeatures,
T *w, T *dw, Int *rules, Int nHot,
Int input_nPlanes, Int input_stride,
Int output_nPlanes, Int output_stride) {
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int K = 16;
const int V = 4;
dDeconvolution_KMxKN_backward_dW2<T, K, V><<<
dim3(128, (input_nPlanes + K - 1) / K), dim3(K, K / V)>>>(
inFeatures, dInFeatures, dOutFeatures, w, dw, rules, nHot,
input_nPlanes, input_stride, output_nPlanes, output_stride);
return;
} else {
dDeconvolution_backward_dW(inFeatures, dInFeatures, dOutFeatures, w, dw,
rules, nHot, input_nPlanes, input_stride,
output_nPlanes, output_stride);
}
}
#endif /* CUDA_DECONVOLUTION_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "IOLayers.h"
template <typename T, Int Dimension>
void cuda_InputLayer_updateOutput(Metadata<Dimension> &m,
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor input_coords,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
long batchSize, long mode) {
m.inputLayer(spatialSize, input_coords, batchSize, mode);
Int nPlanes = input_features.size(1);
auto &rules = m.inputLayerRuleBook;
Int maxActive = rules[0][1];
Int nRows = rules[0][3];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
} else {
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
}
}
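// Editor's sketch of the assumed rulebook layout, inferred from the
// InputLayer_fp kernel in IOLayers.h: rules[1] flattens nRows records of
// 1 + maxActive ints each,
//   [nActive, inRow_0, ..., inRow_{nActive-1}, <unused padding>]
// and output row r is the sum (or, for mode == 4, the mean) of the listed
// input rows.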
template <typename T, Int Dimension>
void cuda_InputLayer_updateGradInput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
auto &rules = m.inputLayerRuleBook;
Int nPlanes = d_output_features.size(1);
auto mode = rules[0][0];
Int maxActive = rules[0][1];
Int nRows = rules[0][3];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
} else {
d_input_features.resize_({rules[0][2], nPlanes});
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
}
}
template <typename T, Int Dimension>
void cuda_OutputLayer_updateOutput(Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features) {
auto &rules = m.inputLayerRuleBook;
Int nPlanes = input_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
} else {
output_features.resize_({rules[0][2], nPlanes});
output_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
oF, iF, nRows, maxActive, nPlanes, rb, false);
}
}
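// Editor's note: the output layer reuses the input layer's kernels with the
// two feature buffers swapped: InputLayer_bp scatters in this forward pass,
// and InputLayer_fp gathers in the backward pass below, so no dedicated
// OutputLayer kernels are needed.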
template <typename T, Int Dimension>
void cuda_OutputLayer_updateGradInput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
auto &rules = m.inputLayerRuleBook;
Int nPlanes = d_output_features.size(1);
auto mode = rules[0][0];
auto maxActive = rules[0][1];
auto nRows = rules[0][3];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
} else {
d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
doF, diF, nRows, maxActive, nPlanes, rb, false);
}
}
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateOutput(Metadata<Dimension> &m,
/*long*/ at::Tensor spatialSize,
/*long*/ at::Tensor input_coords,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
long mode) {
m.blLayer(spatialSize, input_coords, mode);
Int nPlanes = input_features.size(2);
output_features.resize_({*m.inputNActive, nPlanes});
output_features.zero_();
auto &rules = m.blLayerRuleBook;
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
output_features.resize_({*m.inputNActive, nPlanes});
} else {
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
}
}
template <typename T, Int Dimension>
void cuda_BLInputLayer_updateGradInput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
auto &rules = m.blLayerRuleBook;
Int nPlanes = d_output_features.size(1);
Int mode = rules[0][0];
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
} else {
d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
}
}
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateOutput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features) {
auto &rules = m.blLayerRuleBook;
Int nPlanes = input_features.size(1);
auto mode = rules[0][0];
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
output_features.resize_as_(input_features);
output_features.copy_(input_features);
output_features.resize_({rules[0][2], rules[0][3], nPlanes});
} else {
output_features.resize_({rules[0][2], rules[0][3], nPlanes});
output_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_bp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
oF, iF, nRows, maxActive, nPlanes, rb, false);
}
}
template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateGradInput(
Metadata<Dimension> &m,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
auto &rules = m.blLayerRuleBook;
Int nPlanes = d_output_features.size(2);
Int mode = rules[0][0];
Int maxActive = rules[0][1];
Int nRows = rules[0][4];
if (mode == 0) {
d_input_features.resize_as_(d_output_features);
d_input_features.copy_(d_output_features);
d_input_features.resize_({nRows, nPlanes});
} else {
d_input_features.resize_({nRows, nPlanes});
d_input_features.zero_();
auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
Int *rb = rulesBuffer.data<Int>();
cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
cudaMemcpyHostToDevice);
InputLayer_fp<
T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
doF, diF, nRows, maxActive, nPlanes, rb, false);
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_IOLAYERS_H
#define CUDA_IOLAYERS_H
template <typename T>
__global__ void InputLayer_fp(T *input_features, T *output_features,
Int nRows, Int maxActive, Int nPlanes,
Int *rules, bool average) {
for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
T *out = output_features + row * nPlanes;
Int *r = rules + row * (1 + maxActive);
Int nActive = r[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (int i = 1; i <= nActive; i++) {
T *inp = input_features + r[i] * nPlanes;
for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
out[plane] += multiplier * inp[plane];
}
}
}
template <typename T>
__global__ void InputLayer_bp(T *d_input_features, T *d_output_features,
Int nRows, Int maxActive, Int nPlanes,
Int *rules, bool average) {
for (int row = blockIdx.x; row < nRows; row += gridDim.x) {
T *out = d_output_features + row * nPlanes;
Int *r = rules + row * (1 + maxActive);
Int nActive = r[0];
T multiplier = (average and nActive > 0) ? 1.0f / nActive : 1.0f;
for (int i = 1; i <= nActive; i++) {
T *inp = d_input_features + r[i] * nPlanes;
for (Int plane = threadIdx.x; plane < nPlanes; plane += blockDim.x)
atomicAdd(&inp[plane], multiplier * out[plane]);
}
}
}
#endif /* CUDA_IOLAYERS_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "LeakyReLU.h"
template <typename T>
void cuda_LeakyReLU_updateOutput(/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
float alpha) {
output_features.resize_as_(input_features);
auto n = input_features.numel();
LeakyReLU_fp<T><<<16, 1024>>>(input_features.data<T>(),
output_features.data<T>(), n, alpha);
}
template <typename T>
void cuda_LeakyReLU_updateGradInput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, float alpha) {
d_input_features.resize_as_(d_output_features);
auto n = d_input_features.numel();
LeakyReLU_bp<T><<<16, 1024>>>(input_features.data<T>(),
d_input_features.data<T>(),
d_output_features.data<T>(), n, alpha);
}
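// Editor's sketch (not part of the source): the kernels below compute
//   f(x)  = x > 0 ? x : alpha * x
//   f'(x) = x > 0 ? 1 : alpha
// with a grid-stride loop whose hard-wired stride, 16 * 1024, matches the
// fixed <<<16, 1024>>> launch used above (16 blocks of 1024 threads).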
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef LEAKYRELU_H
#define LEAKYRELU_H
template <typename T>
__global__ void LeakyReLU_fp(T *input_features, T *output_features, Int n,
T alpha) {
for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
output_features[i] = (input_features[i] > 0) ? input_features[i]
: (input_features[i] * alpha);
}
template <typename T>
__global__ void LeakyReLU_bp(T *input_features, T *d_input_features,
T *d_output_features, Int n, T alpha) {
for (Int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += 16 * 1024)
d_input_features[i] = (input_features[i] > 0)
? d_output_features[i]
: (d_output_features[i] * alpha);
}
#endif /* LEAKYRELU_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "MaxPooling.h"
#include "RuleBookIterator.h"
template <typename T, Int Dimension>
void cuda_MaxPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, nPlanes});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
RULEBOOKITERATOR(
cuda_MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
output_features.size(1), rbB, nHotB);
, )
}
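// Editor's note: nFeaturesToDrop shifts the input pointer past the first
// planes while the kernel keeps indexing rows with the full stride
// input_features.size(1), so only the remaining nPlanes features are pooled.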
template <typename T, Int Dimension>
void cuda_MaxPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(inputSize, outputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
RULEBOOKITERATOR(cuda_MaxPooling_BackwardPass<T>(
iF, diF, oF, doF, nPlanes, input_features.size(1),
d_output_features.size(1), rbB, nHotB);
, )
}
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, nPlanes});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
RULEBOOKITERATOR(
cuda_MaxPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
output_features.size(1), rbB, nHotB);
, )
}
template <typename T, Int Dimension>
void cuda_RandomizedStrideMaxPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, poolSize,
poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
RULEBOOKITERATOR(cuda_MaxPooling_BackwardPass<T>(
iF, diF, oF, doF, nPlanes, input_features.size(1),
d_output_features.size(1), rbB, nHotB);
, )
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_MAXPOOLING_H
#define CUDA_MAXPOOLING_H
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_fp(T *input_features, T *output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX) {
T inp = input_features[i + plane];
if (output_features[o + plane] < inp)
output_features[o + plane] = inp;
// ...
}
template <typename T>
void cuda_MaxPooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
MaxPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, output_features, nPlanes, input_stride, output_stride,
rules, nHot);
}
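// Editor's sketch of the launch geometry above (an assumption matching the
// <32, 32> template arguments): blockDim = (NTX, NTY) = (32, 32), so each of
// the 32 blocks stages NTY (input, output) rule pairs in shared memory per
// iteration while threadIdx.x sweeps the planes of each pair in steps of NTX.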
template <typename T, Int NTX, Int NTY>
__global__ void MaxPooling_bp(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride, Int output_stride,
Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y] * input_stride;
Int o = r[2 * threadIdx.y + 1] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
if (output_features[o + plane] == input_features[i + plane])
d_input_features[i + plane] += d_output_features[o + plane];
}
// ...
}
template <typename T>
void cuda_MaxPooling_BackwardPass(T *input_features, T *d_input_features,
T *output_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
MaxPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, d_input_features, output_features, d_output_features,
nPlanes, input_stride, output_stride, rules, nHot);
}
#endif /* CUDA_MAXPOOLING_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "Convolution.h"
#include <algorithm>
template <typename T>
double cuda_NetworkInNetwork_updateOutput(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features,
/*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
auto nActive = input_features.size(0);
auto input_nPlanes = weight.size(0);
auto output_nPlanes = weight.size(1);
  output_features.resize_({nActive, output_nPlanes});
if (bias.numel())
output_features.copy_(bias);
else
output_features.zero_();
  output_features.addmm_(input_features, weight);
return nActive * input_nPlanes * output_nPlanes;
}
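// Editor's sketch (shapes only): with weight shaped
// [input_nPlanes, output_nPlanes] this is a 1x1 network-in-network
// convolution over the active sites,
//   output[nActive, output_nPlanes] = input[nActive, input_nPlanes] @ weight
// with bias, when present, pre-copied into every output row; the returned
// double is presumably the multiply-add count for FLOP accounting.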
template <typename T>
void cuda_NetworkInNetwork_updateGradInput(
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor weight) {
d_input_features.resize_({(int)d_output_features.size(0), weight.size(0)});
d_input_features.zero_();
at::mm_out(d_input_features, d_output_features, weight.t());
}
template <typename T>
void cuda_NetworkInNetwork_accGradParameters(
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_output_features,
/*cuda float*/ at::Tensor d_weight, /*cuda float*/ at::Tensor d_bias) {
auto nActive = input_features.size(0);
if (nActive and d_bias.numel())
at::sum_out(d_bias, d_output_features, {0}, false);
at::mm_out(d_weight, input_features.t(), d_output_features);
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_RULEBOOKITERATOR_H
#define CUDA_RULEBOOKITERATOR_H
// Macro to parallelize loading rulebook elements to CUDA memory and operating
// on the elements of the rulebook.
// X is the function to apply.
// Y is a command to run
#define RULEBOOKITERATOR(X, Y) \
  {                                                                          \
    Int rbMaxSize = 0;                                                       \
    for (auto &r : _rules)                                                   \
      rbMaxSize = std::max(rbMaxSize, (Int)r.size());                        \
    at::Tensor rulesBuffer = at::CUDA(at_kINT).tensor({rbMaxSize});          \
    Int *rbB = rulesBuffer.data<Int>();                                      \
    for (int k = 0; k < _rules.size(); ++k) {                                \
      auto &r = _rules[k];                                                   \
      Int nHotB = r.size() / 2;                                              \
      if (nHotB) {                                                           \
        cudaMemcpy(rbB, &r[0], sizeof(Int) * 2 * nHotB,                      \
                   cudaMemcpyHostToDevice);                                  \
        X                                                                    \
      }                                                                      \
      Y                                                                      \
    }                                                                        \
  }
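// Editor's sketch of how the macro splices its arguments (myKernel and
// rowStride are hypothetical names):
//
//   RULEBOOKITERATOR(myKernel<T>(oF, rbB, nHotB);, oF += rowStride;)
//
// For every non-empty rulebook entry the rule pairs are copied into the
// device buffer rbB and X runs with rbB/nHotB in scope; Y runs after every
// entry, empty or not.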
#endif /* CUDA_RULEBOOKITERATOR_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "SparseToDense.h"
template <typename T, Int Dimension>
void cuda_SparseToDense_updateOutput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nPlanes) {
{
std::array<long, Dimension + 2> sz;
sz[0] = m.grids.begin()->second.size(); // batch size
sz[1] = nPlanes;
long *in_sz = inputSize.data<long>();
for (Int i = 0; i < Dimension; ++i)
sz[i + 2] = in_sz[i];
output_features.resize_(sz);
output_features.zero_();
}
if (input_features.ndimension() == 2) {
auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
Int _nPlanes = input_features.size(1);
auto iF = input_features.data<T>();
auto oF = output_features.data<T>();
long spatialVolume = inputSize.prod().data<long>()[0];
RULEBOOKITERATOR(SparseToDense_ForwardPass<T>( iF, oF, _nPlanes,
spatialVolume, rbB, nHotB);
, oF += _nPlanes * spatialVolume;)
}
}
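// Editor's note: the dense output is laid out [batch, nPlanes, spatial...],
// so the planes of one site sit spatialVolume apart; the Y clause advances
// oF by nPlanes * spatialVolume, apparently stepping to the next sample's
// dense block, one rulebook entry per batch item.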
template <typename T, Int Dimension>
void cuda_SparseToDense_updateGradInput(
/*long*/ at::Tensor inputSize, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features) {
d_input_features.resize_as_(input_features);
d_input_features.zero_();
if (input_features.ndimension() == 2) {
auto _rules = m.getSparseToDenseRuleBook(inputSize, true);
long spatialVolume = inputSize.prod().data<long>()[0];
Int _nPlanes = d_input_features.size(1);
auto diF = d_input_features.data<T>();
auto doF = d_output_features.data<T>();
RULEBOOKITERATOR(SparseToDense_BackwardPass<T>( diF, doF, _nPlanes,
spatialVolume, rbB, nHotB);
, doF += _nPlanes * spatialVolume;)
}
}
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_SPARSETODENSE_H
#define CUDA_SPARSETODENSE_H
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_fp(T *input_features, T *output_features,
Int nPlanes, Int spatialVolume, Int *rules,
Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
T *i = input_features + r[2 * threadIdx.y] * nPlanes;
T *o = output_features + r[2 * threadIdx.y + 1];
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
o[plane * spatialVolume] = i[plane];
}
__syncthreads();
// ...
}
template <typename T>
void SparseToDense_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int spatialVolume, Int *rules,
Int nHot) {
SparseToDense_fp<T, 32, 32><<<32, dim3(32, 32)>>>(
input_features, output_features, nPlanes, spatialVolume, rules, nHot);
}
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void SparseToDense_bp(T *d_input_features, T *d_output_features,
Int nPlanes, Int spatialVolume, Int *rules,
Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
T *d_i = d_input_features + r[2 * threadIdx.y] * nPlanes;
T *d_o = d_output_features + r[2 * threadIdx.y + 1];
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
d_i[plane] = d_o[plane * spatialVolume];
}
__syncthreads();
// ...
}
template <typename T>
void SparseToDense_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int spatialVolume, Int *rules,
Int nHot) {
SparseToDense_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
d_input_features, d_output_features, nPlanes, spatialVolume, rules, nHot);
}
#endif /* CUDA_SPARSETODENSE_H */
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#include "RuleBookIterator.h"
#include "UnPooling.h"
template <typename T, Int Dimension>
void cuda_UnPooling_updateOutput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
Int nActive = m.getNActive(outputSize);
output_features.resize_({nActive, input_features.size(1) - nFeaturesToDrop});
output_features.zero_();
auto iF = input_features.data<T>() + nFeaturesToDrop;
auto oF = output_features.data<T>();
RULEBOOKITERATOR(
cuda_UnPooling_ForwardPass<T>(iF, oF, nPlanes, input_features.size(1),
output_features.size(1), rbB, nHotB);
, )
}
template <typename T, Int Dimension>
void cuda_UnPooling_updateGradInput(
/*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
/*long*/ at::Tensor poolSize,
/*long*/ at::Tensor poolStride, Metadata<Dimension> &m,
/*cuda float*/ at::Tensor input_features,
/*cuda float*/ at::Tensor d_input_features,
/*cuda float*/ at::Tensor d_output_features, long nFeaturesToDrop) {
Int nPlanes = input_features.size(1) - nFeaturesToDrop;
auto _rules =
m.getRuleBook(outputSize, inputSize, poolSize, poolStride, true);
d_input_features.resize_as_(input_features);
d_input_features.zero_();
auto diF = d_input_features.data<T>() + nFeaturesToDrop;
auto doF = d_output_features.data<T>();
RULEBOOKITERATOR(
cuda_UnPooling_BackwardPass<T>(diF, doF, nPlanes, input_features.size(1),
d_output_features.size(1), rbB, nHotB);
, )
}
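// Editor's note: unpooling replays the pooling rulebook in reverse; the
// rulebook is built with outputSize and inputSize swapped relative to
// MaxPooling above, and UnPooling_fp reads r[2*ty+1] as the input row and
// r[2*ty] as the output row, scattering each pooled row back out.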
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.
#ifndef CUDA_UNPOOLING_H
#define CUDA_UNPOOLING_H
// NTX must be >=2 so r is filled properly
template <typename T, Int NTX, Int NTY>
__global__ void UnPooling_fp(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y + 1] * input_stride;
Int o = r[2 * threadIdx.y] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
output_features[o + plane] += input_features[i + plane];
}
__syncthreads();
}
}
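// Editor's sketch of the shared staging above (hence the "NTX must be >=2"
// note): the first 2 * NTY linear thread ids copy this tile's rule pairs
// into r[], guarded by i < 2 * (nHot - n) so the ragged final tile never
// reads past the end of the rulebook.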
template <typename T>
void cuda_UnPooling_ForwardPass(T *input_features, T *output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
UnPooling_fp<T, 32, 32><<<32, dim3(32, 32)>>>(input_features, output_features,
nPlanes, input_stride,
output_stride, rules, nHot);
}
template <typename T, Int NTX, Int NTY>
__global__ void UnPooling_bp(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
__shared__ Int r[NTY * 2];
for (Int n = blockIdx.x * NTY; n < nHot; n += gridDim.x * NTY) {
{
Int i = threadIdx.x + NTX * threadIdx.y;
if (i < NTY * 2 and i < 2 * (nHot - n))
r[i] = rules[2 * n + i];
}
__syncthreads();
if (n + threadIdx.y < nHot) {
Int i = r[2 * threadIdx.y + 1] * input_stride;
Int o = r[2 * threadIdx.y] * output_stride;
for (Int plane = threadIdx.x; plane < nPlanes; plane += NTX)
d_input_features[i + plane] += d_output_features[o + plane];
}
__syncthreads();
}
}
template <typename T>
void cuda_UnPooling_BackwardPass(T *d_input_features, T *d_output_features,
Int nPlanes, Int input_stride,
Int output_stride, Int *rules, Int nHot) {
UnPooling_bp<T, 32, 32><<<32, dim3(32, 32)>>>(
d_input_features, d_output_features, nPlanes, input_stride, output_stride,
rules, nHot);
}
#endif /* CUDA_UNPOOLING_H */