Commit 2c4ed608 authored by Benjamin Thomas Graham

Goodbye THNN. Hello ATen!

parent 6d4475db
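The port follows a single pattern throughout the diff: generic THNN entry points that took THCTensor* arguments and a global THCState become ordinary C++ function templates over typed at::Tensor arguments. Condensed from the BatchwiseMultiplicativeDropout hunks below:

// Before (THC generic, stamped out per scalar type via the scn_R_ macro):
//   extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
//       THCTensor *input_features, THCTensor *output_features,
//       THCTensor *noise, float alpha);
// After (ATen, a plain function template):
//   template <typename T>
//   void cuda_BatchwiseMultiplicativeDropout_updateOutput(
//       /*cuda float*/ at::Tensor input_features,
//       /*cuda float*/ at::Tensor output_features,
//       /*cuda float*/ at::Tensor noise, float alpha);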
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "BatchNormalization.h"

#define BN_F_MACRO(N)                                                       \
  if (nPlanes % N == 0) {                                                   \
    BatchNormalization_ForwardPass<T, N, 64>(                               \
        input_features.data<T>(), output_features.data<T>(), nPlanes,       \
        input_stride, output_stride, nActive, saveMean.data<T>(),           \
        saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),  \
        OptionalTensorData<T>(weight), OptionalTensorData<T>(bias), eps,    \
        momentum, train, leakiness);                                        \
  }
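BN_F_MACRO deliberately expands to a bare if block, so the call sites below can chain invocations with else: the first (largest) tile width N that divides nPlanes is the one that runs. For instance, BN_F_MACRO(16) else BN_F_MACRO(1) expands to roughly:

// if (nPlanes % 16 == 0) {
//   BatchNormalization_ForwardPass<T, 16, 64>(/* ... */);
// } else if (nPlanes % 1 == 0) {
//   BatchNormalization_ForwardPass<T, 1, 64>(/* ... */);
// }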
template <typename T>
void cuda_BatchNormalization_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor saveMean,
    /*cuda float*/ at::Tensor saveInvStd, /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias, T eps,
    T momentum, bool train, T leakiness) {
  output_features.resize_as_(input_features);
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    BN_F_MACRO(16)
    else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
  }
}

template <typename T>
void cuda_BatchNormalizationInTensor_updateOutput(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor saveMean,
    /*cuda float*/ at::Tensor saveInvStd, /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias, T eps,
    T momentum, bool train, T leakiness) {
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    BN_F_MACRO(16)
    else BN_F_MACRO(12) else BN_F_MACRO(8) else BN_F_MACRO(4) else BN_F_MACRO(1)
  }
}
#undef BN_F_MACRO

#define BN_B_MACRO(N)                                                       \
  if (nPlanes % N == 0) {                                                   \
    BatchNormalization_BackwardPass<T, N, 64>(                              \
        input_features.data<T>(), d_input_features.data<T>(),               \
        output_features.data<T>(), d_output_features.data<T>(), nPlanes,    \
        input_stride, output_stride, nActive, saveMean.data<T>(),           \
        saveInvStd.data<T>(), runningMean.data<T>(), runningVar.data<T>(),  \
        OptionalTensorData<T>(weight), OptionalTensorData<T>(bias),         \
        OptionalTensorData<T>(d_weight), OptionalTensorData<T>(d_bias),     \
        leakiness);                                                         \
  }

template <typename T>
void cuda_BatchNormalization_backward(
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor saveMean, /*cuda float*/ at::Tensor saveInvStd,
    /*cuda float*/ at::Tensor runningMean,
    /*cuda float*/ at::Tensor runningVar, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias,
    /*cuda float*/ at::Tensor d_weight, /*cuda float*/ at::Tensor d_bias,
    T leakiness) {
  d_input_features.resize_as_(d_output_features);
  if (input_features.ndimension() == 2) {
    auto nActive = input_features.size(0);
    auto nPlanes = input_features.size(1);
    auto input_stride = input_features.stride(0);
    auto output_stride = output_features.stride(0);
    BN_B_MACRO(16)
    else BN_B_MACRO(12) else BN_B_MACRO(8) else BN_B_MACRO(4) else BN_B_MACRO(1)
  }
}
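These functions are now templates defined in a .cu file, so some translation unit in the build must instantiate them for each supported scalar type. This excerpt does not show the instantiation point; a float instantiation would look something like this (illustrative sketch only):

// Hypothetical explicit instantiation, assuming float is the scalar type:
// template void cuda_BatchNormalization_updateOutput<float>(
//     at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor,
//     at::Tensor, at::Tensor, float, float, bool, float);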
@@ -4,9 +4,9 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_BATCHNORMALIZATION_H
-#define GPU_BATCHNORMALIZATION_H
-#include "../SparseConvNet.h"
+#ifndef CUDA_BATCHNORMALIZATION_H
+#define CUDA_BATCHNORMALIZATION_H
 #include <cassert>
 // input_stride and output_stride are normally the same as nPlanes; allow larger
@@ -14,22 +14,22 @@
 // NTX ~ 16 - nPlanes must be a multiple of this
 // NTY ~ 64 - at least 4
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void
-BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
-                           uInt input_stride, uInt output_stride, uInt nActive,
+BatchNormalization_f_train(T *input_features, T *output_features, Int nPlanes,
+                           Int input_stride, Int output_stride, Int nActive,
                            T *saveMean, T *saveInvStd, T *runningMean,
                            T *runningVar, T *weight, T *bias, T eps, T momentum,
                            T leakiness) {
   __shared__ T t[NTY][NTX];
   __shared__ T t2[NTY][NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     t[threadIdx.y][threadIdx.x] = 0;
     t2[threadIdx.y][threadIdx.x] = 0;
-    for (uInt row = threadIdx.y, c = plane + threadIdx.y * input_stride;
+    for (Int row = threadIdx.y, c = plane + threadIdx.y * input_stride;
          row < nActive; row += NTY, c += input_stride * NTY) {
       T i = input_features[c];
       t[threadIdx.y][threadIdx.x] += i;
@@ -38,7 +38,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
     __syncthreads();
     T _saveMean = 0;
     T _saveInvStd = 0;
-    for (uInt row = 0; row < NTY; row++) {
+    for (Int row = 0; row < NTY; row++) {
       _saveMean += t[row][threadIdx.x];
       _saveInvStd += t2[row][threadIdx.x];
     }
@@ -65,7 +65,7 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
     T W = t[0][threadIdx.x];
     T B = t[1][threadIdx.x];
-    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
          row < nActive;
          row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -75,16 +75,16 @@ BatchNormalization_f_train(T *input_features, T *output_features, uInt nPlanes,
     __syncthreads();
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void
-BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
-                          uInt input_stride, uInt output_stride, uInt nActive,
+BatchNormalization_f_test(T *input_features, T *output_features, Int nPlanes,
+                          Int input_stride, Int output_stride, Int nActive,
                           T *saveMean, T *saveInvStd, T *runningMean,
                           T *runningVar, T *weight, T *bias, T eps, T momentum,
                           T leakiness) {
   __shared__ T W[NTX];
   __shared__ T B[NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     if (threadIdx.y == 0) {
       W[threadIdx.x] =
@@ -95,7 +95,7 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
     __syncthreads();
     float w = W[threadIdx.x], b = B[threadIdx.x];
-    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
          row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -106,40 +106,38 @@ BatchNormalization_f_test(T *input_features, T *output_features, uInt nPlanes,
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 void BatchNormalization_ForwardPass(T *input_features, T *output_features,
-                                    uInt nPlanes, uInt input_stride,
-                                    uInt output_stride, uInt nActive,
+                                    Int nPlanes, Int input_stride,
+                                    Int output_stride, Int nActive,
                                     T *saveMean, T *saveInvStd, T *runningMean,
                                     T *runningVar, T *weight, T *bias, T eps,
                                     T momentum, bool train, T leakiness) {
   if (train) {
     BatchNormalization_f_train<
-        T, NTX, NTY><<<std::min((uInt)16, nPlanes / NTX), dim3(NTX, NTY), 0,
-                       THCState_getCurrentStream(state)>>>(
+        T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
         input_features, output_features, nPlanes, input_stride, output_stride,
         nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
         eps, momentum, leakiness);
   } else {
     BatchNormalization_f_test<
-        T, NTX, NTY><<<std::min((uInt)16, nPlanes / NTX), dim3(NTX, NTY), 0,
-                       THCState_getCurrentStream(state)>>>(
+        T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
        input_features, output_features, nPlanes, input_stride, output_stride,
        nActive, saveMean, saveInvStd, runningMean, runningVar, weight, bias,
        eps, momentum, leakiness);
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void
 BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
-                     T *d_output_features, uInt nPlanes, uInt input_stride,
-                     uInt output_stride, uInt nActive, T *saveMean,
+                     T *d_output_features, Int nPlanes, Int input_stride,
+                     Int output_stride, Int nActive, T *saveMean,
                      T *saveInvStd, T *runningMean, T *runningVar, T *weight,
                      T *bias, T *d_weight, T *d_bias, T leakiness) {
   __shared__ T t[NTY][NTX];
   __shared__ T t2[NTY][NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     if (threadIdx.y == 0) {
       t[0][threadIdx.x] = saveMean[plane];
@@ -153,7 +151,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
     __syncthreads();
     t[threadIdx.y][threadIdx.x] = 0;
     t2[threadIdx.y][threadIdx.x] = 0;
-    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
              co = plane + threadIdx.y * output_stride;
          row < nActive;
          row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -180,7 +178,7 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
     T k = dotp * _saveInvStd * _saveInvStd / nActive;
-    for (uInt row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
+    for (Int row = threadIdx.y, ci = plane + threadIdx.y * input_stride,
             co = plane + threadIdx.y * output_stride;
         row < nActive;
         row += NTY, ci += input_stride * NTY, co += output_stride * NTY) {
@@ -192,17 +190,16 @@ BatchNormalization_b(T *input_features, T *d_input_features, T *output_features,
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
                                      T *output_features, T *d_output_features,
-                                     uInt nPlanes, uInt input_stride,
-                                     uInt output_stride, uInt nActive,
+                                     Int nPlanes, Int input_stride,
+                                     Int output_stride, Int nActive,
                                      T *saveMean, T *saveInvStd, T *runningMean,
                                      T *runningVar, T *weight, T *bias,
                                      T *d_weight, T *d_bias, T leakiness) {
-  BatchNormalization_b<T, NTX,
-                       NTY><<<std::min((uInt)16, nPlanes / NTX), dim3(NTX, NTY),
-                              0, THCState_getCurrentStream(state)>>>(
+  BatchNormalization_b<
+      T, NTX, NTY><<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY)>>>(
      input_features, d_input_features, output_features, d_output_features,
      nPlanes, input_stride, output_stride, nActive, saveMean, saveInvStd,
      runningMean, runningVar, weight, bias, d_weight, d_bias, leakiness);
@@ -210,4 +207,4 @@ void BatchNormalization_BackwardPass(T *input_features, T *d_input_features,
 #undef NTX
 #undef NTY
-#endif /* GPU_BATCHNORMALIZATION_H */
+#endif /* CUDA_BATCHNORMALIZATION_H */
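Note one behavioral change hiding in the launch sites above: the old code passed THCState_getCurrentStream(state) as the fourth kernel-launch parameter, while the new code omits it, so these kernels now launch on the default CUDA stream. If the caller's current stream ever needs to be honored again, the fourth parameter can be restored; under later ATen/c10 versions something like at::cuda::getCurrentCUDAStream() could supply it (an assumption for illustration; this commit does not do so):

// Hypothetical stream-aware launch (not part of this commit):
// cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// BatchNormalization_f_train<T, NTX, NTY>
//     <<<std::min((Int)16, nPlanes / NTX), dim3(NTX, NTY), 0, stream>>>(
//         /* same arguments as above */);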
@@ -4,32 +4,28 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/GPU/BatchwiseMultiplicativeDropout.cu"
-#else
 #include "BatchwiseMultiplicativeDropout.h"
 #define SPARSECONVNET_FOO(NTX, NTY) \
   { \
     if (nPlanes % NTX == 0) { \
-      BatchwiseMultiplicativeDropout_fp<real, NTX, NTY> << < \
-          dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0, \
-          THCState_getCurrentStream(state)>>> \
-          (THCTensor_(data)(state, input_features), \
-           THCTensor_(data)(state, output_features), \
-           THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes, nPlanes, \
-           alpha); \
+      BatchwiseMultiplicativeDropout_fp< \
+          T, NTX, \
+          NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>( \
+          input_features.data<T>(), output_features.data<T>(), \
+          noise.data<T>(), nActive, nPlanes, nPlanes, nPlanes, alpha); \
       return; \
     } \
   }
-extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
-    THCTensor *input_features, THCTensor *output_features, THCTensor *noise,
+template <typename T>
+void cuda_BatchwiseMultiplicativeDropout_updateOutput(
+    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor noise,
     float alpha) {
-  if (input_features != output_features)
-    THCTensor_(resizeAs)(state, output_features, input_features);
-  auto nActive = input_features->size[0];
-  auto nPlanes = input_features->size[1];
+  output_features.resize_as_(input_features);
+  auto nActive = input_features.size(0);
+  auto nPlanes = input_features.size(1);
   SPARSECONVNET_FOO(32, 32)
   SPARSECONVNET_FOO(24, 32)
   SPARSECONVNET_FOO(16, 64)
@@ -43,24 +39,24 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateOutput)(
 #define SPARSECONVNET_FOO(NTX, NTY) \
   { \
     if (nPlanes % NTX == 0) { \
-      BatchwiseMultiplicativeDropout_bp<real, NTX, NTY> << < \
-          dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY), 0, \
-          THCState_getCurrentStream(state)>>> \
-          (THCTensor_(data)(state, input_features), \
-           THCTensor_(data)(state, d_input_features), \
-           THCTensor_(data)(state, d_output_features), \
-           THCTensor_(data)(state, noise), nActive, nPlanes, nPlanes, nPlanes, \
-           alpha); \
+      BatchwiseMultiplicativeDropout_bp< \
+          T, NTX, \
+          NTY><<<dim3(std::min(16L, nPlanes / NTX), 16), dim3(NTX, NTY)>>>( \
+          input_features.data<T>(), d_input_features.data<T>(), \
+          d_output_features.data<T>(), noise.data<T>(), nActive, nPlanes, \
+          nPlanes, nPlanes, alpha); \
      return; \
    } \
  }
-extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
-    THCTensor *input_features, THCTensor *d_input_features,
-    THCTensor *d_output_features, THCTensor *noise, float alpha) {
-  if (d_input_features != d_output_features)
-    THCTensor_(resizeAs)(state, d_input_features, d_output_features);
-  auto nActive = input_features->size[0];
-  auto nPlanes = input_features->size[1];
+template <typename T>
+void cuda_BatchwiseMultiplicativeDropout_updateGradInput(
+    /*cuda float*/ at::Tensor input_features,
+    /*cuda float*/ at::Tensor d_input_features,
+    /*cuda float*/ at::Tensor d_output_features,
+    /*cuda float*/ at::Tensor noise, float alpha) {
+  d_input_features.resize_as_(d_output_features);
+  auto nActive = input_features.size(0);
+  auto nPlanes = input_features.size(1);
   SPARSECONVNET_FOO(32, 32)
   SPARSECONVNET_FOO(24, 32)
@@ -71,5 +67,3 @@ extern "C" void scn_R_(BatchwiseMultiplicativeDropout_updateGradInput)(
   SPARSECONVNET_FOO(1, 64)
 }
 #undef SPARSECONVNET_FOO
-#endif
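The SPARSECONVNET_FOO call sites try tile shapes in declaration order, and the macro's embedded return exits the function as soon as one NTX divides nPlanes. A worked example:

// With nPlanes == 48: 48 % 32 != 0, so (32, 32) is skipped; 48 % 24 == 0, so
// the (24, 32) kernel launches and the return fires before (16, 64) or any
// later shape is considered. The final (1, 64) shape always matches.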
@@ -4,50 +4,50 @@
 // This source code is licensed under the license found in the
 // LICENSE file in the root directory of this source tree.
-#ifndef GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
-#define GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H
-template <typename T, uInt NTX, uInt NTY>
+#ifndef CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
+#define CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H
+template <typename T, Int NTX, Int NTY>
 __global__ void BatchwiseMultiplicativeDropout_fp(T *input_features,
                                                   T *output_features, T *noise,
-                                                  uInt nActive, uInt nPlanes,
-                                                  uInt input_stride,
-                                                  uInt output_stride, T alpha) {
+                                                  Int nActive, Int nPlanes,
+                                                  Int input_stride,
+                                                  Int output_stride, T alpha) {
   __shared__ T nz[NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     if (threadIdx.y == 0)
       nz[threadIdx.x] = noise[plane];
     __syncthreads();
-    for (uInt row = threadIdx.y + blockIdx.y * NTY; row < nActive;
+    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
          row += gridDim.y * NTY) {
-      uInt i = row * input_stride + plane;
-      uInt o = row * output_stride + plane;
+      Int i = row * input_stride + plane;
+      Int o = row * output_stride + plane;
       output_features[o] = input_features[i] * nz[threadIdx.x] *
                            ((input_features[i] > 0) ? 1 : alpha);
     }
     __syncthreads();
   }
 }
-template <typename T, uInt NTX, uInt NTY>
+template <typename T, Int NTX, Int NTY>
 __global__ void
 BatchwiseMultiplicativeDropout_bp(T *input_features, T *d_input_features,
-                                  T *d_output_features, T *noise, uInt nActive,
-                                  uInt nPlanes, uInt input_stride,
-                                  uInt output_stride, T alpha) {
+                                  T *d_output_features, T *noise, Int nActive,
+                                  Int nPlanes, Int input_stride,
+                                  Int output_stride, T alpha) {
   __shared__ T nz[NTX];
-  for (uInt plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
+  for (Int plane = threadIdx.x + blockIdx.x * NTX; plane < nPlanes;
        plane += gridDim.x * NTX) {
     if (threadIdx.y == 0)
       nz[threadIdx.x] = noise[plane];
     __syncthreads();
-    for (uInt row = threadIdx.y + blockIdx.y * NTY; row < nActive;
+    for (Int row = threadIdx.y + blockIdx.y * NTY; row < nActive;
         row += gridDim.y * NTY) {
-      uInt i = row * input_stride + plane;
-      uInt o = row * output_stride + plane;
+      Int i = row * input_stride + plane;
+      Int o = row * output_stride + plane;
       d_input_features[i] = d_output_features[o] * nz[threadIdx.x] *
                             ((input_features[i] > 0) ? 1 : alpha);
     }
     __syncthreads();
   }
 }
-#endif /* GPU_BATCHWISEMULTIPLICATIVEDROPOUT_H */
+#endif /* CUDA_BATCHWISEMULTIPLICATIVEDROPOUT_H */
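Both kernels compute the same pointwise function: a per-plane multiplicative noise factor fused with a leaky ReLU gate on the input. As a plain CPU reference (an editorial sketch, not part of the commit; assumes contiguous nActive x nPlanes features):

template <typename T>
void batchwiseMultiplicativeDropout_reference(const T *in, T *out,
                                              const T *noise, int nActive,
                                              int nPlanes, T alpha) {
  for (int row = 0; row < nActive; ++row)
    for (int p = 0; p < nPlanes; ++p) {
      T x = in[row * nPlanes + p];
      // noise is per plane; negative inputs are scaled by alpha (leaky ReLU)
      out[row * nPlanes + p] = x * noise[p] * (x > 0 ? T(1) : alpha);
    }
}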
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "Convolution.h"
#include "RuleBookIterator.h"

template <typename T, Int Dimension>
double cuda_Convolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto _rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (not bias.numel())
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = output_features.size(1);
    auto w = weight.data<T>();
    if (bias.numel()) {
      auto b = bias.data<T>();
      for (Int i = 0; i < op; i += 32) {
        Int blockDim = min((Int)32, op - i);
        Int gridDim = min((Int)4096, nActive);
        Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                   nActive);
      }
    }
    Int c = ip * op;
    RULEBOOKITERATOR(
        dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
        , w += c; flops += nHotB * c;)
  }
  return flops;
}
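RULEBOOKITERATOR is defined in RuleBookIterator.h, which is not shown in this excerpt. From its uses here, it executes its first argument once per filter-offset group of the rulebook, with rbB (the device pointer to that group's rules) and nHotB (the number of input/output pairs in the group) in scope, then executes its second argument after each group. A plausible shape, stated purely as an assumption:

// Hypothetical sketch of the macro (the real definition also stages the host
// rulebook into device memory; deviceCopyOf is a made-up placeholder):
// #define RULEBOOKITERATOR(X, Y)      \
//   for (auto &rule : _rules) {       \
//     Int nHotB = rule.size() / 2;    \
//     Int *rbB = deviceCopyOf(rule);  \
//     if (nHotB) { X }                \
//     Y                               \
//   }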
template <typename T, Int Dimension>
void cuda_Convolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(inputSize, outputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = d_output_features.size(1);
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                     , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}

template <typename T, Int Dimension>
double cuda_SubmanifoldConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (bias.numel() and nActive)
    output_features.copy_(bias);
  else
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = output_features.size(1);
    auto w = weight.data<T>();
    // if (bias.numel()) {
    //   auto b = bias.data<T>();
    //   for (Int i = 0; i < op; i += 32) {
    //     Int blockDim = min((Int)32, op - i);
    //     Int gridDim = min((Int)4096, nActive);
    //     Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
    //                                                nActive);
    //   }
    // }
    Int c = ip * op;
    RULEBOOKITERATOR(
        dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
        , w += c; flops += nHotB * c;)
  }
  return flops;
}

template <typename T, Int Dimension>
void cuda_SubmanifoldConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor filterSize,
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = m.getSubmanifoldRuleBook(inputSize, filterSize, true);
  Int nActive = m.getNActive(inputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = d_output_features.size(1);
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                     , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}

template <typename T, Int Dimension>
double cuda_FullConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
    Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (not bias.numel())
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = output_features.size(1);
    auto w = weight.data<T>();
    if (bias.numel()) {
      auto b = bias.data<T>();
      for (Int i = 0; i < op; i += 32) {
        Int blockDim = min((Int)32, op - i);
        Int gridDim = min((Int)4096, nActive);
        Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                   nActive);
      }
    }
    Int c = ip * op;
    RULEBOOKITERATOR(
        dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
        , w += c; flops += nHotB * c;)
  }
  return flops;
}

template <typename T, Int Dimension>
void cuda_FullConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &mIn,
    Metadata<Dimension> &mOut,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = mIn.getFullConvolutionRuleBook(inputSize, outputSize,
                                               filterSize, filterStride, mOut);
  Int nActive = mOut.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = d_output_features.size(1);
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                     , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}

template <typename T, Int Dimension>
double cuda_RandomizedStrideConvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                              filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (not bias.numel())
    output_features.zero_();
  double flops = 0;
  if (nActive) {
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = output_features.size(1);
    auto w = weight.data<T>();
    if (bias.numel()) {
      auto b = bias.data<T>();
      for (Int i = 0; i < op; i += 32) {
        Int blockDim = min((Int)32, op - i);
        Int gridDim = min((Int)4096, nActive);
        Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                   nActive);
      }
    }
    Int c = ip * op;
    RULEBOOKITERATOR(
        dConvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
        , w += c; flops += nHotB * c;)
  }
  return flops;
}

template <typename T, Int Dimension>
void cuda_RandomizedStrideConvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules = m.getRandomizedStrideRuleBook(inputSize, outputSize, filterSize,
                                              filterStride, true);
  Int nActive = m.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  if (nActive) {
    auto iF = input_features.data<T>();
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int ip = input_features.size(1);
    Int op = d_output_features.size(1);
    auto w = weight.data<T>();
    auto dw = d_weight.data<T>();
    Int c = ip * op;
    RULEBOOKITERATOR(dConvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                     , w += c; dw += c;)
    if (d_bias.numel()) {
      auto db = d_bias.data<T>();
      Convolution_bp_bias(doF, db, op, op, nActive);
    }
  }
}
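The flops value returned by each updateOutput tallies one multiply-accumulate per (rule pair, input plane, output plane) triple: every RULEBOOKITERATOR group adds nHotB * c with c = ip * op. A quick worked example:

// With ip = 64 input planes, op = 128 output planes, and a filter-offset
// group matching nHotB = 10000 site pairs, that group contributes
// 10000 * 64 * 128 = 81,920,000 multiply-accumulates to the returned total.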
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "Convolution.h"
#include "Deconvolution.h"

template <typename T, Int Dimension>
double cuda_Deconvolution_updateOutput(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features, /*cuda float*/ at::Tensor weight,
    /*cuda float*/ at::Tensor bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  output_features.resize_({nActive, weight.size(2)});
  if (not bias.numel())
    output_features.zero_();
  auto iF = input_features.data<T>();
  auto oF = output_features.data<T>();
  Int ip = input_features.size(1);
  Int op = output_features.size(1);
  auto w = weight.data<T>();
  double flops = 0;
  if (bias.numel()) {
    auto b = bias.data<T>();
    for (Int i = 0; i < op; i += 32) {
      Int blockDim = min((Int)32, op - i);
      Int gridDim = min((Int)4096, nActive);
      Convolution_fp_bias<<<gridDim, blockDim>>>(oF + i, b + i, op, op,
                                                 nActive);
    }
  }
  Int c = ip * op;
  RULEBOOKITERATOR(
      dDeconvolution_forward2<T>(iF, oF, w, rbB, nHotB, ip, ip, op, op);
      , w += c; flops += nHotB * c;)
  return flops;
}

template <typename T, Int Dimension>
void cuda_Deconvolution_backward(
    /*long*/ at::Tensor inputSize, /*long*/ at::Tensor outputSize,
    /*long*/ at::Tensor filterSize,
    /*long*/ at::Tensor filterStride, Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features,
    /*cuda float*/ at::Tensor weight, /*cuda float*/ at::Tensor d_weight,
    /*cuda float*/ at::Tensor d_bias) {
  auto _rules =
      m.getRuleBook(outputSize, inputSize, filterSize, filterStride, true);
  Int nActive = m.getNActive(outputSize);
  d_input_features.resize_as_(input_features);
  d_input_features.zero_();
  auto iF = input_features.data<T>();
  auto diF = d_input_features.data<T>();
  auto doF = d_output_features.data<T>();
  Int ip = input_features.size(1);
  Int op = d_output_features.size(1);
  auto w = weight.data<T>();
  auto dw = d_weight.data<T>();
  Int c = ip * op;
  RULEBOOKITERATOR(dDeconvolution_backward_dW2<T>(iF, diF, doF, w, dw, rbB,
                                                  nHotB, ip, ip, op, op);
                   , w += c; dw += c;)
  if (d_bias.numel()) {
    auto db = d_bias.data<T>();
    Convolution_bp_bias(doF, db, op, op, nActive);
  }
}
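Note how deconvolution gets its rules: cuda_Deconvolution_updateOutput calls m.getRuleBook(outputSize, inputSize, ...), i.e. the convolution rulebook with the two spatial sizes swapped, and then scatters through dDeconvolution_forward2. In other words:

// A deconvolution from inputSize up to outputSize walks the same rulebook as
// a convolution from outputSize down to inputSize, with the two ends of each
// rule exchanged; the weight pointer advances by c = ip * op per filter
// offset in both directions.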
// Copyright 2016-present, Facebook, Inc.
// All rights reserved.
//
// This source code is licensed under the license found in the
// LICENSE file in the root directory of this source tree.

#include "IOLayers.h"

template <typename T, Int Dimension>
void cuda_InputLayer_updateOutput(Metadata<Dimension> &m,
                                  /*long*/ at::Tensor spatialSize,
                                  /*long*/ at::Tensor input_coords,
                                  /*cuda float*/ at::Tensor input_features,
                                  /*cuda float*/ at::Tensor output_features,
                                  long batchSize, long mode) {
  m.inputLayer(spatialSize, input_coords, batchSize, mode);
  Int nPlanes = input_features.size(1);
  auto &rules = m.inputLayerRuleBook;
  Int maxActive = rules[0][1];
  Int nRows = rules[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
  } else {
    output_features.resize_({*m.inputNActive, nPlanes});
    output_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<
        T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
        iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}

template <typename T, Int Dimension>
void cuda_InputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][3];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
  } else {
    d_input_features.resize_({rules[0][2], nPlanes});
    d_input_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<
        T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
        diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}
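The inputLayerRuleBook layout is consumed here but defined elsewhere; from the indices read above, rules[0] is a small header and rules[1] is the flat rule data copied to the device. Inferred layout (an assumption based solely on the uses in these functions):

// rules[0][0] : mode (0 = plain copy path; mode == 4 sets the flag passed as
//               the kernels' last argument)
// rules[0][1] : maxActive, the largest number of inputs pooled into one row
// rules[0][2] : number of input rows (used to size d_input_features)
// rules[0][3] : nRows, the number of output rows and the kernel grid extent
// rules[1]    : flattened rules, cudaMemcpy'd into a CUDA int buffer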
template <typename T, Int Dimension>
void cuda_OutputLayer_updateOutput(Metadata<Dimension> &m,
                                   /*cuda float*/ at::Tensor input_features,
                                   /*cuda float*/ at::Tensor output_features) {
  auto &rules = m.inputLayerRuleBook;
  Int nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
  } else {
    output_features.resize_({rules[0][2], nPlanes});
    output_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<
        T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
        oF, iF, nRows, maxActive, nPlanes, rb, false);
  }
}

template <typename T, Int Dimension>
void cuda_OutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.inputLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  auto mode = rules[0][0];
  auto maxActive = rules[0][1];
  auto nRows = rules[0][3];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
  } else {
    d_input_features.resize_({nRows, nPlanes});
    d_input_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<
        T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
        doF, diF, nRows, maxActive, nPlanes, rb, false);
  }
}

template <typename T, Int Dimension>
void cuda_BLInputLayer_updateOutput(Metadata<Dimension> &m,
                                    /*long*/ at::Tensor spatialSize,
                                    /*long*/ at::Tensor input_coords,
                                    /*cuda float*/ at::Tensor input_features,
                                    /*cuda float*/ at::Tensor output_features,
                                    long mode) {
  m.blLayer(spatialSize, input_coords, mode);
  Int nPlanes = input_features.size(2);
  output_features.resize_({*m.inputNActive, nPlanes});
  output_features.zero_();
  auto &rules = m.blLayerRuleBook;
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({*m.inputNActive, nPlanes});
  } else {
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<
        T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
        iF, oF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}

template <typename T, Int Dimension>
void cuda_BLInputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = d_output_features.size(1);
  Int mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
  } else {
    d_input_features.resize_({rules[0][2], rules[0][3], nPlanes});
    d_input_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<
        T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
        diF, doF, nRows, maxActive, nPlanes, rb, mode == 4);
  }
}

template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateOutput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor input_features,
    /*cuda float*/ at::Tensor output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = input_features.size(1);
  auto mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    output_features.resize_as_(input_features);
    output_features.copy_(input_features);
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
  } else {
    output_features.resize_({rules[0][2], rules[0][3], nPlanes});
    output_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto iF = input_features.data<T>();
    auto oF = output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_bp<
        T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
        oF, iF, nRows, maxActive, nPlanes, rb, false);
  }
}

template <typename T, Int Dimension>
void cuda_BLOutputLayer_updateGradInput(
    Metadata<Dimension> &m,
    /*cuda float*/ at::Tensor d_input_features,
    /*cuda float*/ at::Tensor d_output_features) {
  auto &rules = m.blLayerRuleBook;
  Int nPlanes = d_output_features.size(2);
  Int mode = rules[0][0];
  Int maxActive = rules[0][1];
  Int nRows = rules[0][4];
  if (mode == 0) {
    d_input_features.resize_as_(d_output_features);
    d_input_features.copy_(d_output_features);
    d_input_features.resize_({nRows, nPlanes});
  } else {
    d_input_features.resize_({nRows, nPlanes});
    d_input_features.zero_();
    auto rulesBuffer = at::CUDA(at_kINT).tensor({(int)rules[1].size()});
    auto diF = d_input_features.data<T>();
    auto doF = d_output_features.data<T>();
    Int *rb = rulesBuffer.data<Int>();
    cudaMemcpy(rb, &rules[1][0], sizeof(Int) * rules[1].size(),
               cudaMemcpyHostToDevice);
    InputLayer_fp<
        T><<<std::min(nRows, (Int)32768), std::min(nPlanes, (Int)32)>>>(
        doF, diF, nRows, maxActive, nPlanes, rb, false);
  }
}
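The output layers have no kernels of their own: OutputLayer's forward pass runs InputLayer_bp with the feature arguments swapped, and its backward pass runs InputLayer_fp, because the output layer is exactly the inverse gather/scatter of the input layer (and likewise for the BL variants). Schematically:

// InputLayer   forward : InputLayer_fp(iF,  oF,  ...)  gather
// InputLayer   backward: InputLayer_bp(diF, doF, ...)  scatter
// OutputLayer  forward : InputLayer_bp(oF,  iF,  ...)  scatter, roles swapped
// OutputLayer  backward: InputLayer_fp(doF, diF, ...)  gather, roles swapped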